From 4c75e21c205b3cc1d38a0add110aecc1d43fa7ae Mon Sep 17 00:00:00 2001
From: Mike Lin <mlin@dnanexus.com>
Date: Tue, 31 Dec 2013 17:19:38 -0800
Subject: [PATCH 1/9] Eliminate stdout message when launching a posix thread.

This seems out of place as it's the only time RocksDB prints to stdout in the
normal course of operations. Thread IDs can still be retrieved from the LOG
file: cut -d ' ' -f2 LOG | sort | uniq | egrep -x '[0-9a-f]+'
---
 util/env_posix.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/util/env_posix.cc b/util/env_posix.cc
index 2be524e95..15ba161de 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -1393,9 +1393,6 @@ class PosixEnv : public Env {
                          nullptr,
                          &ThreadPool::BGThreadWrapper,
                          this));
-        fprintf(stdout,
-                "Created bg thread 0x%lx\n",
-                (unsigned long)t);
 
         // Set the thread name to aid debugging
 #if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)

From abd70ecc2b7fe02709955ef14905d1628581b306 Mon Sep 17 00:00:00 2001
From: Dhruba Borthakur <dhruba@fb.com>
Date: Thu, 30 Jan 2014 14:58:37 -0800
Subject: [PATCH 2/9] The default settings enable checksum verification on
 every read.

Summary: The default settings enable checksum verification on every read.

Test Plan: make check

Reviewers: haobo

Reviewed By: haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15591
---
 db/version_set.cc         | 1 -
 include/rocksdb/options.h | 8 +++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 18081d748..89bdc9d49 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2088,7 +2088,6 @@ Compaction* VersionSet::CompactRange(int input_level, int output_level,
 
 Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   ReadOptions options;
-  options.verify_checksums = options_->paranoid_checks;
   options.fill_cache = false;
 
   // Level-0 files have to be merged together.  For other levels,
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index da6412aa2..6424e434f 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -672,7 +672,7 @@ enum ReadTier {
 struct ReadOptions {
   // If true, all data read from underlying storage will be
   // verified against corresponding checksums.
-  // Default: false
+  // Default: true
   bool verify_checksums;
 
   // Should the "data block"/"index block"/"filter block" read for this
@@ -713,11 +713,13 @@ struct ReadOptions {
 
   // Specify to create a tailing iterator -- a special iterator that has a
   // view of the complete database (i.e. it can also be used to read newly
-  // added data) and is optimized for sequential reads.
+  // added data) and is optimized for sequential reads. It will return records
+  // that were inserted into the database after the creation of the iterator.
+  // Default: false
   bool tailing;
 
   ReadOptions()
-      : verify_checksums(false),
+      : verify_checksums(true),
         fill_cache(true),
         prefix_seek(false),
         snapshot(nullptr),

From 56bea9f80d8feb5328e928609859f5bd88888980 Mon Sep 17 00:00:00 2001
From: Siying Dong <siying.d@fb.com>
Date: Mon, 27 Jan 2014 11:53:59 -0800
Subject: [PATCH 3/9] When using Universal Compaction, Zero out seqID in the
 last file too

Summary: I didn't figure out the reason why the feature of zeroing out earlier sequence ID is disabled in universal compaction. I do see bottommost_level is set correctly. It should simply work if we remove the constraint of universal compaction.

Test Plan: make all check

Reviewers: haobo, dhruba, kailiu, igor

Reviewed By: dhruba

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15423
---
 db/db_impl.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/db/db_impl.cc b/db/db_impl.cc
index e3b95ad5f..8b08940a1 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -2507,8 +2507,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
         // If this is the bottommost level (no files in lower levels)
         // and the earliest snapshot is larger than this seqno
         // then we can squash the seqno to zero.
-        if (options_.compaction_style == kCompactionStyleLevel &&
-            bottommost_level && ikey.sequence < earliest_snapshot &&
+        if (bottommost_level && ikey.sequence < earliest_snapshot &&
             ikey.type != kTypeMerge) {
           assert(ikey.type != kTypeDeletion);
           // make a copy because updating in place would cause problems

From dbbffbd77278f6c52b74a2251d3584d500b6d018 Mon Sep 17 00:00:00 2001
From: Igor Canadi <icanadi@fb.com>
Date: Fri, 31 Jan 2014 14:43:16 -0800
Subject: [PATCH 4/9] Mark the log_number file number used

Summary:
VersionSet::next_file_number_ is always assumed to be strictly greater than VersionSet::log_number_. In our new recovery code, we artificially set log_number_  to be (log_number + 1), so that once we flush, we don't recover from the same log file again (this is important because of merge operator non-idempotence)

When we set VersionSet::log_number_ to (log_number + 1), we also have to mark that file number used, such that next_file_number_ is increased to a legal level. Otherwise, VersionSet might assert.

This has not be a problem so far because here's what happens:
1. assume next_file_number is 5, we're recovering log_number 10
2. in DBImpl::Recover() we call MarkFileNumberUsed with 10. This will set VersionSet::next_file_number_ to 11.
3. If there are some updates, we will call WriteTable0ForRecovery(), which will use file number 11 as a new table file and advance VersionSet::next_file_number_ to 12.
4. When we LogAndApply() with log_number 11, assertion is true: assert(11 <= 12);

However, this was a lucky occurrence. Even though this diff doesn't cause a bug, I think the issue is important to fix.

Test Plan: In column families I have different recovery logic and this code path asserted. When adding MarkFileNumberUsed(log_number + 1) assert is gone.

Reviewers: dhruba, kailiu

Reviewed By: kailiu

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15783
---
 db/db_impl.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/db/db_impl.cc b/db/db_impl.cc
index 8b08940a1..1c4f73b8e 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -1066,6 +1066,11 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
     // Since we already recovered log_number, we want all logs
     // with numbers `<= log_number` (includes this one) to be ignored
     edit.SetLogNumber(log_number + 1);
+    // we must mark the next log number as used, even though it's
+    // not actually used. that is because VersionSet assumes
+    // VersionSet::next_file_number_ always to be strictly greater than any log
+    // number
+    versions_->MarkFileNumberUsed(log_number + 1);
     status = versions_->LogAndApply(&edit, &mutex_);
   }
 

From 30a700657d3423f7a9d89e8a6c89433deba5a5ea Mon Sep 17 00:00:00 2001
From: Dhruba Borthakur <dhruba@fb.com>
Date: Fri, 31 Jan 2014 17:16:30 -0800
Subject: [PATCH 5/9] Fix corruption_test failure caused by auto-enablement of
 checksum verification.

Summary:
Patch  https://reviews.facebook.net/D15591 enabled checksum
verification by default. This caused the unit test to fail.

Test Plan: ./corruption_test

Reviewers: igor, kailiu

Reviewed By: igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15795
---
 HISTORY.md            | 4 ++++
 db/corruption_test.cc | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 912599bc9..1c8509337 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,9 @@
 # Rocksdb Change Log
 
+## Unreleased
+* By default, checksums are verified on every read from database
+
+
 ## 2.7.0 (01/28/2014)
 
 ### Public API changes
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index e7b7b4c8b..0bf385b08 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -95,7 +95,12 @@ class CorruptionTest {
     int bad_values = 0;
     int correct = 0;
     std::string value_space;
-    Iterator* iter = db_->NewIterator(ReadOptions());
+    // Do not verify checksums. If we verify checksums then the
+    // db itself will raise errors because data is corrupted.
+    // Instead, we want the reads to be successful and this test
+    // will detect whether the appropriate corruptions have
+    // occured.
+    Iterator* iter = db_->NewIterator(ReadOptions(false, true));
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       uint64_t key;
       Slice in(iter->key());

From 5c6ef56152f8dc0fe9631332c5e8aa8b20bfbd5e Mon Sep 17 00:00:00 2001
From: Igor Canadi <icanadi@fb.com>
Date: Mon, 3 Feb 2014 10:21:58 -0800
Subject: [PATCH 6/9] Fix printf format

---
 db/compaction_picker.cc |  2 +-
 db/db_bench.cc          |  4 ++--
 util/options.cc         | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc
index 9582b6a29..28cda9dac 100644
--- a/db/compaction_picker.cc
+++ b/db/compaction_picker.cc
@@ -551,7 +551,7 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version) {
     return nullptr;
   }
   Version::FileSummaryStorage tmp;
-  Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
+  Log(options_->info_log, "Universal: candidate files(%zu): %s\n",
       version->files_[level].size(),
       version->LevelFileSummary(&tmp, 0));
 
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 18258fbb6..9e6ef70ca 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -1526,14 +1526,14 @@ class Benchmark {
       ++count;
       char* tab = std::find(linep, linep + bufferLen, columnSeparator);
       if (tab == linep + bufferLen) {
-        fprintf(stderr, "[Error] No Key delimiter TAB at line %ld\n", count);
+        fprintf(stderr, "[Error] No Key delimiter TAB at line %zu\n", count);
         continue;
       }
       Slice key(linep, tab - linep);
       tab++;
       char* endLine = std::find(tab, linep + bufferLen, lineSeparator);
       if (endLine  == linep + bufferLen) {
-        fprintf(stderr, "[Error] No ENTER at end of line # %ld\n", count);
+        fprintf(stderr, "[Error] No ENTER at end of line # %zu\n", count);
         continue;
       }
       Slice value(tab, endLine - tab);
diff --git a/util/options.cc b/util/options.cc
index 671f3d681..97fffdb18 100644
--- a/util/options.cc
+++ b/util/options.cc
@@ -164,11 +164,11 @@ Options::Dump(Logger* log) const
     Log(log,"            Options.num_levels: %d", num_levels);
     Log(log,"       Options.disableDataSync: %d", disableDataSync);
     Log(log,"             Options.use_fsync: %d", use_fsync);
-    Log(log,"     Options.max_log_file_size: %ld", max_log_file_size);
+    Log(log,"     Options.max_log_file_size: %zu", max_log_file_size);
     Log(log,"Options.max_manifest_file_size: %lu",
         (unsigned long)max_manifest_file_size);
-    Log(log,"     Options.log_file_time_to_roll: %ld", log_file_time_to_roll);
-    Log(log,"     Options.keep_log_file_num: %ld", keep_log_file_num);
+    Log(log,"     Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
+    Log(log,"     Options.keep_log_file_num: %zu", keep_log_file_num);
     Log(log," Options.db_stats_log_interval: %d",
         db_stats_log_interval);
     Log(log,"       Options.allow_os_buffer: %d", allow_os_buffer);
@@ -224,7 +224,7 @@ Options::Dump(Logger* log) const
         table_cache_numshardbits);
     Log(log,"    Options.table_cache_remove_scan_count_limit: %d",
         table_cache_remove_scan_count_limit);
-    Log(log,"                       Options.arena_block_size: %ld",
+    Log(log,"                       Options.arena_block_size: %zu",
         arena_block_size);
     Log(log,"    Options.delete_obsolete_files_period_micros: %lu",
         (unsigned long)delete_obsolete_files_period_micros);
@@ -244,7 +244,7 @@ Options::Dump(Logger* log) const
         (unsigned long)WAL_ttl_seconds);
     Log(log,"                      Options.WAL_size_limit_MB: %lu",
         (unsigned long)WAL_size_limit_MB);
-    Log(log,"            Options.manifest_preallocation_size: %ld",
+    Log(log,"            Options.manifest_preallocation_size: %zu",
         manifest_preallocation_size);
     Log(log,"         Options.purge_redundant_kvs_while_flush: %d",
         purge_redundant_kvs_while_flush);

From 5b3b6549d68b61e65c1614ad5f4da115a06a94f0 Mon Sep 17 00:00:00 2001
From: Lei Jin <lei@fb.com>
Date: Mon, 3 Feb 2014 13:13:36 -0800
Subject: [PATCH 7/9] use super_version in NewIterator() and MultiGet()
 function

Summary:
Use super_version insider NewIterator to avoid Ref() each component
separately under mutex
The new added bench shows NewIterator QPS increases from 515K to 719K
No meaningful improvement for multiget I guess due to its relatively small
cost comparing to 90 keys fetch in the test.

Test Plan: unit test and db_bench

Reviewers: igor, sdong

Reviewed By: igor

CC: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D15609
---
 db/db_bench.cc         |  13 ++++
 db/db_impl.cc          | 161 ++++++++++++++++-------------------------
 db/db_impl.h           |   4 +-
 db/db_impl_readonly.cc |  11 ++-
 db/db_test.cc          |  24 ++++++
 5 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/db/db_bench.cc b/db/db_bench.cc
index 9e6ef70ca..8355a3f0c 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -99,6 +99,7 @@ DEFINE_string(benchmarks,
               "Must be used with merge_operator\n"
               "\treadrandommergerandom -- perform N random read-or-merge "
               "operations. Must be used with merge_operator\n"
+              "\tnewiterator   -- repeated iterator creation\n"
               "\tseekrandom    -- N random seeks\n"
               "\tcrc32c        -- repeated crc32c of 4K of data\n"
               "\tacquireload   -- load N*1000 times\n"
@@ -1089,6 +1090,8 @@ class Benchmark {
         method = &Benchmark::ReadRandom;
       } else if (name == Slice("readmissing")) {
         method = &Benchmark::ReadMissing;
+      } else if (name == Slice("newiterator")) {
+        method = &Benchmark::IteratorCreation;
       } else if (name == Slice("seekrandom")) {
         method = &Benchmark::SeekRandom;
       } else if (name == Slice("readhot")) {
@@ -1877,6 +1880,16 @@ class Benchmark {
     thread->stats.AddMessage(msg);
   }
 
+  void IteratorCreation(ThreadState* thread) {
+    Duration duration(FLAGS_duration, reads_);
+    ReadOptions options(FLAGS_verify_checksum, true);
+    while (!duration.Done(1)) {
+      Iterator* iter = db_->NewIterator(options);
+      delete iter;
+      thread->stats.FinishedSingleOp(db_);
+    }
+  }
+
   void SeekRandom(ThreadState* thread) {
     Duration duration(FLAGS_duration, reads_);
     ReadOptions options(FLAGS_verify_checksum, true);
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 1c4f73b8e..4dd457e48 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -2668,34 +2668,29 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
 
 namespace {
 struct IterState {
+  IterState(DBImpl* db, port::Mutex* mu, DBImpl::SuperVersion* super_version)
+    : db(db), mu(mu), super_version(super_version) {}
+
+  DBImpl* db;
   port::Mutex* mu;
-  Version* version = nullptr;
-  MemTable* mem = nullptr;
-  MemTableListVersion* imm = nullptr;
-  DBImpl *db;
+  DBImpl::SuperVersion* super_version;
 };
 
 static void CleanupIteratorState(void* arg1, void* arg2) {
   IterState* state = reinterpret_cast<IterState*>(arg1);
   DBImpl::DeletionState deletion_state(state->db->GetOptions().
                                        max_write_buffer_number);
-  state->mu->Lock();
-  if (state->mem) { // not set for immutable iterator
-    MemTable* m = state->mem->Unref();
-    if (m != nullptr) {
-      deletion_state.memtables_to_free.push_back(m);
-    }
+
+  bool need_cleanup = state->super_version->Unref();
+  if (need_cleanup) {
+    state->mu->Lock();
+    state->super_version->Cleanup();
+    state->db->FindObsoleteFiles(deletion_state, false, true);
+    state->mu->Unlock();
+
+    delete state->super_version;
+    state->db->PurgeObsoleteFiles(deletion_state);
   }
-  if (state->version) {  // not set for memtable-only iterator
-    state->version->Unref();
-  }
-  if (state->imm) {  // not set for memtable-only iterator
-    state->imm->Unref(&deletion_state.memtables_to_free);
-  }
-  // fast path FindObsoleteFiles
-  state->db->FindObsoleteFiles(deletion_state, false, true);
-  state->mu->Unlock();
-  state->db->PurgeObsoleteFiles(deletion_state);
 
   delete state;
 }
@@ -2703,36 +2698,23 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
 
 Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
                                       SequenceNumber* latest_snapshot) {
-  IterState* cleanup = new IterState;
-  MemTable* mutable_mem;
-  MemTableListVersion* immutable_mems;
-  Version* version;
-
-  // Collect together all needed child iterators for mem
   mutex_.Lock();
   *latest_snapshot = versions_->LastSequence();
-  mem_->Ref();
-  mutable_mem = mem_;
-  // Collect together all needed child iterators for imm_
-  immutable_mems = imm_.current();
-  immutable_mems->Ref();
-  versions_->current()->Ref();
-  version = versions_->current();
+  SuperVersion* super_version = super_version_->Ref();
   mutex_.Unlock();
 
   std::vector<Iterator*> iterator_list;
-  iterator_list.push_back(mutable_mem->NewIterator(options));
-  cleanup->mem = mutable_mem;
-  cleanup->imm = immutable_mems;
+  // Collect iterator for mutable mem
+  iterator_list.push_back(super_version->mem->NewIterator(options));
   // Collect all needed child iterators for immutable memtables
-  immutable_mems->AddIterators(options, &iterator_list);
+  super_version->imm->AddIterators(options, &iterator_list);
   // Collect iterators for files in L0 - Ln
-  version->AddIterators(options, storage_options_, &iterator_list);
+  super_version->current->AddIterators(options, storage_options_,
+                                       &iterator_list);
   Iterator* internal_iter = NewMergingIterator(
       &internal_comparator_, &iterator_list[0], iterator_list.size());
-  cleanup->version = version;
-  cleanup->mu = &mutex_;
-  cleanup->db = this;
+
+  IterState* cleanup = new IterState(this, &mutex_, super_version);
   internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
 
   return internal_iter;
@@ -2747,53 +2729,36 @@ std::pair<Iterator*, Iterator*> DBImpl::GetTailingIteratorPair(
     const ReadOptions& options,
     uint64_t* superversion_number) {
 
-  MemTable* mutable_mem;
-  MemTableListVersion* immutable_mems;
-  Version* version;
-
-  // get all child iterators and bump their refcounts under lock
   mutex_.Lock();
-  mutable_mem = mem_;
-  mutable_mem->Ref();
-  immutable_mems = imm_.current();
-  immutable_mems->Ref();
-  version = versions_->current();
-  version->Ref();
+  SuperVersion* super_version = super_version_->Ref();
   if (superversion_number != nullptr) {
     *superversion_number = CurrentVersionNumber();
   }
   mutex_.Unlock();
 
-  Iterator* mutable_iter = mutable_mem->NewIterator(options);
-  IterState* mutable_cleanup = new IterState();
-  mutable_cleanup->mem = mutable_mem;
-  mutable_cleanup->db = this;
-  mutable_cleanup->mu = &mutex_;
-  mutable_iter->RegisterCleanup(CleanupIteratorState, mutable_cleanup, nullptr);
-
+  Iterator* mutable_iter = super_version->mem->NewIterator(options);
   // create a DBIter that only uses memtable content; see NewIterator()
   mutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(),
                                mutable_iter, kMaxSequenceNumber);
 
-  Iterator* immutable_iter;
-  IterState* immutable_cleanup = new IterState();
   std::vector<Iterator*> list;
-  immutable_mems->AddIterators(options, &list);
-  immutable_cleanup->imm = immutable_mems;
-  version->AddIterators(options, storage_options_, &list);
-  immutable_cleanup->version = version;
-  immutable_cleanup->db = this;
-  immutable_cleanup->mu = &mutex_;
-
-  immutable_iter =
+  super_version->imm->AddIterators(options, &list);
+  super_version->current->AddIterators(options, storage_options_, &list);
+  Iterator* immutable_iter =
     NewMergingIterator(&internal_comparator_, &list[0], list.size());
-  immutable_iter->RegisterCleanup(CleanupIteratorState, immutable_cleanup,
-                                  nullptr);
 
   // create a DBIter that only uses memtable content; see NewIterator()
   immutable_iter = NewDBIterator(&dbname_, env_, options_, user_comparator(),
                                  immutable_iter, kMaxSequenceNumber);
 
+  // register cleanups
+  mutable_iter->RegisterCleanup(CleanupIteratorState,
+    new IterState(this, &mutex_, super_version), nullptr);
+
+  // bump the ref one more time since it will be Unref'ed twice
+  immutable_iter->RegisterCleanup(CleanupIteratorState,
+    new IterState(this, &mutex_, super_version->Ref()), nullptr);
+
   return std::make_pair(mutable_iter, immutable_iter);
 }
 
@@ -2924,7 +2889,6 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
 
   StopWatch sw(env_, options_.statistics.get(), DB_MULTIGET, false);
   SequenceNumber snapshot;
-  std::vector<MemTable*> to_delete;
 
   mutex_.Lock();
   if (options.snapshot != nullptr) {
@@ -2933,16 +2897,9 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
     snapshot = versions_->LastSequence();
   }
 
-  MemTable* mem = mem_;
-  MemTableListVersion* imm = imm_.current();
-  Version* current = versions_->current();
-  mem->Ref();
-  imm->Ref();
-  current->Ref();
-
-  // Unlock while reading from files and memtables
-
+  SuperVersion* get_version = super_version_->Ref();
   mutex_.Unlock();
+
   bool have_stat_update = false;
   Version::GetStats stats;
 
@@ -2967,12 +2924,14 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
     std::string* value = &(*values)[i];
 
     LookupKey lkey(keys[i], snapshot);
-    if (mem->Get(lkey, value, &s, merge_context, options_)) {
+    if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) {
       // Done
-    } else if (imm->Get(lkey, value, &s, merge_context, options_)) {
+    } else if (get_version->imm->Get(lkey, value, &s, merge_context,
+                                     options_)) {
       // Done
     } else {
-      current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
+      get_version->current->Get(options, lkey, value, &s, &merge_context,
+                                &stats, options_);
       have_stat_update = true;
     }
 
@@ -2981,20 +2940,28 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
     }
   }
 
-  // Post processing (decrement reference counts and record statistics)
-  mutex_.Lock();
-  if (!options_.disable_seek_compaction &&
-      have_stat_update && current->UpdateStats(stats)) {
-    MaybeScheduleFlushOrCompaction();
+  bool delete_get_version = false;
+  if (!options_.disable_seek_compaction && have_stat_update) {
+    mutex_.Lock();
+    if (get_version->current->UpdateStats(stats)) {
+      MaybeScheduleFlushOrCompaction();
+    }
+    if (get_version->Unref()) {
+      get_version->Cleanup();
+      delete_get_version = true;
+    }
+    mutex_.Unlock();
+  } else {
+    if (get_version->Unref()) {
+      mutex_.Lock();
+      get_version->Cleanup();
+      mutex_.Unlock();
+      delete_get_version = true;
+    }
+  }
+  if (delete_get_version) {
+    delete get_version;
   }
-  MemTable* m = mem->Unref();
-  imm->Unref(&to_delete);
-  current->Unref();
-  mutex_.Unlock();
-
-  // free up all obsolete memtables outside the mutex
-  delete m;
-  for (MemTable* v: to_delete) delete v;
 
   RecordTick(options_.statistics.get(), NUMBER_MULTIGET_CALLS);
   RecordTick(options_.statistics.get(), NUMBER_MULTIGET_KEYS_READ, numKeys);
diff --git a/db/db_impl.h b/db/db_impl.h
index 545b367aa..263439d99 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -249,8 +249,8 @@ class DBImpl : public DB {
     return internal_comparator_.user_comparator();
   }
 
-  MemTable* GetMemTable() {
-    return mem_;
+  SuperVersion* GetSuperVersion() {
+    return super_version_;
   }
 
   Iterator* NewInternalIterator(const ReadOptions&,
diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc
index f1ffe3ca3..faa2ff3c7 100644
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@@ -56,15 +56,15 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
                    const Slice& key,
                    std::string* value) {
   Status s;
-  MemTable* mem = GetMemTable();
-  Version* current = versions_->current();
   SequenceNumber snapshot = versions_->LastSequence();
+  SuperVersion* super_version = GetSuperVersion();
   MergeContext merge_context;
   LookupKey lkey(key, snapshot);
-  if (mem->Get(lkey, value, &s, merge_context, options_)) {
+  if (super_version->mem->Get(lkey, value, &s, merge_context, options_)) {
   } else {
     Version::GetStats stats;
-    current->Get(options, lkey, value, &s, &merge_context, &stats, options_);
+    super_version->current->Get(options, lkey, value, &s, &merge_context,
+                                &stats, options_);
   }
   return s;
 }
@@ -87,6 +87,9 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
   DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
   impl->mutex_.Lock();
   Status s = impl->Recover(true /* read only */, error_if_log_file_exist);
+  if (s.ok()) {
+    delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
+  }
   impl->mutex_.Unlock();
   if (s.ok()) {
     *dbptr = impl;
diff --git a/db/db_test.cc b/db/db_test.cc
index baa6e7489..75b5ae2d9 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -421,6 +421,10 @@ class DBTest {
     return DB::Open(*options, dbname_, db);
   }
 
+  Status ReadOnlyReopen(Options* options) {
+    return DB::OpenForReadOnly(*options, dbname_, &db_);
+  }
+
   Status TryReopen(Options* options = nullptr) {
     delete db_;
     db_ = nullptr;
@@ -727,6 +731,26 @@ TEST(DBTest, ReadWrite) {
   } while (ChangeOptions());
 }
 
+TEST(DBTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
+
+  Options options;
+  ASSERT_OK(ReadOnlyReopen(&options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
+  ASSERT_EQ(count, 2);
+  delete iter;
+}
+
 // Make sure that when options.block_cache is set, after a new table is
 // created its index/filter blocks are added to block cache.
 TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {

From 2966d764cd8df0d402ef7fd05e6288a6460691ea Mon Sep 17 00:00:00 2001
From: Igor Canadi <icanadi@fb.com>
Date: Mon, 3 Feb 2014 13:48:30 -0800
Subject: [PATCH 8/9] Fix some 32-bit compile errors

Summary: RocksDB doesn't compile on 32-bit architecture apparently. This is attempt to fix some of 32-bit errors. They are reported here: https://gist.github.com/paxos/8789697

Test Plan: RocksDB still compiles on 64-bit :)

Reviewers: kailiu

Reviewed By: kailiu

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15825
---
 db/memtable.cc          | 12 ++++--------
 include/rocksdb/env.h   |  2 +-
 table/filter_block.cc   |  2 +-
 tools/db_repl_stress.cc |  4 ++--
 util/ldb_cmd.cc         |  2 +-
 5 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/db/memtable.cc b/db/memtable.cc
index 91f4ed5d6..faa213d16 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -148,7 +148,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   p += 8;
   p = EncodeVarint32(p, val_size);
   memcpy(p, value.data(), val_size);
-  assert((p + val_size) - buf == (unsigned)encoded_len);
+  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   table_->Insert(buf);
 
   // The first sequence number inserted into the memtable
@@ -299,13 +299,9 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
                                      value.size());
             WriteLock wl(GetLock(lkey.user_key()));
             memcpy(p, value.data(), value.size());
-            assert(
-              (p + value.size()) - entry ==
-              (unsigned) (VarintLength(key_length) +
-                          key_length +
-                          VarintLength(value.size()) +
-                          value.size())
-            );
+            assert((unsigned)((p + value.size()) - entry) ==
+                   (unsigned)(VarintLength(key_length) + key_length +
+                              VarintLength(value.size()) + value.size()));
             return true;
           }
         }
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 06e9b94aa..e050aee7c 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -438,7 +438,7 @@ class WritableFile {
   // This asks the OS to initiate flushing the cached data to disk,
   // without waiting for completion.
   // Default implementation does nothing.
-  virtual Status RangeSync(off_t offset, off_t nbytes) {
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
     return Status::OK();
   }
 
diff --git a/table/filter_block.cc b/table/filter_block.cc
index 82b6c6ee1..96ba7cb1d 100644
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@@ -173,7 +173,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
   if (index < num_) {
     uint32_t start = DecodeFixed32(offset_ + index*4);
     uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
-    if (start <= limit && limit <= (offset_ - data_)) {
+    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
       Slice filter = Slice(data_ + start, limit - start);
       return policy_->KeyMayMatch(entry, filter);
     } else if (start == limit) {
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
index 9dfe4b644..27cb6d5ab 100644
--- a/tools/db_repl_stress.cc
+++ b/tools/db_repl_stress.cc
@@ -125,8 +125,8 @@ int main(int argc, const char** argv) {
   replThread.stop.Release_Store(nullptr);
   if (replThread.no_read < dataPump.no_records) {
     // no. read should be => than inserted.
-    fprintf(stderr, "No. of Record's written and read not same\nRead : %ld"
-            " Written : %ld\n", replThread.no_read, dataPump.no_records);
+    fprintf(stderr, "No. of Record's written and read not same\nRead : %zu"
+            " Written : %zu\n", replThread.no_read, dataPump.no_records);
     exit(1);
   }
   fprintf(stderr, "Successful!\n");
diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc
index 80f609cd4..86635a695 100644
--- a/util/ldb_cmd.cc
+++ b/util/ldb_cmd.cc
@@ -1003,7 +1003,7 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
   opt.num_levels = old_levels_;
   opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1);
   // Disable size compaction
-  opt.max_bytes_for_level_base = 1UL << 50;
+  opt.max_bytes_for_level_base = 1ULL << 50;
   opt.max_bytes_for_level_multiplier = 1;
   opt.max_mem_compaction_level = 0;
   return opt;

From d411dc58841c92d1100d9c0beb81a10818d6ae95 Mon Sep 17 00:00:00 2001
From: Albert Strasheim <albert@cloudflare.com>
Date: Sun, 2 Feb 2014 12:52:52 -0800
Subject: [PATCH 9/9] crc32c: choose function in static initialization

Before: 4.430 micros/op 225732 ops/sec; 881.8 MB/s (4K per op)
After: 4.125 micros/op 242425 ops/sec; 947.0 MB/s (4K per op)
---
 util/crc32c.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/util/crc32c.cc b/util/crc32c.cc
index bca955a09..5008dddee 100644
--- a/util/crc32c.cc
+++ b/util/crc32c.cc
@@ -333,17 +333,14 @@ static bool isSSE42() {
 }
 
 typedef void (*Function)(uint64_t*, uint8_t const**);
-static Function func = nullptr;
 
 static inline Function Choose_CRC32() {
   return isSSE42() ? Fast_CRC32 : Slow_CRC32;
 }
 
+static Function func = Choose_CRC32();
+
 static inline void CRC32(uint64_t* l, uint8_t const **p) {
-  if (func != nullptr) {
-    return func(l, p);
-  }
-  func = Choose_CRC32();
   func(l, p);
 }