Merge branch 'master' into columnfamilies

Conflicts:
	db/db_impl.cc
	db/db_impl.h
	db/transaction_log_impl.cc
	db/transaction_log_impl.h
	include/rocksdb/options.h
	util/env.cc
	util/options.cc
Author: Igor Canadi
Date: 2014-03-03 17:54:04 -08:00
commit 9d0577a6be
45 changed files with 1413 additions and 769 deletions

HISTORY.md

@@ -14,6 +14,11 @@
* Added is_manual_compaction to CompactionFilter::Context
* Added "virtual void WaitForJoin() = 0" in class Env
### New Features
* If we find one truncated record at the end of the MANIFEST or WAL files,
  we will ignore it. We assume that writers of these records were interrupted
  and that we can safely ignore it.
## 2.7.0 (01/28/2014)
### Public API changes
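The is_manual_compaction flag noted above is delivered through CompactionFilter::Context, which the compaction filter factory receives. A minimal, hedged sketch of consulting it, assuming the CompactionFilter/CompactionFilterFactory interfaces of this era; the class names are invented for illustration:

```cpp
#include <memory>
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Invented example: keep every entry during manual compactions, otherwise
// drop entries whose value is empty.
class DropEmptyUnlessManual : public rocksdb::CompactionFilter {
 public:
  explicit DropEmptyUnlessManual(bool is_manual) : is_manual_(is_manual) {}
  virtual bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                      const rocksdb::Slice& existing_value,
                      std::string* /*new_value*/,
                      bool* /*value_changed*/) const override {
    if (is_manual_) {
      return false;                     // keep everything on manual runs
    }
    return existing_value.size() == 0;  // true means "drop this entry"
  }
  virtual const char* Name() const override { return "DropEmptyUnlessManual"; }

 private:
  const bool is_manual_;
};

class DropEmptyUnlessManualFactory : public rocksdb::CompactionFilterFactory {
 public:
  virtual std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& context) override {
    return std::unique_ptr<rocksdb::CompactionFilter>(
        new DropEmptyUnlessManual(context.is_manual_compaction));
  }
  virtual const char* Name() const override {
    return "DropEmptyUnlessManualFactory";
  }
};
```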

Makefile

@@ -12,6 +12,10 @@ OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
else
OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer
endif
ifeq ($(MAKECMDGOALS),shared_lib)
PLATFORM_SHARED_LDFLAGS=-fPIC
endif
#-----------------------------------------------
# detect what platform we're building on
@@ -136,8 +140,8 @@ $(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2)
endif
-$(SHARED3): $(LIBOBJECTS)
+$(SHARED3):
-$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@
+$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES) -o $@
endif # PLATFORM_SHARED_EXT

db/column_family.cc

@@ -151,6 +151,18 @@ void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
refs.store(1, std::memory_order_relaxed);
}
namespace {
void SuperVersionUnrefHandle(void* ptr) {
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->db_mutex->Lock();
sv->Cleanup();
sv->db_mutex->Unlock();
delete sv;
}
}
} // anonymous namespace
ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
const std::string& name,
Version* dummy_versions, Cache* table_cache,
@@ -173,6 +185,7 @@ ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id,
imm_(options.min_write_buffer_number_to_merge),
super_version_(nullptr),
super_version_number_(0),
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
next_(nullptr),
prev_(nullptr),
log_number_(0),
@@ -209,6 +222,20 @@ ColumnFamilyData::~ColumnFamilyData() {
prev->next_ = next;
next->prev_ = prev;
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
// It also needs to be done after FlushMemTable, which can trigger local_sv_
// access.
auto sv = static_cast<SuperVersion*>(local_sv_->Get());
if (sv != nullptr) {
auto mutex = sv->db_mutex;
mutex->Unlock();
delete local_sv_;
mutex->Lock();
} else {
delete local_sv_;
}
if (super_version_ != nullptr) {
bool is_last_reference __attribute__((unused));
is_last_reference = super_version_->Unref();
@@ -276,11 +303,13 @@ Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level,
}
SuperVersion* ColumnFamilyData::InstallSuperVersion(
-SuperVersion* new_superversion) {
+SuperVersion* new_superversion, port::Mutex* db_mutex) {
new_superversion->Init(mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion;
++super_version_number_;
super_version_->version_number = super_version_number_;
super_version_->db_mutex = db_mutex;
if (old_superversion != nullptr && old_superversion->Unref()) {
old_superversion->Cleanup();
return old_superversion; // will let caller delete outside of mutex
@@ -288,6 +317,19 @@ SuperVersion* ColumnFamilyData::InstallSuperVersion(
return nullptr;
}
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
autovector<void*> sv_ptrs;
local_sv_->Scrape(&sv_ptrs);
for (auto ptr : sv_ptrs) {
assert(ptr);
auto sv = static_cast<SuperVersion*>(ptr);
if (sv->Unref()) {
sv->Cleanup();
delete sv;
}
}
}
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const DBOptions* db_options,
const EnvOptions& storage_options,
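Taken together, SuperVersionUnrefHandle, the local_sv_ slot, and ResetThreadLocalSuperVersions() implement a per-thread cache of the current SuperVersion. The self-contained model below restates that access pattern with plain C++11 primitives; it is an illustration of the idea, not RocksDB's ThreadLocalPtr, and all names are invented:

```cpp
#include <atomic>
#include <cstdint>
#include <mutex>

// Illustrative stand-ins: CachedState ~ SuperVersion, db_mutex ~ DBImpl::mutex_,
// tls_cache ~ this thread's local_sv_ slot.
struct CachedState {
  std::atomic<int> refs{1};
  uint64_t version_number = 0;
  void Ref() { refs.fetch_add(1); }
  bool Unref() { return refs.fetch_sub(1) == 1; }  // true: caller deletes
};

std::mutex db_mutex;
std::atomic<uint64_t> current_version{0};
CachedState* current = new CachedState();        // protected by db_mutex
thread_local CachedState* tls_cache = nullptr;

CachedState* AcquireForRead() {
  // Equivalent of GetAndResetThreadLocalSuperVersion(): Swap(nullptr) gives
  // this thread exclusive use of its cached copy.
  CachedState* sv = tls_cache;
  tls_cache = nullptr;
  if (sv == nullptr || sv->version_number != current_version.load()) {
    // Stale or empty: drop the old reference and re-read under the mutex.
    if (sv != nullptr && sv->Unref()) delete sv;
    std::lock_guard<std::mutex> l(db_mutex);
    current->Ref();
    sv = current;
  }
  return sv;
}

void ReleaseAfterRead(CachedState* sv) {
  // Equivalent of SetThreadLocalSuperVersion(): Reset(sv) parks the copy for
  // the next read. RocksDB's ThreadLocalPtr additionally runs
  // SuperVersionUnrefHandle when the thread exits; this model omits that.
  tls_cache = sv;
}

void InstallNewState() {
  // Equivalent of InstallSuperVersion(): publish a new state, drop the old
  // reference. RocksDB also calls ResetThreadLocalSuperVersions(), which
  // Scrape()s every thread's slot; a plain thread_local cannot do that, so
  // readers here rely on the version_number check alone.
  std::lock_guard<std::mutex> l(db_mutex);
  CachedState* old = current;
  current = new CachedState();
  current->version_number = current_version.fetch_add(1) + 1;
  if (old->Unref()) delete old;
}
```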

db/column_family.h

@@ -19,6 +19,7 @@
#include "db/memtable_list.h"
#include "db/write_batch_internal.h"
#include "db/table_cache.h"
#include "util/thread_local.h"
namespace rocksdb {
@@ -72,6 +73,9 @@ struct SuperVersion {
// all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
port::Mutex* db_mutex;
// should be called outside the mutex
SuperVersion() = default;
@@ -159,6 +163,12 @@ class ColumnFamilyData {
}
SuperVersion* GetSuperVersion() const { return super_version_; }
SuperVersion* GetAndResetThreadLocalSuperVersion() const {
return static_cast<SuperVersion*>(local_sv_->Swap(nullptr));
}
void SetThreadLocalSuperVersion(SuperVersion* super_version) {
local_sv_->Reset(static_cast<void*>(super_version));
}
uint64_t GetSuperVersionNumber() const {
return super_version_number_.load();
}
@@ -166,7 +176,10 @@ class ColumnFamilyData {
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion to enable
// the clients to allocate SuperVersion outside of mutex.
-SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
+SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                  port::Mutex* db_mutex);
void ResetThreadLocalSuperVersions();
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
@@ -212,6 +225,10 @@ class ColumnFamilyData {
// changes.
std::atomic<uint64_t> super_version_number_;
// Thread's local copy of SuperVersion pointer
// This needs to be destructed before mutex_
ThreadLocalPtr* local_sv_;
// pointers for a circular linked list. we use it to support iterations
// that can be concurrent with writes
ColumnFamilyData* next_;

db/db_impl.cc

@@ -43,6 +43,7 @@
#include "db/write_batch_internal.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "port/likely.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@@ -238,8 +239,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
delayed_writes_(0),
storage_options_(options),
bg_work_gate_closed_(false),
-refitting_level_(false) {
+refitting_level_(false),
opened_successfully_(false) {
env_->GetAbsolutePath(dbname, &db_absolute_path_);
// Reserve ten files or so for other uses and give the rest to TableCache.
@@ -298,6 +299,26 @@ DBImpl::~DBImpl() {
bg_logstats_scheduled_) {
bg_cv_.Wait();
}
if (options_.allow_thread_local) {
// Clean up obsolete files due to SuperVersion release.
// (1) Need to delete to obsolete files before closing because RepairDB()
// scans all existing files in the file system and builds manifest file.
// Keeping obsolete files confuses the repair process.
// (2) Need to check if we Open()/Recover() the DB successfully before
// deleting because if VersionSet recover fails (may be due to corrupted
// manifest file), it is not able to identify live files correctly. As a
// result, all "live" files can get deleted by accident. However, corrupted
// manifest is recoverable by RepairDB().
if (opened_successfully_) {
DeletionState deletion_state;
FindObsoleteFiles(deletion_state, true);
// manifest number starting from 2
deletion_state.manifest_file_number = 1;
PurgeObsoleteFiles(deletion_state);
}
}
mutex_.Unlock();
if (default_cf_handle_ != nullptr) {
// we need to delete handle outside of lock because it does its own locking
@@ -358,7 +379,8 @@ Status DBImpl::NewDB() {
const std::string manifest = DescriptorFileName(dbname_, 1);
unique_ptr<WritableFile> file;
-Status s = env_->NewWritableFile(manifest, &file, storage_options_);
+Status s = env_->NewWritableFile(manifest, &file,
+                                 storage_options_.AdaptForLogWrite());
if (!s.ok()) {
return s;
}
@@ -1229,6 +1251,10 @@ Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd,
if (s.ok()) {
InstallSuperVersion(cfd, deletion_state);
// Reset SuperVersions cached in thread local storage
if (options_.allow_thread_local) {
cfd->ResetThreadLocalSuperVersions();
}
if (madeProgress) {
*madeProgress = 1;
}
@@ -1361,7 +1387,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
edit.DebugString().data());
status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get());
-superversion_to_free = cfd->InstallSuperVersion(new_superversion);
+superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_);
new_superversion = nullptr;
Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data());
@@ -1406,8 +1432,9 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const {
return versions_->LastSequence();
}
-Status DBImpl::GetUpdatesSince(SequenceNumber seq,
-                               unique_ptr<TransactionLogIterator>* iter) {
+Status DBImpl::GetUpdatesSince(
+    SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options) {
RecordTick(options_.statistics.get(), GET_UPDATES_SINCE_CALLS);
if (seq > versions_->LastSequence()) {
@@ -1427,13 +1454,9 @@ Status DBImpl::GetUpdatesSince(SequenceNumber seq,
if (!s.ok()) {
return s;
}
-iter->reset(
-    new TransactionLogIteratorImpl(options_.wal_dir,
-                                   &options_,
-                                   storage_options_,
-                                   seq,
-                                   std::move(wal_files),
-                                   this));
+iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_,
+                                           read_options, storage_options_,
+                                           seq, std::move(wal_files), this));
return (*iter)->status();
}
@@ -2004,6 +2027,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_,
db_directory_.get());
InstallSuperVersion(c->column_family_data(), deletion_state);
if (options_.allow_thread_local) {
c->column_family_data()->ResetThreadLocalSuperVersions();
}
Version::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
@@ -2815,7 +2841,7 @@ Status DBImpl::Get(const ReadOptions& options,
// DeletionState gets created and destructed outside of the lock -- we
// use this convinently to:
// * malloc one SuperVersion() outside of the lock -- new_superversion
-// * delete one SuperVersion() outside of the lock -- superversion_to_free
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
//
// However, if InstallSuperVersion() gets called twice with the same,
// deletion_state, we can't reuse the SuperVersion() that got malloced because
@@ -2829,14 +2855,10 @@ void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd,
SuperVersion* new_superversion =
(deletion_state.new_superversion != nullptr) ?
deletion_state.new_superversion : new SuperVersion();
-SuperVersion* old_superversion = cfd->InstallSuperVersion(new_superversion);
+SuperVersion* old_superversion =
+    cfd->InstallSuperVersion(new_superversion, &mutex_);
deletion_state.new_superversion = nullptr;
-if (deletion_state.superversion_to_free != nullptr) {
-// somebody already put it there
-delete old_superversion;
-} else {
-deletion_state.superversion_to_free = old_superversion;
-}
+deletion_state.superversions_to_free.push_back(old_superversion);
}
Status DBImpl::GetImpl(const ReadOptions& options,
@@ -2849,10 +2871,6 @@ Status DBImpl::GetImpl(const ReadOptions& options,
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
-mutex_.Lock();
-SuperVersion* get_version = cfd->GetSuperVersion()->Ref();
-mutex_.Unlock();
SequenceNumber snapshot;
if (options.snapshot != nullptr) {
snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
@@ -2860,6 +2878,41 @@ Status DBImpl::GetImpl(const ReadOptions& options,
snapshot = versions_->LastSequence();
}
// Acquire SuperVersion
SuperVersion* sv = nullptr;
if (LIKELY(options_.allow_thread_local)) {
// The SuperVersion is cached in thread local storage to avoid acquiring
// mutex when SuperVersion does not change since the last use. When a new
// SuperVersion is installed, the compaction or flush thread cleans up
// cached SuperVersion in all existing thread local storage. To avoid
// acquiring mutex for this operation, we use atomic Swap() on the thread
// local pointer to guarantee exclusive access. If the thread local pointer
// is being used while a new SuperVersion is installed, the cached
// SuperVersion can become stale. It will eventually get refreshed either
// on the next GetImpl() call or next SuperVersion installation.
sv = cfd->GetAndResetThreadLocalSuperVersion();
if (!sv || sv->version_number != cfd->GetSuperVersionNumber()) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_UPDATES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
mutex_.Lock();
sv->Cleanup();
sv_to_delete = sv;
} else {
mutex_.Lock();
}
sv = cfd->GetSuperVersion()->Ref();
mutex_.Unlock();
delete sv_to_delete;
}
} else {
mutex_.Lock();
sv = cfd->GetSuperVersion()->Ref();
mutex_.Unlock();
}
bool have_stat_update = false;
Version::GetStats stats;
@@ -2872,12 +2925,11 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot);
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
-if (get_version->mem->Get(lkey, value, &s, merge_context,
-                          *cfd->full_options())) {
+if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->full_options())) {
// Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
-} else if (get_version->imm->Get(lkey, value, &s, merge_context,
+} else if (sv->imm->Get(lkey, value, &s, merge_context,
*cfd->full_options())) {
// Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT);
} else {
@@ -2885,8 +2937,8 @@ Status DBImpl::GetImpl(const ReadOptions& options,
StopWatchNano from_files_timer(env_, false);
StartPerfTimer(&from_files_timer);
-get_version->current->Get(options, lkey, value, &s, &merge_context, &stats,
+sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
*cfd->full_options(), value_found);
have_stat_update = true;
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
RecordTick(options_.statistics.get(), MEMTABLE_MISS);
@@ -2895,31 +2947,32 @@ Status DBImpl::GetImpl(const ReadOptions& options,
StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer);
-bool delete_get_version = false;
if (!cfd->options()->disable_seek_compaction && have_stat_update) {
mutex_.Lock();
-if (get_version->current->UpdateStats(stats)) {
+if (sv->current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction();
}
-if (get_version->Unref()) {
-get_version->Cleanup();
-delete_get_version = true;
-}
mutex_.Unlock();
-} else {
-if (get_version->Unref()) {
-mutex_.Lock();
-get_version->Cleanup();
-mutex_.Unlock();
-delete_get_version = true;
-}
}
-if (delete_get_version) {
-delete get_version;
+// Release SuperVersion
if (LIKELY(options_.allow_thread_local)) {
// Put the SuperVersion back
cfd->SetThreadLocalSuperVersion(sv);
} else {
bool delete_sv = false;
if (sv->Unref()) {
mutex_.Lock();
sv->Cleanup();
mutex_.Unlock();
delete_sv = true;
}
if (delete_sv) {
delete sv;
}
}
// Note, tickers are atomic now - no lock protection needed any more.
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
RecordTick(options_.statistics.get(), BYTES_READ, value->size());
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
@@ -3074,6 +3127,7 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
auto cfd =
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
assert(cfd != nullptr);
delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_);
*handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
Log(options_.info_log, "Created column family \"%s\" (ID %u)",
column_family_name.c_str(), (unsigned)cfd->GetID());
@@ -3575,11 +3629,9 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
SuperVersion* new_superversion = nullptr;
mutex_.Unlock();
{
-EnvOptions soptions(storage_options_);
-soptions.use_mmap_writes = false;
DelayLoggingAndReset();
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
-                          &lfile, soptions);
+                          &lfile, storage_options_.AdaptForLogWrite());
if (s.ok()) {
// Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution.
@@ -3621,7 +3673,7 @@ Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
cfd->GetID(), (unsigned long)logfile_number_);
force = false; // Do not force another compaction if have room
MaybeScheduleFlushOrCompaction();
-delete cfd->InstallSuperVersion(new_superversion);
+delete cfd->InstallSuperVersion(new_superversion, &mutex_);
}
}
return s;
@@ -3888,7 +3940,6 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
*dbptr = nullptr;
handles->clear();
-EnvOptions soptions(db_options);
size_t max_write_buffer_size = 0;
for (auto cf : column_families) {
@@ -3918,12 +3969,10 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
if (s.ok()) {
uint64_t new_log_number = impl->versions_->NewFileNumber();
unique_ptr<WritableFile> lfile;
-soptions.use_mmap_writes = false;
+EnvOptions soptions(db_options);
s = impl->options_.env->NewWritableFile(
-    LogFileName(impl->options_.wal_dir, new_log_number),
-    &lfile,
-    soptions
-    );
+    LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
+    soptions.AdaptForLogWrite());
if (s.ok()) {
lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
VersionEdit edit;
@@ -3953,7 +4002,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
}
if (s.ok()) {
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-delete cfd->InstallSuperVersion(new SuperVersion());
+delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
impl->alive_log_files_.push_back(impl->logfile_number_);
}
impl->DeleteObsoleteFiles();
@@ -3985,6 +4034,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
impl->mutex_.Unlock();
if (s.ok()) {
impl->opened_successfully_ = true;
*dbptr = impl;
} else {
for (auto h : *handles) {

db/db_impl.h

@@ -28,6 +28,7 @@
#include "rocksdb/transaction_log.h"
#include "util/autovector.h"
#include "util/stats_logger.h"
#include "util/thread_local.h"
#include "db/internal_stats.h"
namespace rocksdb {
@@ -121,8 +122,10 @@ class DBImpl : public DB {
bool flush_memtable = true);
virtual Status GetSortedWalFiles(VectorLogPtr& files);
virtual SequenceNumber GetLatestSequenceNumber() const;
-virtual Status GetUpdatesSince(SequenceNumber seq_number,
-                               unique_ptr<TransactionLogIterator>* iter);
+virtual Status GetUpdatesSince(
+    SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions&
+        read_options = TransactionLogIterator::ReadOptions());
virtual Status DeleteFile(std::string name);
virtual void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
@@ -204,7 +207,7 @@ class DBImpl : public DB {
// a list of memtables to be free
autovector<MemTable*> memtables_to_free;
-SuperVersion* superversion_to_free; // if nullptr nothing to free
+autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion
@@ -216,7 +219,6 @@ class DBImpl : public DB {
manifest_file_number = 0;
log_number = 0;
prev_log_number = 0;
-superversion_to_free = nullptr;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
@@ -225,8 +227,10 @@ class DBImpl : public DB {
for (auto m : memtables_to_free) {
delete m;
}
-// free superversion. if nullptr, this will be noop
-delete superversion_to_free;
+// free superversions
+for (auto s : superversions_to_free) {
+  delete s;
+}
// if new_superversion was not used, it will be non-nullptr and needs
// to be freed here
delete new_superversion;
@@ -476,6 +480,9 @@ class DBImpl : public DB {
// Guard against multiple concurrent refitting
bool refitting_level_;
// Indicate DB was opened successfully
bool opened_successfully_;
// No copying allowed
DBImpl(const DBImpl&);
void operator=(const DBImpl&);

db/db_impl_readonly.cc

@@ -103,7 +103,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
error_if_log_file_exist);
if (s.ok()) {
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-delete cfd->InstallSuperVersion(new SuperVersion());
+delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_);
}
}
impl->mutex_.Unlock();

db/db_test.cc

@@ -5288,8 +5288,10 @@ class ModelDB: public DB {
virtual SequenceNumber GetLatestSequenceNumber() const {
return 0;
}
-virtual Status GetUpdatesSince(rocksdb::SequenceNumber,
-                               unique_ptr<rocksdb::TransactionLogIterator>*) {
+virtual Status GetUpdatesSince(
+    rocksdb::SequenceNumber, unique_ptr<rocksdb::TransactionLogIterator>*,
+    const TransactionLogIterator::ReadOptions&
+        read_options = TransactionLogIterator::ReadOptions()) {
return Status::NotSupported("Not supported in Model DB");
}

db/log_reader.cc

@@ -140,7 +140,9 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
case kEof:
if (in_fragmented_record) {
-ReportCorruption(scratch->size(), "partial record without end(3)");
+// This can be caused by the writer dying immediately after
+// writing a physical record but before completing the next; don't
+// treat it as a corruption, just ignore the entire logical record.
scratch->clear();
}
return false;
@@ -264,13 +266,12 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
eof_offset_ = buffer_.size();
}
continue;
-} else if (buffer_.size() == 0) {
-// End of file
-return kEof;
} else {
-size_t drop_size = buffer_.size();
+// Note that if buffer_ is non-empty, we have a truncated header at the
+// end of the file, which can be caused by the writer crashing in the
+// middle of writing the header. Instead of considering this an error,
+// just report EOF.
buffer_.clear();
-ReportCorruption(drop_size, "truncated record at end of file");
return kEof;
}
}
@@ -284,14 +285,22 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
buffer_.clear();
-ReportCorruption(drop_size, "bad record length");
-return kBadRecord;
+if (!eof_) {
+  ReportCorruption(drop_size, "bad record length");
+  return kBadRecord;
+}
+// If the end of the file has been reached without reading |length| bytes
+// of payload, assume the writer died in the middle of writing the record.
+// Don't report a corruption.
+return kEof;
}
if (type == kZeroType && length == 0) {
// Skip zero length record without reporting any drops since
// such records are produced by the mmap based writing code in
// env_posix.cc that preallocates file regions.
// NOTE: this should never happen in DB written by new RocksDB versions,
// since we turn off mmap writes to manifest and log files
buffer_.clear();
return kBadRecord;
}

db/log_test.cc

@@ -446,20 +446,32 @@ TEST(LogTest, BadRecordType) {
ASSERT_EQ("OK", MatchError("unknown record type"));
}
-TEST(LogTest, TruncatedTrailingRecord) {
+TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
-ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
-ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+// Truncated last record is ignored, not treated as an error
+ASSERT_EQ(0, DroppedBytes());
+ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, BadLength) {
const int kPayloadSize = kBlockSize - kHeaderSize;
Write(BigString("bar", kPayloadSize));
Write("foo");
// Least significant size byte is stored in header[4].
IncrementByte(4, 1);
ASSERT_EQ("foo", Read());
ASSERT_EQ(kBlockSize, DroppedBytes());
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST(LogTest, BadLengthAtEndIsIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
-ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
-ASSERT_EQ("OK", MatchError("bad record length"));
+ASSERT_EQ(0, DroppedBytes());
+ASSERT_EQ("", ReportMessage());
}
TEST(LogTest, ChecksumMismatch) {
@@ -510,6 +522,24 @@ TEST(LogTest, UnexpectedFirstType) {
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST(LogTest, MissingLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}
TEST(LogTest, PartialLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
ASSERT_EQ("", ReportMessage());
ASSERT_EQ(0, DroppedBytes());
}
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)

db/repair.cc

@@ -251,7 +251,6 @@ class Repairer {
}
void ExtractMetaData() {
-std::vector<TableInfo> kept;
for (size_t i = 0; i < table_numbers_.size(); i++) {
TableInfo t;
t.meta.number = table_numbers_[i];
@@ -317,7 +316,8 @@ class Repairer {
Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file;
-Status status = env_->NewWritableFile(tmp, &file, storage_options_);
+Status status =
+    env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite());
if (!status.ok()) {
return status;
}

db/transaction_log_impl.cc

@@ -10,10 +10,12 @@ namespace rocksdb {
TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const DBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl)
: dir_(dir),
options_(options),
read_options_(read_options),
soptions_(soptions),
startingSequenceNumber_(seq),
files_(std::move(files)),
@@ -250,9 +252,8 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
return status;
}
assert(file);
-currentLogReader_.reset(
-    new log::Reader(std::move(file), &reporter_, true, 0)
-);
+currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
+                                        read_options_.verify_checksums_, 0));
return Status::OK();
}
} // namespace rocksdb

db/transaction_log_impl.h

@@ -66,11 +66,11 @@ class LogFileImpl : public LogFile {
class TransactionLogIteratorImpl : public TransactionLogIterator {
public:
-TransactionLogIteratorImpl(const std::string& dir, const DBOptions* options,
-                           const EnvOptions& soptions,
-                           const SequenceNumber seqNum,
-                           std::unique_ptr<VectorLogPtr> files,
-                           DBImpl const* const dbimpl);
+TransactionLogIteratorImpl(
+    const std::string& dir, const DBOptions* options,
+    const TransactionLogIterator::ReadOptions& read_options,
+    const EnvOptions& soptions, const SequenceNumber seqNum,
+    std::unique_ptr<VectorLogPtr> files, DBImpl const* const dbimpl);
virtual bool Valid();
@@ -83,6 +83,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
private:
const std::string& dir_;
const DBOptions* options_;
const TransactionLogIterator::ReadOptions read_options_;
const EnvOptions& soptions_;
SequenceNumber startingSequenceNumber_;
std::unique_ptr<VectorLogPtr> files_;

db/version_set.cc

@@ -1585,9 +1585,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
// only one thread can be here at the same time
if (!new_manifest_filename.empty()) {
unique_ptr<WritableFile> descriptor_file;
-s = env_->NewWritableFile(new_manifest_filename,
-                          &descriptor_file,
-                          storage_options_);
+s = env_->NewWritableFile(new_manifest_filename, &descriptor_file,
+                          storage_options_.AdaptForLogWrite());
if (s.ok()) {
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
s = WriteSnapshot(descriptor_log_.get());
@@ -2615,7 +2614,6 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
AppendVersion(new_cfd, new Version(new_cfd, this, current_version_number_++));
new_cfd->CreateNewMemtable();
new_cfd->SetLogNumber(edit->log_number_);
-delete new_cfd->InstallSuperVersion(new SuperVersion());
return new_cfd;
}

include/rocksdb/db.h

@@ -420,8 +420,10 @@
// use this api, else the WAL files will get
// cleared aggressively and the iterator might keep getting invalid before
// an update is read.
-virtual Status GetUpdatesSince(SequenceNumber seq_number,
-                               unique_ptr<TransactionLogIterator>* iter) = 0;
+virtual Status GetUpdatesSince(
+    SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions&
+        read_options = TransactionLogIterator::ReadOptions()) = 0;
// Delete the file name from the db directory and update the internal state to
// reflect that. Supports deletion of sst and log files only. 'name' must be

include/rocksdb/env.h

@@ -49,6 +49,8 @@ struct EnvOptions {
// construct from Options
explicit EnvOptions(const DBOptions& options);
EnvOptions AdaptForLogWrite() const;
// If true, then allow caching of data in environment buffers
bool use_os_buffer = true;
@@ -511,25 +513,56 @@ class Directory {
virtual Status Fsync() = 0;
};
enum InfoLogLevel {
DEBUG = 0,
INFO,
WARN,
ERROR,
FATAL,
NUM_INFO_LOG_LEVELS,
};
// An interface for writing log messages.
class Logger {
public:
enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
-Logger() { }
+explicit Logger(const InfoLogLevel log_level = InfoLogLevel::ERROR)
+    : log_level_(log_level) {}
virtual ~Logger();
// Write an entry to the log file with the specified format.
virtual void Logv(const char* format, va_list ap) = 0;
// Write an entry to the log file with the specified log level
// and format. Any log with level under the internal log level
// of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
// printed.
void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
"ERROR", "FATAL"};
if (log_level < log_level_) {
return;
}
char new_format[500];
snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
kInfoLogLevelNames[log_level], format);
Logv(new_format, ap);
}
virtual size_t GetLogFileSize() const {
return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
}
// Flush to the OS buffers
virtual void Flush() {}
virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; }
virtual void SetInfoLogLevel(const InfoLogLevel log_level) {
log_level_ = log_level;
}
private:
// No copying allowed
Logger(const Logger&);
void operator=(const Logger&);
InfoLogLevel log_level_;
};
@@ -547,7 +580,18 @@ class FileLock {
extern void LogFlush(const shared_ptr<Logger>& info_log);
extern void Log(const InfoLogLevel log_level,
const shared_ptr<Logger>& info_log, const char* format, ...);
// a set of log functions with different log levels.
extern void Debug(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Info(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Warn(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Error(const shared_ptr<Logger>& info_log, const char* format, ...);
extern void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...);
// Log the specified data to *info_log if info_log is non-nullptr.
// The default info log level is InfoLogLevel::ERROR.
extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
@@ -556,12 +600,23 @@ extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
extern void LogFlush(Logger *info_log);
extern void Log(const InfoLogLevel log_level, Logger* info_log,
const char* format, ...);
// The default info log level is InfoLogLevel::ERROR.
extern void Log(Logger* info_log, const char* format, ...)
# if defined(__GNUC__) || defined(__clang__)
__attribute__((__format__ (__printf__, 2, 3)))
# endif
;
// a set of log functions with different log levels.
extern void Debug(Logger* info_log, const char* format, ...);
extern void Info(Logger* info_log, const char* format, ...);
extern void Warn(Logger* info_log, const char* format, ...);
extern void Error(Logger* info_log, const char* format, ...);
extern void Fatal(Logger* info_log, const char* format, ...);
// A utility routine: write "data" to the named file.
extern Status WriteStringToFile(Env* env, const Slice& data,
const std::string& fname);
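A short usage sketch of the leveled logging declared above; it assumes options.info_log has already been set (DBOptions::info_log is the usual home for the logger) and the messages are made up:

```cpp
#include "rocksdb/env.h"
#include "rocksdb/options.h"

void LoggingSketch(const rocksdb::Options& options) {
  // Messages below the logger's level are suppressed by Logger::Logv().
  options.info_log->SetInfoLogLevel(rocksdb::WARN);

  // Level-tagged helpers declared above; the "[WARN] "/"[ERROR] " prefixes
  // are added by Logv(log_level, ...).
  rocksdb::Warn(options.info_log, "compaction fell behind: %d jobs pending", 7);
  rocksdb::Error(options.info_log, "failed to sync WAL: %s", "I/O error");

  // Per the header comment, plain Log() defaults to InfoLogLevel::ERROR,
  // so it still gets through at the WARN threshold.
  rocksdb::Log(options.info_log, "opened db");
}
```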

include/rocksdb/flush_block_policy.h

@@ -11,6 +11,7 @@ namespace rocksdb {
class Slice;
class BlockBuilder;
struct Options;
// FlushBlockPolicy provides a configurable way to determine when to flush a
// block in the block based tables,
@@ -36,29 +37,22 @@ class FlushBlockPolicyFactory {
// Callers must delete the result after any database that is using the
// result has been closed.
virtual FlushBlockPolicy* NewFlushBlockPolicy(
-    const BlockBuilder& data_block_builder) const = 0;
+    const Options& options, const BlockBuilder& data_block_builder) const = 0;
virtual ~FlushBlockPolicyFactory() { }
};
class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
public:
-FlushBlockBySizePolicyFactory(const uint64_t block_size,
-                              const uint64_t block_size_deviation) :
-    block_size_(block_size),
-    block_size_deviation_(block_size_deviation) {
-}
+FlushBlockBySizePolicyFactory() {}
virtual const char* Name() const override {
return "FlushBlockBySizePolicyFactory";
}
virtual FlushBlockPolicy* NewFlushBlockPolicy(
const Options& options,
const BlockBuilder& data_block_builder) const override;
-private:
-const uint64_t block_size_;
-const uint64_t block_size_deviation_;
};
} // rocksdb
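With block_size and block_size_deviation now taken from Options at policy-creation time, a custom factory only has to accept the extra parameter. A hedged sketch against the interface shown above; the class name is invented and it simply delegates to the built-in size-based factory:

```cpp
#include "rocksdb/flush_block_policy.h"

namespace example {

class DelegatingFlushBlockPolicyFactory
    : public rocksdb::FlushBlockPolicyFactory {
 public:
  virtual const char* Name() const override {
    return "DelegatingFlushBlockPolicyFactory";
  }

  virtual rocksdb::FlushBlockPolicy* NewFlushBlockPolicy(
      const rocksdb::Options& options,
      const rocksdb::BlockBuilder& data_block_builder) const override {
    // block_size / block_size_deviation now travel in `options`, so nothing
    // needs to be stored on the factory itself; just forward the call.
    return default_factory_.NewFlushBlockPolicy(options, data_block_builder);
  }

 private:
  rocksdb::FlushBlockBySizePolicyFactory default_factory_;
};

}  // namespace example
```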

include/rocksdb/options.h

@@ -717,6 +717,10 @@ struct DBOptions {
// Default: 0
uint64_t bytes_per_sync;
// Allow RocksDB to use thread local storage to optimize performance.
// Default: true
bool allow_thread_local;
// Create DBOptions with default values for all fields
DBOptions();
// Create DBOptions from Options
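A minimal sketch of toggling the new option, e.g. to compare against the mutex-protected slow path while debugging; the database path is illustrative:

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::DB* OpenWithoutThreadLocalCache() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Defaults to true; setting it to false forces every Get() to take the
  // DB mutex when acquiring a SuperVersion, as in the else-branch of GetImpl().
  options.allow_thread_local = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/example_db", &db);
  return s.ok() ? db : nullptr;
}
```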

include/rocksdb/statistics.h

@@ -122,6 +122,7 @@ enum Tickers {
// Number of table's properties loaded directly from file, without creating
// table reader object.
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
NUMBER_SUPERVERSION_UPDATES,
TICKER_ENUM_MAX
};
@@ -176,7 +177,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
-"rocksdb.number.direct.load.table.properties"}, };
+"rocksdb.number.direct.load.table.properties"},
+{NUMBER_SUPERVERSION_UPDATES, "rocksdb.number.superversion_updates"},
+};
/**
* Keep adding histogram's here.
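To observe how often the thread-local fast path has to refresh its cached SuperVersion, the new ticker can be read from the statistics object. A hedged sketch; CreateDBStatistics() and getTickerCount() are the long-standing statistics API of this era, not part of this diff:

```cpp
#include <cstdio>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void ReportSuperVersionUpdates(rocksdb::Options* options) {
  if (!options->statistics) {
    options->statistics = rocksdb::CreateDBStatistics();
  }
  // Incremented in GetImpl() whenever the cached SuperVersion is found stale
  // and has to be re-acquired under the DB mutex.
  uint64_t updates = options->statistics->getTickerCount(
      rocksdb::NUMBER_SUPERVERSION_UPDATES);
  std::printf("superversion refreshes: %llu\n",
              static_cast<unsigned long long>(updates));
}
```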

include/rocksdb/table.h

@@ -54,6 +54,21 @@ struct BlockBasedTableOptions {
// If not specified, each "table reader" object will pre-load index/filter
// block during table initialization.
bool cache_index_and_filter_blocks = false;
// The index type that will be used for this table.
enum IndexType : char {
// A space efficient index block that is optimized for
// binary-search-based index.
kBinarySearch,
};
IndexType index_type = kBinarySearch;
};
// Table Properties that are specific to block-based table properties.
struct BlockBasedTablePropertyNames {
// value of this propertis is a fixed int32 number.
static const std::string kIndexType;
};
// Create default block based table factory.
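A short sketch of plugging these options into a table factory; NewBlockBasedTableFactory is the existing factory constructor from this header's era, not shown in the hunk:

```cpp
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeOptionsWithBinarySearchIndex() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  // kBinarySearch is currently the only value of the new IndexType enum.
  table_options.index_type = rocksdb::BlockBasedTableOptions::kBinarySearch;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```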

include/rocksdb/transaction_log.h

@@ -85,6 +85,19 @@ class TransactionLogIterator {
// earliest transaction contained in the batch.
// ONLY use if Valid() is true and status() is OK.
virtual BatchResult GetBatch() = 0;
// The read options for TransactionLogIterator.
struct ReadOptions {
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: true
bool verify_checksums_;
ReadOptions() : verify_checksums_(true) {}
explicit ReadOptions(bool verify_checksums)
: verify_checksums_(verify_checksums) {}
};
};
} // namespace rocksdb
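Putting the new ReadOptions together with the extended GetUpdatesSince() signature from db.h above, a hedged usage sketch; BatchResult's fields follow the long-standing definition in this header, and the sequence-number variable is illustrative:

```cpp
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// Tail the WAL starting at `since`, skipping checksum verification.
rocksdb::Status TailUpdates(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  // New in this change: per-iterator read options; `false` disables
  // verify_checksums_ (it defaults to true).
  rocksdb::TransactionLogIterator::ReadOptions read_options(false);

  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter, read_options);
  if (!s.ok()) {
    return s;
  }
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence is the first sequence number in the write batch;
    // batch.writeBatchPtr owns the batch itself.
    (void)batch;
  }
  return iter->status();
}
```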

include/utilities/stackable_db.h

@@ -188,10 +188,10 @@ class StackableDB : public DB {
return db_->GetPropertiesOfAllTables(column_family, props);
}
-virtual Status GetUpdatesSince(SequenceNumber seq_number,
-                               unique_ptr<TransactionLogIterator>* iter)
-    override {
-  return db_->GetUpdatesSince(seq_number, iter);
+virtual Status GetUpdatesSince(
+    SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter,
+    const TransactionLogIterator::ReadOptions& read_options) override {
+  return db_->GetUpdatesSince(seq_number, iter, read_options);
}
virtual ColumnFamilyHandle* DefaultColumnFamily() const override {

port/likely.h (new file, 21 lines)

@ -0,0 +1,21 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef PORT_LIKELY_H_
#define PORT_LIKELY_H_
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIKELY(x) (__builtin_expect((x), 1))
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
#endif // PORT_LIKELY_H_
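A small usage sketch of the macros this new header provides, mirroring how db_impl.cc wraps the allow_thread_local check in LIKELY():

```cpp
#include "port/likely.h"

// Annotate the expected branch so the compiler favors the hot path.
int ParseDigit(char c) {
  if (LIKELY(c >= '0' && c <= '9')) {
    return c - '0';
  }
  if (UNLIKELY(c == '\0')) {
    return -2;  // cold path: end of input
  }
  return -1;    // cold path: not a digit
}
```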

table/block.h

@@ -26,8 +26,8 @@ class Block {
~Block();
size_t size() const { return size_; }
-bool isCachable() const { return cachable_; }
+bool cachable() const { return cachable_; }
-CompressionType compressionType() const { return compression_type_; }
+CompressionType compression_type() const { return compression_type_; }
Iterator* NewIterator(const Comparator* comparator);
const char* data() { return data_; }

table/block_based_table_builder.cc

@@ -11,23 +11,29 @@
#include <assert.h>
#include <inttypes.h>
-#include <map>
#include <stdio.h>
-#include "rocksdb/flush_block_policy.h"
+#include <map>
#include <memory>
#include "db/dbformat.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
-#include "table/table_builder.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/options.h"
-#include "db/dbformat.h"
+#include "rocksdb/table.h"
-#include "table/block_based_table_reader.h"
#include "table/block.h"
#include "table/block_based_table_reader.h"
#include "table/block_builder.h"
#include "table/filter_block.h"
#include "table/format.h"
#include "table/meta_blocks.h"
#include "table/table_builder.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/stop_watch.h"
@@ -36,11 +42,167 @@ namespace rocksdb {
namespace {
-static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
+typedef BlockBasedTableOptions::IndexType IndexType;
// The interface for building index.
// Instruction for adding a new concrete IndexBuilder:
// 1. Create a subclass instantiated from IndexBuilder.
// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
// 3. Add a create function for the new subclass in CreateIndexBuilder.
// Note: we can devise more advanced design to simplify the process for adding
// new subclass, which will, on the other hand, increase the code complexity and
// catch unwanted attention from readers. Given that we won't add/change
// indexes frequently, it makes sense to just embrace a more straightforward
// design that just works.
class IndexBuilder {
public:
explicit IndexBuilder(const Comparator* comparator)
: comparator_(comparator) {}
virtual ~IndexBuilder() {}
// Add a new index entry to index block.
// To allow further optimization, we provide `last_key_in_current_block` and
// `first_key_in_next_block`, based on which the specific implementation can
// determine the best index key to be used for the index block.
// @last_key_in_current_block: this parameter maybe overridden with the value
// "substitute key".
// @first_key_in_next_block: it will be nullptr if the entry being added is
// the last one in the table
//
// REQUIRES: Finish() has not yet been called.
virtual void AddEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) = 0;
// Inform the index builder that all entries has been written. Block builder
// may therefore perform any operation required for block finalization.
//
// REQUIRES: Finish() has not yet been called.
virtual Slice Finish() = 0;
// Get the estimated size for index block.
virtual size_t EstimatedSize() const = 0;
protected:
const Comparator* comparator_;
};
// This index builder builds space-efficient index block for binary-search-based
// index.
//
// Optimizations:
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
// search when doing index lookup.
// 2. Shorten the key length for index block. Other than honestly using the
// last key in the data block as the index key, we instead find a shortest
// substitute key that serves the same function.
class BinarySearchIndexBuilder : public IndexBuilder {
public:
explicit BinarySearchIndexBuilder(const Comparator* comparator)
: IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
virtual void AddEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
} else {
comparator_->FindShortSuccessor(last_key_in_current_block);
}
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
}
virtual Slice Finish() override { return index_block_builder_.Finish(); }
virtual size_t EstimatedSize() const {
return index_block_builder_.CurrentSizeEstimate();
}
private:
BlockBuilder index_block_builder_;
};
// Create a index builder based on its type.
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
switch (type) {
case BlockBasedTableOptions::kBinarySearch: {
return new BinarySearchIndexBuilder(comparator);
}
default: {
assert(!"Do not recognize the index type ");
return nullptr;
}
}
// impossible.
assert(false);
return nullptr;
}
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
// Check to see if compressed less than 12.5%
return compressed_size < raw_size - (raw_size / 8u);
}
Slice CompressBlock(const Slice& raw,
const CompressionOptions& compression_options,
CompressionType* type, std::string* compressed_output) {
if (*type == kNoCompression) {
return raw;
}
// Will return compressed block contents if (1) the compression method is
// supported in this platform and (2) the compression rate is "good enough".
switch (*type) {
case kSnappyCompression:
if (port::Snappy_Compress(compression_options, raw.data(), raw.size(),
compressed_output) &&
GoodCompressionRatio(compressed_output->size(), raw.size())) {
return *compressed_output;
}
break; // fall back to no compression.
case kZlibCompression:
if (port::Zlib_Compress(compression_options, raw.data(), raw.size(),
compressed_output) &&
GoodCompressionRatio(compressed_output->size(), raw.size())) {
return *compressed_output;
}
break; // fall back to no compression.
case kBZip2Compression:
if (port::BZip2_Compress(compression_options, raw.data(), raw.size(),
compressed_output) &&
GoodCompressionRatio(compressed_output->size(), raw.size())) {
return *compressed_output;
}
break; // fall back to no compression.
case kLZ4Compression:
if (port::LZ4_Compress(compression_options, raw.data(), raw.size(),
compressed_output) &&
GoodCompressionRatio(compressed_output->size(), raw.size())) {
return *compressed_output;
}
break; // fall back to no compression.
case kLZ4HCCompression:
if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(),
compressed_output) &&
GoodCompressionRatio(compressed_output->size(), raw.size())) {
return *compressed_output;
}
break; // fall back to no compression.
default: {} // Do not recognize this compression type
}
// The compression method is not supported, or the compression ratio is not
// good enough, so fall back to the uncompressed form.
*type = kNoCompression;
return raw;
}
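// Illustrative sketch, not part of this change: how CompressBlock() is meant
// to be called. The caller passes its preferred type and must look at the
// (possibly downgraded) type afterwards. The function name is hypothetical.
CompressionType CompressBlockExample(const Slice& raw_contents,
                                     const CompressionOptions& copts,
                                     std::string* scratch) {
  CompressionType type = kSnappyCompression;  // preferred compression
  Slice contents = CompressBlock(raw_contents, copts, &type, scratch);
  // `type` now reflects what actually happened: it stays kSnappyCompression
  // only if Snappy is available and the output saved at least 12.5%;
  // otherwise it is kNoCompression and `contents` aliases the raw input.
  assert(contents.size() <= raw_contents.size());
  return type;
}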
} // anonymous namespace
// kBlockBasedTableMagicNumber was picked by running
@ -51,6 +213,46 @@ static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
extern const uint64_t kBlockBasedTableMagicNumber
= 0xdb4775248b80fb57ull;
// A collector that collects properties of interest to block-based table.
// For now this class looks heavy-weight since we only write one additional
// property.
// But in the foreseeable future, we will add more and more properties that
// are specific to the block-based table.
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
: public TablePropertiesCollector {
public:
BlockBasedTablePropertiesCollector(
BlockBasedTableOptions::IndexType index_type)
: index_type_(index_type) {}
virtual Status Add(const Slice& key, const Slice& value) {
// Intentionally left blank. We have no interest in collecting stats for
// individual key/value pairs.
return Status::OK();
}
virtual Status Finish(UserCollectedProperties* properties) {
std::string val;
PutFixed32(&val, static_cast<uint32_t>(index_type_));
properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
return Status::OK();
}
// The name of the properties collector can be used for debugging purposes.
virtual const char* Name() const {
return "BlockBasedTablePropertiesCollector";
}
virtual UserCollectedProperties GetReadableProperties() const {
// Intentionally left blank.
return UserCollectedProperties();
}
private:
BlockBasedTableOptions::IndexType index_type_;
};
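// Illustrative sketch, not part of this change: how a consumer of the table's
// UserCollectedProperties could read the index-type property written above.
// DecodeFixed32() from util/coding.h and the function name are assumptions.
BlockBasedTableOptions::IndexType ReadIndexTypeProperty(
    const UserCollectedProperties& props,
    BlockBasedTableOptions::IndexType default_type) {
  auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
  if (pos == props.end() || pos->second.size() != sizeof(uint32_t)) {
    // Property missing (e.g. table written before this change) or malformed.
    return default_type;
  }
  return static_cast<BlockBasedTableOptions::IndexType>(
      DecodeFixed32(pos->second.data()));
}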
struct BlockBasedTableBuilder::Rep {
Options options;
const InternalKeyComparator& internal_comparator;
@ -58,7 +260,8 @@ struct BlockBasedTableBuilder::Rep {
uint64_t offset = 0;
Status status;
BlockBuilder data_block;
- BlockBuilder index_block;
+ std::unique_ptr<IndexBuilder> index_builder;
std::string last_key;
CompressionType compression_type;
TableProperties props;
@ -75,28 +278,31 @@ struct BlockBasedTableBuilder::Rep {
Rep(const Options& opt, const InternalKeyComparator& icomparator,
WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
- CompressionType compression_type)
+ CompressionType compression_type, IndexType index_block_type)
: options(opt),
internal_comparator(icomparator),
file(f),
data_block(options, &internal_comparator),
- // To avoid linear scan, we make the block_restart_interval to be `1`
- // in index block builder
- index_block(1 /* block_restart_interval */, &internal_comparator),
+ index_builder(
+ CreateIndexBuilder(index_block_type, &internal_comparator)),
compression_type(compression_type),
filter_block(opt.filter_policy == nullptr
? nullptr
: new FilterBlockBuilder(opt, &internal_comparator)),
- flush_block_policy(
- flush_block_policy_factory->NewFlushBlockPolicy(data_block)) {}
+ flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy(
+ options, data_block)) {
+ options.table_properties_collectors.push_back(
+ std::make_shared<BlockBasedTablePropertiesCollector>(index_block_type));
+ }
};
BlockBasedTableBuilder::BlockBasedTableBuilder(
- const Options& options, const InternalKeyComparator& internal_comparator,
- WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory,
+ const Options& options, const BlockBasedTableOptions& table_options,
+ const InternalKeyComparator& internal_comparator, WritableFile* file,
CompressionType compression_type)
: rep_(new Rep(options, internal_comparator, file,
- flush_block_policy_factory, compression_type)) {
+ table_options.flush_block_policy_factory.get(),
+ compression_type, table_options.index_type)) {
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
}
@ -136,10 +342,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// blocks.
if (ok()) {
- r->internal_comparator.FindShortestSeparator(&r->last_key, key);
- std::string handle_encoding;
- r->pending_handle.EncodeTo(&handle_encoding);
- r->index_block.Add(r->last_key, Slice(handle_encoding));
+ r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
}
}
@ -179,88 +382,25 @@ void BlockBasedTableBuilder::Flush() {
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
BlockHandle* handle) {
+ WriteBlock(block->Finish(), handle);
+ block->Reset();
+ }
+ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
+ BlockHandle* handle) {
// File format contains a sequence of blocks where each block has:
// block_data: uint8[n]
// type: uint8
// crc: uint32
assert(ok());
Rep* r = rep_;
- Slice raw = block->Finish();
- Slice block_contents;
- std::string* compressed = &r->compressed_output;
- CompressionType type = r->compression_type;
- switch (type) {
- case kNoCompression:
- block_contents = raw;
- break;
- case kSnappyCompression: {
- std::string* compressed = &r->compressed_output;
- if (port::Snappy_Compress(r->options.compression_opts, raw.data(),
- raw.size(), compressed) &&
- GoodCompressionRatio(compressed->size(), raw.size())) {
- block_contents = *compressed;
- } else {
- // Snappy not supported, or not good compression ratio, so just
- // store uncompressed form
- block_contents = raw;
- type = kNoCompression;
- }
- break;
- }
- case kZlibCompression:
- if (port::Zlib_Compress(r->options.compression_opts, raw.data(),
- raw.size(), compressed) &&
- GoodCompressionRatio(compressed->size(), raw.size())) {
- block_contents = *compressed;
- } else {
- // Zlib not supported, or not good compression ratio, so just
- // store uncompressed form
- block_contents = raw;
- type = kNoCompression;
- }
- break;
- case kBZip2Compression:
- if (port::BZip2_Compress(r->options.compression_opts, raw.data(),
- raw.size(), compressed) &&
- GoodCompressionRatio(compressed->size(), raw.size())) {
- block_contents = *compressed;
- } else {
- // BZip not supported, or not good compression ratio, so just
- // store uncompressed form
- block_contents = raw;
- type = kNoCompression;
- }
- break;
- case kLZ4Compression:
- if (port::LZ4_Compress(r->options.compression_opts, raw.data(),
- raw.size(), compressed) &&
- GoodCompressionRatio(compressed->size(), raw.size())) {
- block_contents = *compressed;
- } else {
- // LZ4 not supported, or not good compression ratio, so just
- // store uncompressed form
- block_contents = raw;
- type = kNoCompression;
- }
- break;
- case kLZ4HCCompression:
- if (port::LZ4HC_Compress(r->options.compression_opts, raw.data(),
- raw.size(), compressed) &&
- GoodCompressionRatio(compressed->size(), raw.size())) {
- block_contents = *compressed;
- } else {
- // LZ4 not supported, or not good compression ratio, so just
- // store uncompressed form
- block_contents = raw;
- type = kNoCompression;
- }
- break;
- }
+ auto type = r->compression_type;
+ auto block_contents =
+ CompressBlock(raw_block_contents, r->options.compression_opts, &type,
+ &r->compressed_output);
WriteRawBlock(block_contents, type, handle);
r->compressed_output.clear();
- block->Reset();
}
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
@ -364,11 +504,8 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if (ok() && !empty_data_block) {
- r->internal_comparator.FindShortSuccessor(&r->last_key);
- std::string handle_encoding;
- r->pending_handle.EncodeTo(&handle_encoding);
- r->index_block.Add(r->last_key, handle_encoding);
+ r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
+ r->pending_handle);
}
// Write meta blocks and metaindex block with the following order.
@ -394,11 +531,12 @@ Status BlockBasedTableBuilder::Finish() {
r->props.filter_policy_name = r->options.filter_policy != nullptr ?
r->options.filter_policy->Name() : "";
r->props.index_size =
- r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
+ r->index_builder->EstimatedSize() + kBlockTrailerSize;
// Add basic properties
property_block_builder.AddTableProperty(r->props);
+ // Add user-collected properties
NotifyCollectTableCollectorsOnFinish(
r->options.table_properties_collectors,
r->options.info_log.get(),
@ -425,7 +563,7 @@ Status BlockBasedTableBuilder::Finish() {
// Write index block
if (ok()) {
- WriteBlock(&r->index_block, &index_block_handle);
+ WriteBlock(r->index_builder->Finish(), &index_block_handle);
}
// Write footer

View File

@ -9,6 +9,7 @@
#pragma once
#include <stdint.h>
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
@ -19,6 +20,7 @@ namespace rocksdb {
class BlockBuilder;
class BlockHandle;
class WritableFile;
struct BlockBasedTableOptions;
class BlockBasedTableBuilder : public TableBuilder {
public:
@ -26,10 +28,9 @@ class BlockBasedTableBuilder : public TableBuilder {
// building in *file. Does not close the file. It is up to the
// caller to close the file after calling Finish().
BlockBasedTableBuilder(const Options& options,
+ const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator,
- WritableFile* file,
- FlushBlockPolicyFactory* flush_block_policy_factory,
- CompressionType compression_type);
+ WritableFile* file, CompressionType compression_type);
// REQUIRES: Either Finish() or Abandon() has been called.
~BlockBasedTableBuilder();
@ -63,11 +64,17 @@ class BlockBasedTableBuilder : public TableBuilder {
private:
bool ok() const { return status().ok(); }
// Call the block's Finish() method and then write the finalized block
// contents to file.
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
// Directly write block content to the file.
void WriteBlock(const Slice& block_contents, BlockHandle* handle);
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
Status InsertBlockInCache(const Slice& block_contents,
const CompressionType type,
const BlockHandle* handle);
struct Rep;
class BlockBasedTablePropertiesCollector;
Rep* rep_;
// Advanced operation: flush any buffered key/value pairs to file.
@ -82,4 +89,3 @@ class BlockBasedTableBuilder : public TableBuilder {
};
} // namespace rocksdb

View File

@ -11,13 +11,25 @@
#include "table/block_based_table_factory.h" #include "table/block_based_table_factory.h"
#include <memory> #include <memory>
#include <string>
#include <stdint.h> #include <stdint.h>
#include "rocksdb/flush_block_policy.h"
#include "table/block_based_table_builder.h" #include "table/block_based_table_builder.h"
#include "table/block_based_table_reader.h" #include "table/block_based_table_reader.h"
#include "port/port.h" #include "port/port.h"
namespace rocksdb { namespace rocksdb {
BlockBasedTableFactory::BlockBasedTableFactory(
const BlockBasedTableOptions& table_options)
: table_options_(table_options) {
if (table_options_.flush_block_policy_factory == nullptr) {
table_options_.flush_block_policy_factory.reset(
new FlushBlockBySizePolicyFactory());
}
}
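// Illustrative sketch, not part of this change: wiring the factory into
// Options. index_type and flush_block_policy_factory are the
// BlockBasedTableOptions fields touched by this change; leaving the policy
// factory unset is fine because the constructor above falls back to
// FlushBlockBySizePolicyFactory. The function name is hypothetical.
Options MakeOptionsWithBlockBasedTable() {
  BlockBasedTableOptions table_options;
  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return options;
}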
Status BlockBasedTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions,
const InternalKeyComparator& internal_comparator,
@ -31,34 +43,8 @@ Status BlockBasedTableFactory::NewTableReader(
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator,
WritableFile* file, CompressionType compression_type) const {
- auto flush_block_policy_factory =
- table_options_.flush_block_policy_factory.get();
- // if flush block policy factory is not set, we'll create the default one
- // from the options.
- //
- // NOTE: we cannot pre-cache the "default block policy factory" because
- // `FlushBlockBySizePolicyFactory` takes `options.block_size` and
- // `options.block_size_deviation` as parameters, which may be different
- // every time.
- if (flush_block_policy_factory == nullptr) {
- flush_block_policy_factory =
- new FlushBlockBySizePolicyFactory(options.block_size,
- options.block_size_deviation);
- }
- auto table_builder =
- new BlockBasedTableBuilder(options, internal_comparator, file,
- flush_block_policy_factory, compression_type);
- // Delete flush_block_policy_factory only when it's just created from the
- // options.
- // We can safely delete flush_block_policy_factory since it will only be used
- // during the construction of `BlockBasedTableBuilder`.
- if (flush_block_policy_factory !=
- table_options_.flush_block_policy_factory.get()) {
- delete flush_block_policy_factory;
- }
+ auto table_builder = new BlockBasedTableBuilder(
+ options, table_options_, internal_comparator, file, compression_type);
return table_builder;
}
@ -68,4 +54,7 @@ TableFactory* NewBlockBasedTableFactory(
return new BlockBasedTableFactory(table_options);
}
const std::string BlockBasedTablePropertyNames::kIndexType =
"rocksdb.block.based.table.index.type";
} // namespace rocksdb

View File

@ -26,8 +26,7 @@ class BlockBasedTableBuilder;
class BlockBasedTableFactory : public TableFactory {
public:
explicit BlockBasedTableFactory(
- const BlockBasedTableOptions& table_options = BlockBasedTableOptions())
- : table_options_(table_options) {}
+ const BlockBasedTableOptions& table_options = BlockBasedTableOptions());
~BlockBasedTableFactory() {}

File diff suppressed because it is too large

View File

@ -8,12 +8,14 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <stdint.h>
#include <utility>
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
#include "util/coding.h"
@ -21,14 +23,19 @@ namespace rocksdb {
class Block;
class BlockHandle;
class Cache;
class FilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class RandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct Options;
struct ReadOptions;
using std::unique_ptr;
@ -91,7 +98,9 @@ class BlockBasedTable : public TableReader {
~BlockBasedTable();
bool TEST_filter_block_preloaded() const;
- bool TEST_index_block_preloaded() const;
+ bool TEST_index_reader_preloaded() const;
// Implementation of IndexReader will be exposed to internal cc file only.
class IndexReader;
private:
template <class TValue>
@ -101,40 +110,51 @@ class BlockBasedTable : public TableReader {
Rep* rep_;
bool compaction_optimized_;
- static Iterator* BlockReader(void*, const ReadOptions&,
+ static Iterator* DataBlockReader(void*, const ReadOptions&,
const EnvOptions& soptions,
const InternalKeyComparator& icomparator,
const Slice&, bool for_compaction);
- static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
+ static Iterator* DataBlockReader(void*, const ReadOptions&, const Slice&,
bool* didIO, bool for_compaction = false);
- // if `no_io == true`, we will not try to read filter from sst file
- // if it is not cached yet.
+ // For the following two functions:
+ // if `no_io == true`, we will not try to read filter/index from sst file
+ // if they are not present in cache yet.
CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
- Iterator* IndexBlockReader(const ReadOptions& options) const;
- // Read the block, either from sst file or from cache. This method will try
- // to read from cache only when block_cache is set or ReadOption doesn't
- // explicitly prohibit storage IO.
- //
- // If the block is read from cache, the statistics for cache miss/hit of the
- // the given type of block will be updated. User can specify
- // `block_cache_miss_ticker` and `block_cache_hit_ticker` for the statistics
- // update.
- //
- // On success, the `result` parameter will be populated, which contains a
- // pointer to the block and its cache handle, which will be nullptr if it's
- // not read from the cache.
- static Status GetBlock(const BlockBasedTable* table,
-                        const BlockHandle& handle,
-                        const ReadOptions& options,
-                        bool for_compaction,
-                        Tickers block_cache_miss_ticker,
-                        Tickers block_cache_hit_ticker,
-                        bool* didIO,
-                        CachableEntry<Block>* result);
+ // Get the iterator from the index reader.
+ //
+ // Note: ErrorIterator with Status::Incomplete shall be returned if all the
+ // following conditions are met:
+ //  1. We enabled table_options.cache_index_and_filter_blocks.
+ //  2. index is not present in block cache.
+ //  3. We disallowed any io to be performed, that is, read_options ==
+ //     kBlockCacheTier
+ Iterator* NewIndexIterator(const ReadOptions& read_options) const;
+ // Read a block from the block caches (if set): block_cache and
+ // block_cache_compressed.
+ // On success, Status::OK will be returned and @block will be populated with
+ // a pointer to the block as well as its block handle.
+ static Status GetDataBlockFromCache(
+     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+     Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
+     const ReadOptions& read_options,
+     BlockBasedTable::CachableEntry<Block>* block);
+ // Put a raw block (maybe compressed) into the corresponding block caches.
+ // This method will perform decompression against raw_block if needed and then
+ // populate the block caches.
+ // On success, Status::OK will be returned; also @block will be populated with
+ // the uncompressed block and its cache handle.
+ //
+ // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be
+ // responsible for releasing its memory if error occurs.
+ static Status PutDataBlockToCache(
+     const Slice& block_cache_key, const Slice& compressed_block_cache_key,
+     Cache* block_cache, Cache* block_cache_compressed,
+     const ReadOptions& read_options, Statistics* statistics,
+     CachableEntry<Block>* block, Block* raw_block);
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
// after a call to Seek(key), until handle_result returns false.
@ -144,6 +164,7 @@ class BlockBasedTable : public TableReader {
void ReadMeta(const Footer& footer);
void ReadFilter(const Slice& filter_handle_value);
Status CreateIndexReader(IndexReader** index_reader) const;
// Read the meta block from sst.
static Status ReadMetaBlock(
@ -159,10 +180,9 @@ class BlockBasedTable : public TableReader {
static void SetupCacheKeyPrefix(Rep* rep);
- explicit BlockBasedTable(Rep* rep) :
-     compaction_optimized_(false) {
-   rep_ = rep;
- }
+ explicit BlockBasedTable(Rep* rep)
+     : rep_(rep), compaction_optimized_(false) {}
// Generate a cache key prefix from the file
static void GenerateCachePrefix(Cache* cc,
RandomAccessFile* file, char* buffer, size_t* size);

View File

@ -3,6 +3,7 @@
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "rocksdb/options.h"
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/slice.h"
#include "table/block_builder.h"
@ -61,10 +62,9 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
};
FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-     const BlockBuilder& data_block_builder) const {
-   return new FlushBlockBySizePolicy(block_size_,
-                                     block_size_deviation_,
-                                     data_block_builder);
+     const Options& options, const BlockBuilder& data_block_builder) const {
+   return new FlushBlockBySizePolicy(
+       options.block_size, options.block_size_deviation, data_block_builder);
}
} // namespace rocksdb

View File

@ -527,13 +527,14 @@ Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key,
key_ptr =
GetVarint32Ptr(start, file_data_.data() + data_end_offset_, &tmp_size);
if (key_ptr == nullptr) {
-   return Status::Corruption("Unable to read the next key");
+   return Status::Corruption(
+       "Unexpected EOF when reading the next key's size");
}
user_key_size = (size_t)tmp_size;
*bytes_read = key_ptr - start;
}
if (key_ptr + user_key_size + 1 >= file_data_.data() + data_end_offset_) {
-   return Status::Corruption("Unable to read the next key");
+   return Status::Corruption("Unexpected EOF when reading the next key");
}
if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
@ -544,10 +545,12 @@ Status PlainTableReader::ReadKey(const char* start, ParsedInternalKey* key,
*bytes_read += user_key_size + 1;
} else {
if (start + user_key_size + 8 >= file_data_.data() + data_end_offset_) {
-   return Status::Corruption("Unable to read the next key");
+   return Status::Corruption(
+       "Unexpected EOF when reading internal bytes of the next key");
}
if (!ParseInternalKey(Slice(key_ptr, user_key_size + 8), key)) {
-   return Status::Corruption(Slice());
+   return Status::Corruption(
+       Slice("Incorrect value type found when reading the next key"));
}
*bytes_read += user_key_size + 8;
}
@ -569,15 +572,19 @@ Status PlainTableReader::Next(uint32_t* offset, ParsedInternalKey* key,
const char* start = file_data_.data() + *offset;
size_t bytes_for_key;
Status s = ReadKey(start, key, &bytes_for_key);
if (!s.ok()) {
return s;
}
uint32_t value_size;
const char* value_ptr = GetVarint32Ptr(
start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size);
if (value_ptr == nullptr) {
-   return Status::Corruption("Error reading value length.");
+   return Status::Corruption(
+       "Unexpected EOF when reading the next value's size.");
}
*offset = *offset + (value_ptr - start) + value_size;
if (*offset > data_end_offset_) {
-   return Status::Corruption("Reach end of file when reading value");
+   return Status::Corruption("Unexpected EOF when reading the next value.");
}
*value = Slice(value_ptr, value_size);
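// Illustrative sketch, not part of this change: the bounds-checking pattern
// used above when decoding a varint32 length followed by that many bytes.
// GetVarint32Ptr() from util/coding.h returns nullptr if the varint itself
// runs past `limit`. The function name is hypothetical.
Status ReadLengthPrefixedSlice(const char* start, const char* limit,
                               Slice* result, const char** next) {
  uint32_t len = 0;
  const char* p = GetVarint32Ptr(start, limit, &len);
  if (p == nullptr) {
    return Status::Corruption("Unexpected EOF when reading a length prefix");
  }
  if (p + len > limit) {
    return Status::Corruption("Unexpected EOF when reading prefixed data");
  }
  *result = Slice(p, len);
  *next = p + len;
  return Status::OK();
}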

View File

@ -9,6 +9,7 @@
#include <inttypes.h>
#include <stdio.h>
#include <algorithm>
#include <map>
#include <string>
@ -16,8 +17,6 @@
#include <vector>
#include "db/dbformat.h"
- #include "rocksdb/statistics.h"
- #include "util/statistics.h"
#include "db/memtable.h"
#include "db/write_batch_internal.h"
@ -25,11 +24,11 @@
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "table/block.h" #include "table/block.h"
#include "table/meta_blocks.h"
#include "table/block_based_table_reader.h"
#include "table/block_based_table_builder.h" #include "table/block_based_table_builder.h"
#include "table/block_based_table_factory.h" #include "table/block_based_table_factory.h"
#include "table/block_based_table_reader.h" #include "table/block_based_table_reader.h"
@ -39,6 +38,7 @@
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "util/random.h" #include "util/random.h"
#include "util/statistics.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
@ -690,8 +690,7 @@ class Harness {
switch (args.type) {
case BLOCK_BASED_TABLE_TEST:
table_options.flush_block_policy_factory.reset(
-     new FlushBlockBySizePolicyFactory(options_.block_size,
-                                       options_.block_size_deviation));
+     new FlushBlockBySizePolicyFactory());
options_.table_factory.reset(new BlockBasedTableFactory(table_options));
constructor_ = new TableConstructor(options_.comparator);
break;
@ -1203,7 +1202,7 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
// preloading filter/index blocks is enabled.
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
ASSERT_TRUE(reader->TEST_filter_block_preloaded());
- ASSERT_TRUE(reader->TEST_index_block_preloaded());
+ ASSERT_TRUE(reader->TEST_index_reader_preloaded());
{
// nothing happens in the beginning
@ -1244,7 +1243,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
// preloading filter/index blocks is prohibited.
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
- ASSERT_TRUE(!reader->TEST_index_block_preloaded());
+ ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
// -- PART 1: Open with regular block cache.
// Since block_cache is disabled, no cache activities will be involved.

View File

@ -70,7 +70,7 @@ def main(argv):
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
- --reopen=0
+ --reopen=20
--readpercent=45
--prefixpercent=5
--writepercent=35

View File

@ -84,7 +84,7 @@ def main(argv):
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
- --reopen=0
+ --reopen=20
--readpercent=45
--prefixpercent=5
--writepercent=35

View File

@ -17,20 +17,21 @@ namespace rocksdb {
class AutoRollLogger : public Logger {
public:
AutoRollLogger(Env* env, const std::string& dbname,
const std::string& db_log_dir, size_t log_max_size,
size_t log_file_time_to_roll,
const InfoLogLevel log_level = InfoLogLevel::ERROR)
: Logger(log_level),
dbname_(dbname),
db_log_dir_(db_log_dir),
env_(env),
status_(Status::OK()),
kMaxLogFileSize(log_max_size),
kLogFileTimeToRoll(log_file_time_to_roll),
cached_now(static_cast<uint64_t>(env_->NowMicros() * 1e-6)),
ctime_(cached_now),
cached_now_access_count(0),
call_NowMicros_every_N_records_(100),
mutex_() {
env->GetAbsolutePath(dbname, &db_absolute_path_);
log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_);
RollLogFile();

View File

@ -5,12 +5,15 @@
//
#include <string>
#include <cmath>
#include <iostream>
#include <fstream>
#include <iterator>
#include <algorithm>
#include "util/testharness.h"
#include "util/auto_roll_logger.h"
#include "rocksdb/db.h"
#include <sys/stat.h>
#include <errno.h>
using namespace std;
@ -39,10 +42,8 @@ class AutoRollLoggerTest {
const string AutoRollLoggerTest::kSampleMessage(
"this is the message to be written to the log file!!");
const string AutoRollLoggerTest::kTestDir(test::TmpDir() + "/db_log_test");
const string AutoRollLoggerTest::kLogFile(test::TmpDir() + "/db_log_test/LOG");
Env* AutoRollLoggerTest::env = Env::Default();
// In this test we only want to Log some simple log message with
@ -53,6 +54,11 @@ void LogMessage(Logger* logger, const char* message) {
Log(logger, "%s", message);
}
void LogMessage(const InfoLogLevel log_level, Logger* logger,
const char* message) {
Log(log_level, logger, "%s", message);
}
void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
struct stat s;
if (stat(fname.c_str(), &s) != 0) {
@ -64,6 +70,7 @@ void GetFileCreateTime(const std::string& fname, uint64_t* file_ctime) {
void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger,
size_t log_max_size,
const string& log_message) {
logger->SetInfoLogLevel(InfoLogLevel::INFO);
// measure the size of each message, which is supposed
// to be equal or greater than log_message.size()
LogMessage(logger, log_message.c_str());
@ -131,7 +138,6 @@ TEST(AutoRollLoggerTest, RollLogFileBySize) {
RollLogFileBySizeTest(&logger, log_max_size,
kSampleMessage + ":RollLogFileBySize");
}
TEST(AutoRollLoggerTest, RollLogFileByTime) {
@ -235,6 +241,46 @@ TEST(AutoRollLoggerTest, CreateLoggerFromOptions) {
kSampleMessage + ":CreateLoggerFromOptions - both"); kSampleMessage + ":CreateLoggerFromOptions - both");
} }
TEST(AutoRollLoggerTest, InfoLogLevel) {
InitTestDb();
size_t log_size = 8192;
size_t log_lines = 0;
// an extra scope to force the AutoRollLogger to flush the log file when it
// goes out of scope.
{
AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0);
for (int log_level = InfoLogLevel::FATAL; log_level >= InfoLogLevel::DEBUG;
log_level--) {
logger.SetInfoLogLevel((InfoLogLevel)log_level);
for (int log_type = InfoLogLevel::DEBUG; log_type <= InfoLogLevel::FATAL;
log_type++) {
// log messages with log level smaller than log_level will not be
// logged.
LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str());
}
log_lines += InfoLogLevel::FATAL - log_level + 1;
}
for (int log_level = InfoLogLevel::FATAL; log_level >= InfoLogLevel::DEBUG;
log_level--) {
logger.SetInfoLogLevel((InfoLogLevel)log_level);
// again, messages with level smaller than log_level will not be logged.
Debug(&logger, "%s", kSampleMessage.c_str());
Info(&logger, "%s", kSampleMessage.c_str());
Warn(&logger, "%s", kSampleMessage.c_str());
Error(&logger, "%s", kSampleMessage.c_str());
Fatal(&logger, "%s", kSampleMessage.c_str());
log_lines += InfoLogLevel::FATAL - log_level + 1;
}
}
std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
size_t lines = std::count(std::istreambuf_iterator<char>(inFile),
std::istreambuf_iterator<char>(), '\n');
ASSERT_EQ(log_lines, lines);
inFile.close();
}
int OldLogFileCount(const string& dir) {
std::vector<std::string> files;
Env::Default()->GetChildren(dir, &files);

View File

@ -45,12 +45,120 @@ void Log(Logger* info_log, const char* format, ...) {
}
}
void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(log_level, format, ap);
va_end(ap);
}
}
void Debug(Logger* info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::DEBUG, format, ap);
va_end(ap);
}
}
void Info(Logger* info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::INFO, format, ap);
va_end(ap);
}
}
void Warn(Logger* info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::WARN, format, ap);
va_end(ap);
}
}
void Error(Logger* info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::ERROR, format, ap);
va_end(ap);
}
}
void Fatal(Logger* info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::FATAL, format, ap);
va_end(ap);
}
}
void LogFlush(const shared_ptr<Logger>& info_log) {
if (info_log) {
info_log->Flush();
}
}
void Log(const InfoLogLevel log_level, const shared_ptr<Logger>& info_log,
const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(log_level, format, ap);
va_end(ap);
}
}
void Debug(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::DEBUG, format, ap);
va_end(ap);
}
}
void Info(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::INFO, format, ap);
va_end(ap);
}
}
void Warn(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::WARN, format, ap);
va_end(ap);
}
}
void Error(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::ERROR, format, ap);
va_end(ap);
}
}
void Fatal(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
va_start(ap, format);
info_log->Logv(InfoLogLevel::FATAL, format, ap);
va_end(ap);
}
}
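// Illustrative sketch, not part of this change: how the leveled logging
// helpers added above are meant to be used. Messages below the logger's
// configured level are dropped. The function name is hypothetical.
void LogAtAllLevels(const shared_ptr<Logger>& info_log) {
  // The level-less Log() keeps its old behavior; the new overload takes an
  // explicit level.
  Log(info_log, "legacy, level-less message");
  Log(InfoLogLevel::WARN, info_log, "disk usage at %d%%", 85);
  // Convenience wrappers, one per level.
  Debug(info_log, "only visible when the log level is DEBUG");
  Info(info_log, "informational message");
  Warn(info_log, "warning message");
  Error(info_log, "error message");
  Fatal(info_log, "fatal message");
}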
void Log(const shared_ptr<Logger>& info_log, const char* format, ...) {
if (info_log) {
va_list ap;
@ -129,6 +237,12 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
}
EnvOptions EnvOptions::AdaptForLogWrite() const {
EnvOptions adapted = *this;
adapted.use_mmap_writes = false;
return adapted;
}
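// Illustrative sketch, not part of this change: AdaptForLogWrite() is meant
// to be applied to a base EnvOptions before opening a log-style file (e.g. a
// write-ahead log), so mmap writes are disabled for that file only. The
// function name and path are hypothetical; Env::NewWritableFile() is the
// standard Env API.
Status OpenWalFileExample(Env* env, const EnvOptions& base_options,
                          unique_ptr<WritableFile>* file) {
  EnvOptions log_options = base_options.AdaptForLogWrite();
  return env->NewWritableFile("/tmp/example-wal.log", file, log_options);
}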
EnvOptions::EnvOptions(const DBOptions& options) {
AssignEnvOptions(this, options);
}

View File

@ -236,8 +236,9 @@ class HdfsLogger : public Logger {
uint64_t (*gettid_)(); // Return the thread id for the current thread
public:
- HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)())
-     : file_(f), gettid_(gettid) {
+ HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)(),
+            const InfoLogLevel log_level = InfoLogLevel::ERROR)
+     : Logger(log_level), file_(f), gettid_(gettid) {
Log(mylog, "[hdfs] HdfsLogger opened %s\n",
file_->getName().c_str());
}

View File

@ -176,7 +176,8 @@ DBOptions::DBOptions()
advise_random_on_open(true),
access_hint_on_compaction_start(NORMAL),
use_adaptive_mutex(false),
- bytes_per_sync(0) { }
+ bytes_per_sync(0),
+ allow_thread_local(true) {}
DBOptions::DBOptions(const Options& options)
: create_if_missing(options.create_if_missing),
@ -214,7 +215,8 @@ DBOptions::DBOptions(const Options& options)
advise_random_on_open(options.advise_random_on_open),
access_hint_on_compaction_start(options.access_hint_on_compaction_start),
use_adaptive_mutex(options.use_adaptive_mutex),
- bytes_per_sync(options.bytes_per_sync) {}
+ bytes_per_sync(options.bytes_per_sync),
+ allow_thread_local(options.allow_thread_local) {}
static const char* const access_hints[] = {
"NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"

View File

@ -38,9 +38,16 @@ class PosixLogger : public Logger {
Env* env_;
bool flush_pending_;
public:
- PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env) :
- file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)),
- last_flush_micros_(0), env_(env), flush_pending_(false) { }
+ PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env,
+             const InfoLogLevel log_level = InfoLogLevel::ERROR)
+     : Logger(log_level),
+       file_(f),
+       gettid_(gettid),
+       log_size_(0),
+       fd_(fileno(f)),
+       last_flush_micros_(0),
+       env_(env),
+       flush_pending_(false) {}
virtual ~PosixLogger() {
fclose(file_);
}

View File

@ -7,11 +7,11 @@
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "util/histogram.h" #include "util/histogram.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "port/likely.h"
#include <vector> #include <vector>
#include <atomic> #include <atomic>
#define UNLIKELY(val) (__builtin_expect((val), 0))
namespace rocksdb { namespace rocksdb {

View File

@ -9,12 +9,8 @@
#include "util/thread_local.h" #include "util/thread_local.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "port/likely.h"
#if defined(__GNUC__) && __GNUC__ >= 4
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define UNLIKELY(x) (x)
#endif
namespace rocksdb { namespace rocksdb {

View File

@ -16,6 +16,7 @@
#include "util/autovector.h" #include "util/autovector.h"
#include "port/port_posix.h" #include "port/port_posix.h"
#include "util/thread_local.h"
namespace rocksdb { namespace rocksdb {

View File

@ -58,52 +58,52 @@ TEST(ThreadLocalTest, UniqueIdTest) {
port::Mutex mu;
port::CondVar cv(&mu);
ASSERT_EQ(IDChecker::PeekId(), 0u);
// New ThreadLocal instance bumps id by 1
{
// Id used 0
Params p1(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 1u);
// Id used 1
Params p2(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 2u);
// Id used 2
Params p3(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 3u);
// Id used 3
Params p4(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 4u);
}
// id 3, 2, 1, 0 are in the free queue in order
ASSERT_EQ(IDChecker::PeekId(), 0u);
// pick up 0
Params p1(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 1u);
// pick up 1
Params* p2 = new Params(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 2u);
// pick up 2
Params p3(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 3u);
// return up 1
delete p2;
ASSERT_EQ(IDChecker::PeekId(), 1u);
// Now we have 3, 1 in queue
// pick up 1
Params p4(&mu, &cv, nullptr, 1u);
ASSERT_EQ(IDChecker::PeekId(), 3u);
// pick up 3
Params p5(&mu, &cv, nullptr, 1u);
// next new id
ASSERT_EQ(IDChecker::PeekId(), 4u);
// After exit, id sequence in queue:
// 3, 1, 2, 0
}
TEST(ThreadLocalTest, SequentialReadWriteTest) {
// global id list carries over 3, 1, 2, 0
ASSERT_EQ(IDChecker::PeekId(), 0u);
port::Mutex mu;
port::CondVar cv(&mu);
@ -133,7 +133,7 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) {
};
for (int iter = 0; iter < 1024; ++iter) {
ASSERT_EQ(IDChecker::PeekId(), 1u);
// Another new thread, read/write should not see value from previous thread
env_->StartThread(func, static_cast<void*>(&p));
mu.Lock();
@ -141,13 +141,13 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) {
cv.Wait();
}
mu.Unlock();
ASSERT_EQ(IDChecker::PeekId(), 1u);
}
}
TEST(ThreadLocalTest, ConcurrentReadWriteTest) {
// global id list carries over 3, 1, 2, 0
ASSERT_EQ(IDChecker::PeekId(), 0u);
ThreadLocalPtr tls2;
port::Mutex mu1;
@ -226,11 +226,11 @@ TEST(ThreadLocalTest, ConcurrentReadWriteTest) {
}
mu2.Unlock();
ASSERT_EQ(IDChecker::PeekId(), 3u);
}
TEST(ThreadLocalTest, Unref) {
ASSERT_EQ(IDChecker::PeekId(), 0u);
auto unref = [](void* ptr) {
auto& p = *static_cast<Params*>(ptr);