Compare commits

...

25 Commits

Author SHA1 Message Date
Islam AbdelRahman
4d7b0d528c Fix DeleteScheduler::MarkAsTrash() handling existing trash
Summary:
DeleteScheduler::MarkAsTrash() doesn't handle existing .trash files correctly.
This causes RocksDB to be unable to delete existing .trash files on restart.
Closes https://github.com/facebook/rocksdb/pull/3261

Differential Revision: D6548003

Pulled By: IslamAbdelRahman

fbshipit-source-id: c3800639412e587a690062c63076a5a08881e0e6
2017-12-21 09:56:02 -08:00
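A rough usage sketch of the setup this fix targets: opening a DB with a rate-limited `SstFileManager`, mirroring the call used in the new `OpenDBWithExistingTrash` test further down this page. The DB path and the 1 MB/s rate are arbitrary choices for illustration.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Rate-limited deletions: obsolete SST files are first renamed to *.trash
  // and then erased in the background at ~1 MB/s. With this fix, *.trash
  // files left behind by a previous run are cleaned up again on reopen.
  options.sst_file_manager.reset(rocksdb::NewSstFileManager(
      rocksdb::Env::Default(), nullptr /* info_log */, "" /* trash_dir */,
      1024 * 1024 /* rate_bytes_per_sec */));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/trash_demo", &db);
  delete db;
  return s.ok() ? 0 : 1;
}
```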
Andrew Kryczka
8486eab5ad Preserve overlapping file endpoint invariant
Summary:
Fix for #2833.

- In `DeleteFilesInRange`, use `GetCleanInputsWithinInterval` instead of `GetOverlappingInputs` to make sure we get a clean cut set of files to delete.
- In `GetCleanInputsWithinInterval`, support nullptr as `begin_key` or `end_key`.
- In `GetOverlappingInputsRangeBinarySearch`, move the assertion for non-empty range away from `ExtendFileRangeWithinInterval`, which should be allowed to return an empty range (via `end_index < begin_index`).
Closes https://github.com/facebook/rocksdb/pull/2843

Differential Revision: D5772387

Pulled By: ajkr

fbshipit-source-id: e554e8461823c6be82b21a9262a2da02b3957881
2017-12-07 09:14:44 -08:00
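A minimal sketch of calling `DeleteFilesInRange` as exercised by the fix above. The DB path and key range are arbitrary, and the `rocksdb/convenience.h` header is an assumption.

```cpp
#include "rocksdb/convenience.h"  // DeleteFilesInRange (header assumed)
#include "rocksdb/db.h"
#include "rocksdb/slice.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/delete_files_demo", &db);
  if (!s.ok()) return 1;

  // Drop whole SST files lying inside ["a", "m"). After this fix only a
  // "clean cut" of files is selected, so a file whose endpoint user key
  // overlaps a neighboring file is kept rather than split away, and older
  // versions of that key cannot reappear.
  rocksdb::Slice begin("a");
  rocksdb::Slice end("m");
  s = rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(), &begin, &end);

  delete db;
  return s.ok() ? 0 : 1;
}
```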
Manuel Ung
497cd90d6a Add lock wait time as a perf context counter
Summary:
Adds two new counters:

`key_lock_wait_count` counts how many times a lock request was blocked by another transaction and had to wait, instead of being granted immediately.
`key_lock_wait_time` counts the time spent acquiring key locks.
Closes https://github.com/facebook/rocksdb/pull/3107

Differential Revision: D6217332

Pulled By: lth

fbshipit-source-id: 55d4f46da5550c333e523263422fd61d6a46deb9
2017-12-06 14:31:05 -08:00
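A hedged sketch of reading the two new counters through the perf context, assuming a pessimistic `TransactionDB` and the usual perf-level plumbing; the DB path is arbitrary and the single uncontended transaction would leave both counters at zero.

```cpp
#include <iostream>

#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"
#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::TransactionDBOptions txn_db_options;
  rocksdb::TransactionDB* txn_db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/lock_wait_demo", &txn_db);
  if (!s.ok()) return 1;

  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  rocksdb::get_perf_context()->Reset();

  rocksdb::Transaction* txn = txn_db->BeginTransaction(rocksdb::WriteOptions());
  txn->Put("key", "value");  // takes the key lock; contention would bump the counters
  txn->Commit();
  delete txn;

  // The two counters added by this commit.
  std::cout << "key_lock_wait_count: "
            << rocksdb::get_perf_context()->key_lock_wait_count << "\n"
            << "key_lock_wait_time: "
            << rocksdb::get_perf_context()->key_lock_wait_time << "\n";

  delete txn_db;
  return 0;
}
```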
Maysam Yabandeh
4d06d2862d WritePrepared Txn: Refactor conf params
Summary:
Summary of changes:
- Move seq_per_batch out of Options
- Rename concurrent_prepare to two_write_queues
- Add allocate_seq_only_for_data_
Closes https://github.com/facebook/rocksdb/pull/3136

Differential Revision: D6304458

Pulled By: maysamyabandeh

fbshipit-source-id: 08e685bfa82bbc41b5b1c5eb7040a8ca6e05e58c
2017-12-06 14:30:46 -08:00
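A minimal sketch of the renamed option as it reads after this refactor, using the same pairing the `kConcurrentWALWrites` test configuration adopts elsewhere in this diff; the DB path is arbitrary.

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // After this refactor the public knob is two_write_queues (formerly
  // concurrent_prepare); seq_per_batch is no longer exposed through Options.
  options.two_write_queues = true;
  options.manual_wal_flush = true;  // pairing used by the kConcurrentWALWrites test config

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/two_write_queues_demo", &db);
  delete db;
  return s.ok() ? 0 : 1;
}
```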
Andres Suarez
d1939b0dca Remove import use from TARGETS
Summary:
We're moving away from `import`. The equivalent internal construct that
gets the directory from `fbcode/` is `package_name()`. This is a
Skylark friendly wrapper around [`get_base_path`].

The additional whitespace change is from running `python ./buckifier/buckify_rocksdb.py`.

[`get_base_path`]: https://buckbuild.com/function/get_base_path.html
Closes https://github.com/facebook/rocksdb/pull/3210

Reviewed By: yiwu-arbug

Differential Revision: D6451242

Pulled By: zertosh

fbshipit-source-id: 445757261de0ec89d5d332c1ba9af097086326dc
2017-11-30 16:05:36 -08:00
Yi Wu
920b7df154 TARGETS file not include tests in opt mode
Summary:
Do not build the tests in opt mode, since SyncPoint and other test code will not be included.
Closes https://github.com/facebook/rocksdb/pull/3204

Differential Revision: D6431154

Pulled By: yiwu-arbug

fbshipit-source-id: c404ef042c1a6f679e5c1dc57600b3d8cb52fc28
2017-11-30 12:15:53 -08:00
Yi Wu
7b3bc81f82 Bump version to 5.9.1 2017-11-28 12:09:13 -08:00
Yi Wu
84afb8a524 Blob DB: Add statistics
Summary:
Adding a list of blob db counters.

Also remove WaStats(), which doesn't expose the stats and can be substituted by (BLOB_DB_BYTES_WRITTEN / BLOB_DB_BLOB_FILE_BYTES_WRITTEN).
Closes https://github.com/facebook/rocksdb/pull/3193

Differential Revision: D6394216

Pulled By: yiwu-arbug

fbshipit-source-id: 017508c8ff3fcd7ea7403c64d0f9834b24816803
2017-11-28 12:07:49 -08:00
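A hedged sketch of deriving the ratio named in the commit message from statistics tickers, assuming the new blob db counters are exposed through the public `Tickers` enum; opening and writing to the blob DB is elided.

```cpp
#include <cstdint>
#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();

  // ... open a blob DB with these options and write some data ...

  // Per the commit message, write amplification can be derived from these two
  // tickers instead of the removed WaStats().
  const uint64_t user_bytes =
      options.statistics->getTickerCount(rocksdb::BLOB_DB_BYTES_WRITTEN);
  const uint64_t file_bytes =
      options.statistics->getTickerCount(rocksdb::BLOB_DB_BLOB_FILE_BYTES_WRITTEN);
  if (file_bytes > 0) {
    std::cout << "BLOB_DB_BYTES_WRITTEN / BLOB_DB_BLOB_FILE_BYTES_WRITTEN = "
              << static_cast<double>(user_bytes) / file_bytes << "\n";
  }
  return 0;
}
```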
Yi Wu
3c3f7b12ec Fix IOError on WAL write doesn't propagate to write group follower
Summary:
This is a simpler version of #3097 with all unrelated changes removed.

Fixing the bug where concurrent writes may get Status::OK while they actually hit an IOError on the WAL write. This happens when multiple writes form a write batch group and the leader gets an IOError while writing to the WAL. The leader failed to pass the error to the followers in the group, so the followers end up returning Status::OK() while actually having written nothing. The bug only affects writes in a batch group; future writes after the batch group correctly return immediately with the IOError.
Closes https://github.com/facebook/rocksdb/pull/3201

Differential Revision: D6421644

Pulled By: yiwu-arbug

fbshipit-source-id: 1c2a455c5b73f6842423785eb8a9dbfbb191dc0e
2017-11-28 12:06:55 -08:00
Yi Wu
9debbba3a5 Blob DB: Fix GC handling for inlined blob
Summary:
Garbage collection checks if the offset in the blob index matches the offset of the blob value in the file. If it is a mismatch, the value is the current version. However, it failed to check whether the blob index is an inlined type, which doesn't even have an offset. Fixing it.
Closes https://github.com/facebook/rocksdb/pull/3194

Differential Revision: D6394270

Pulled By: yiwu-arbug

fbshipit-source-id: 7c2b9d795f1116f55f4d728086980f9b6e88ea78
2017-11-28 12:06:48 -08:00
Maysam Yabandeh
3984b44095 Fix calculating filter partition target size
Summary:
block_size_deviation is a percentage while the partition size is in bytes. The current code fails to take that into account, resulting in a very large target size for filter partitions.
Closes https://github.com/facebook/rocksdb/pull/3187

Differential Revision: D6376069

Pulled By: maysamyabandeh

fbshipit-source-id: 276546fc68f50e0da32c462abb46f6cf676db9b2
2017-11-28 12:06:36 -08:00
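A hypothetical illustration (names made up, not the commit's code) of the unit mismatch described above: the deviation is a percentage of the configured partition size, so the target must be scaled by it rather than combined with it as raw bytes.

```cpp
#include <cstdint>

// Illustrative only: compute the filter partition target size given a
// configured partition size (bytes) and block_size_deviation (a percentage).
uint64_t FilterPartitionTargetSize(uint64_t configured_partition_size,
                                   int block_size_deviation_pct) {
  // Buggy flavor: forgetting that the deviation is a percentage leaves the
  // result roughly 100x too large.
  // return configured_partition_size * (100 - block_size_deviation_pct);

  // Corrected: scale the target by the allowed deviation percentage.
  return configured_partition_size *
         static_cast<uint64_t>(100 - block_size_deviation_pct) / 100;
}
```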
Zhongyi Xie
28d93aea53 Revert "No need for Restart Interval for meta blocks"
Summary:
See [issue 3169](https://github.com/facebook/rocksdb/issues/3169) for more information

This reverts commit 593d3de371.
Closes https://github.com/facebook/rocksdb/pull/3188

Differential Revision: D6379271

Pulled By: miasantreble

fbshipit-source-id: 88f9ed67ba52237ad9b6f7251db83672b62d7537
2017-11-20 16:50:31 -08:00
Yi Wu
5652b6e57f Fix TARGETS lint warnings.
Summary:
Fix buckifier script and regenerate TARGETS file with no lint warnings.
Closes https://github.com/facebook/rocksdb/pull/3170

Differential Revision: D6328993

Pulled By: yiwu-arbug

fbshipit-source-id: 17d0e4ed92f676f35fed76659386611cc72b00b2
2017-11-19 19:11:35 -08:00
Yi Wu
adee21951b Blob DB: not using PinnableSlice move assignment
Summary:
The current implementation of PinnableSlice move assignment has an issue (#3163). We are moving away from it instead of trying to get the move assignment right, since it is too tricky.
Closes https://github.com/facebook/rocksdb/pull/3164

Differential Revision: D6319201

Pulled By: yiwu-arbug

fbshipit-source-id: 8f3279021f3710da4a4caa14fd238ed2df902c48
2017-11-19 19:08:48 -08:00
Yi Wu
e58d377182 Blob DB: Fix race condition between flush and write
Summary:
A race condition will happen when:
* a user thread writes a value, but hits the write stop condition because there are too many un-flushed memtables, while holding blob_db_impl.write_mutex_.
* a flush is triggered, calls the flush begin listener, and tries to acquire blob_db_impl.write_mutex_.

Fixing it.
Closes https://github.com/facebook/rocksdb/pull/3149

Differential Revision: D6279805

Pulled By: yiwu-arbug

fbshipit-source-id: 0e3c58afb78795ebe3360a2c69e05651e3908c40
2017-11-19 19:08:34 -08:00
Yi Wu
3e2998658f Blob DB: Fix release build
Summary:
`compression` shadows the method name in `BlobFile`. Rename it.
Closes https://github.com/facebook/rocksdb/pull/3148

Differential Revision: D6274498

Pulled By: yiwu-arbug

fbshipit-source-id: 7d293596530998b23b6b8a8940f983f9b6343a98
2017-11-19 19:07:56 -08:00
Yi Wu
c468fd127b Blob DB: use compression in file header instead of global options
Summary:
Fix the issue of failing to decompress an existing value after reopening the DB with different compression settings.
Closes https://github.com/facebook/rocksdb/pull/3142

Differential Revision: D6267260

Pulled By: yiwu-arbug

fbshipit-source-id: c7cf7f3e33b0cd25520abf4771cdf9180cc02a5f
2017-11-19 19:07:50 -08:00
Yi Wu
c3efe60855 Fix PinnableSlice move assignment
Summary:
After move assignment, we need to re-initialize the moved PinnableSlice.

Also update blob_db_impl.cc to not reuse the moved PinnableSlice, since it is supposed to be in an undefined state after the move.
Closes https://github.com/facebook/rocksdb/pull/3127

Differential Revision: D6238585

Pulled By: yiwu-arbug

fbshipit-source-id: bd99f2e37406c4f7de160c7dee6a2e8126bc224e
2017-11-19 19:07:43 -08:00
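A self-contained sketch of the pattern the two PinnableSlice commits above describe, using a made-up `PinnedBuffer` class rather than `PinnableSlice` itself: after the pinned state is moved out, the source is re-initialized to a clean state instead of being left half-moved.

```cpp
#include <string>
#include <utility>

// Made-up class for illustration only (not RocksDB's PinnableSlice).
class PinnedBuffer {
 public:
  PinnedBuffer() = default;

  PinnedBuffer& operator=(PinnedBuffer&& other) noexcept {
    if (this != &other) {
      data_ = std::move(other.data_);
      pinned_ = other.pinned_;
      other.data_.clear();    // re-initialize the moved-from object ...
      other.pinned_ = false;  // ... though callers still should not reuse it
    }
    return *this;
  }

  void Pin(std::string value) {
    data_ = std::move(value);
    pinned_ = true;
  }
  bool pinned() const { return pinned_; }
  const std::string& data() const { return data_; }

 private:
  std::string data_;
  bool pinned_ = false;
};
```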
Yi Wu
a0016dc358 Blob DB: Fix BlobDBTest::SnapshotAndGarbageCollection asan failure
Summary:
Fix unreleased snapshot at the end of the test.
Closes https://github.com/facebook/rocksdb/pull/3126

Differential Revision: D6232867

Pulled By: yiwu-arbug

fbshipit-source-id: 651ca3144fc573ea2ab0ab20f0a752fb4a101d26
2017-11-19 19:07:34 -08:00
Yi Wu
49e764a468 Blob DB: Add compaction filter to remove expired blob index entries
Summary:
After adding expiration to blob index in #3066, we are now able to add a compaction filter to cleanup expired blob index entries.
Closes https://github.com/facebook/rocksdb/pull/3090

Differential Revision: D6183812

Pulled By: yiwu-arbug

fbshipit-source-id: 9cb03267a9702975290e758c9c176a2c03530b83
2017-11-19 19:07:26 -08:00
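A hedged sketch of a compaction filter in the spirit of this change, using the `kBlobIndex` value type that appears in the compaction_iterator diff later on this page. The `IsExpiredBlobIndex` helper is a made-up stand-in for Blob DB's internal blob index decoding.

```cpp
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Made-up stand-in: the real filter decodes the blob index and compares its
// expiration against the current time. Here nothing ever expires.
inline bool IsExpiredBlobIndex(const rocksdb::Slice& /*blob_index_value*/) {
  return false;
}

// Sketch: when compaction hands us a kBlobIndex value whose TTL has passed,
// drop the key.
class ExpiredBlobIndexFilter : public rocksdb::CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const rocksdb::Slice& /*key*/,
                    ValueType value_type, const rocksdb::Slice& existing_value,
                    std::string* /*new_value*/,
                    std::string* /*skip_until*/) const override {
    if (value_type == ValueType::kBlobIndex &&
        IsExpiredBlobIndex(existing_value)) {
      return Decision::kRemove;
    }
    return Decision::kKeep;
  }

  const char* Name() const override { return "ExpiredBlobIndexFilter"; }
};
```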
Yi Wu
7c27f3ddc6 Blob DB: fix snapshot handling
Summary:
Blob db keeps a blob file if data in the file is visible to an active snapshot. Before this patch it checked whether there is an active snapshot with a sequence number greater than the earliest sequence in the file. This is problematic since we take a snapshot on every read: if reads keep coming, old blob files are never cleaned up. Change it to check whether there is an active snapshot that falls in the range [earliest_sequence, obsolete_sequence), where obsolete_sequence is
1. if the data was relocated to another file by garbage collection, the latest sequence at the time garbage collection finished
2. otherwise, the latest sequence of the file
Closes https://github.com/facebook/rocksdb/pull/3087

Differential Revision: D6182519

Pulled By: yiwu-arbug

fbshipit-source-id: cdf4c35281f782eb2a9ad6a87b6727bbdff27a45
2017-11-19 19:07:15 -08:00
Sagar Vemuri
37db5f3e89 Blob DB: Evict oldest blob file when close to blob db size limit
Summary:
Evict the oldest blob file and put it in the obsolete_files list when close to the blob db size limit. The file will be deleted when the `DeleteObsoleteFiles` background job runs next time.
For now I set `kEvictOldestFileAtSize`, the constant which controls when to evict the oldest file, at 90%. It could be tweaked or made into an option if really needed; I didn't want to expose it as an option prematurely as there are already too many :) .
Closes https://github.com/facebook/rocksdb/pull/3094

Differential Revision: D6187340

Pulled By: sagar0

fbshipit-source-id: 687f8262101b9301bf964b94025a2fe9d8573421
2017-11-19 19:06:56 -08:00
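A hypothetical sketch (names made up) of the 90% eviction trigger described above.

```cpp
#include <cstdint>

// Illustrative only: once usage crosses 90% of the configured blob db size
// limit, the oldest blob file is moved to the obsolete_files list and deleted
// later by the DeleteObsoleteFiles background job.
constexpr double kEvictOldestFileAtSize = 0.9;

bool ShouldEvictOldestBlobFile(uint64_t live_blob_bytes,
                               uint64_t blob_db_size_limit) {
  return blob_db_size_limit > 0 &&
         live_blob_bytes >= static_cast<uint64_t>(
                                kEvictOldestFileAtSize * blob_db_size_limit);
}
```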
Yi Wu
cb9104c92c Blob DB: option to enable garbage collection
Summary:
Add an option to enable/disable auto garbage collection, where we keep counting how many keys have been evicted by either deletion or compaction and decide whether to garbage collect a blob file.

Auto garbage collection is disabled by default for now, since the whole logic is not fully tested and we plan to make major changes to it.
Closes https://github.com/facebook/rocksdb/pull/3117

Differential Revision: D6224756

Pulled By: yiwu-arbug

fbshipit-source-id: cdf53bdccec96a4580a2b3a342110ad9e8864dfe
2017-11-19 19:04:05 -08:00
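A hypothetical sketch (names and threshold made up) of the counting-based decision the commit describes: track how many of a blob file's keys were evicted by deletion or compaction, and collect the file once enough of it is dead.

```cpp
#include <cstdint>

// Illustrative only: per-blob-file bookkeeping for the GC decision.
struct BlobFileGcStats {
  uint64_t total_keys = 0;
  uint64_t evicted_keys = 0;  // bumped on deletion or compaction eviction
};

bool ShouldGarbageCollect(const BlobFileGcStats& stats,
                          double garbage_ratio_threshold = 0.5) {
  if (stats.total_keys == 0) {
    return false;
  }
  const double garbage_ratio =
      static_cast<double>(stats.evicted_keys) / stats.total_keys;
  return garbage_ratio >= garbage_ratio_threshold;
}
```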
Yi Wu
f1b040c14a Blob DB: Fix flaky BlobDBTest::GCExpiredKeyWhileOverwriting test
Summary:
The test intends to wait until the key is overwritten before proceeding with garbage collection. It failed to wait for `PutUntil` to actually finish. Fixing it.
Closes https://github.com/facebook/rocksdb/pull/3116

Differential Revision: D6222833

Pulled By: yiwu-arbug

fbshipit-source-id: fa9b57a772b92a66cf250b44e7975c43f62f45c5
2017-11-03 17:11:49 -07:00
Andrew Kryczka
d070003313 dynamically change current memtable size
Summary:
Previously setting `write_buffer_size` with `SetOptions` would only apply to new memtables. An internal user wanted it to take effect immediately, instead of at an arbitrary future point, to prevent OOM.

This PR makes the memtable's size mutable and makes `SetOptions()` mutate it. There is one case where we preserve the old behavior: when the memtable prefix bloom filter is enabled and the user is increasing the memtable's capacity. That's because the prefix bloom filter's size is fixed and wouldn't work as well on a larger memtable.
Closes https://github.com/facebook/rocksdb/pull/3119

Differential Revision: D6228304

Pulled By: ajkr

fbshipit-source-id: e44bd9d10a5f8c9d8c464bf7436070bb3eafdfc9
2017-11-03 17:11:17 -07:00
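A minimal usage sketch of shrinking `write_buffer_size` at runtime via `SetOptions`, matching the string-map form used in the updated test; the DB path and sizes are arbitrary.

```cpp
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.write_buffer_size = 64 << 20;  // 64MB memtables to start with

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/write_buffer_demo", &db);
  if (!s.ok()) return 1;

  // With this change, shrinking write_buffer_size applies to the *current*
  // memtable too: if it already exceeds the new limit, the next write marks
  // it full and schedules a flush instead of waiting for a future memtable.
  s = db->SetOptions({{"write_buffer_size", "8388608"}});  // 8MB

  delete db;
  return s.ok() ? 0 : 1;
}
```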
75 changed files with 2642 additions and 1276 deletions

View File

@ -811,7 +811,6 @@ if(WITH_TESTS)
util/hash_test.cc
util/heap_test.cc
util/rate_limiter_test.cc
util/slice_test.cc
util/slice_transform_test.cc
util/timer_queue_test.cc
util/thread_list_test.cc

View File

@ -1,8 +1,8 @@
# Rocksdb Change Log
## Unreleased
### Public API Change
### New Features
## 5.9.1 (11/28/2017)
### Bug Fixes
* Fix IOError on WAL write doesn't propagate to write group follower
* Fix calculating filter partition target size
## 5.9.0 (11/1/2017)
### Public API Change

View File

@ -494,7 +494,6 @@ TESTS = \
repair_test \
env_timed_test \
write_prepared_transaction_test \
slice_test \
PARALLEL_TEST = \
backupable_db_test \
@ -1478,9 +1477,6 @@ range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LI
blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
slice_test: util/slice_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
#-------------------------------------------------
# make install related stuff
INSTALL_PATH ?= /usr/local

961
TARGETS

File diff suppressed because it is too large.

View File

@ -3,10 +3,8 @@ from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import targets_cfg
import pprint
# TODO(tec): replace this with PrettyPrinter
def pretty_list(lst, indent=6):
def pretty_list(lst, indent=8):
if lst is None or len(lst) == 0:
return ""
@ -14,8 +12,8 @@ def pretty_list(lst, indent=6):
return "\"%s\"" % lst[0]
separator = "\",\n%s\"" % (" " * indent)
res = separator.join(lst)
res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 2))
res = separator.join(sorted(lst))
res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4))
return res
@ -27,7 +25,7 @@ class TARGETSBuilder:
self.total_lib = 0
self.total_bin = 0
self.total_test = 0
self.tests_cfg = []
self.tests_cfg = ""
def __del__(self):
self.targets_file.close()
@ -37,8 +35,8 @@ class TARGETSBuilder:
headers = "AutoHeaders.RECURSIVE_GLOB"
self.targets_file.write(targets_cfg.library_template % (
name,
headers,
pretty_list(srcs),
headers,
pretty_list(deps)))
self.total_lib = self.total_lib + 1
@ -53,13 +51,13 @@ class TARGETSBuilder:
exec_mode = "serial"
if is_parallel:
exec_mode = "parallel"
self.tests_cfg.append([test_name, str(src), str(exec_mode)])
self.tests_cfg += targets_cfg.test_cfg_template % (
test_name,
str(src),
str(exec_mode))
self.total_test = self.total_test + 1
def flush_tests(self):
self.targets_file.write(targets_cfg.unittests_template % (
pprint.PrettyPrinter().pformat(self.tests_cfg)
))
self.tests_cfg = []
self.targets_file.write(targets_cfg.unittests_template % self.tests_cfg)
self.tests_cfg = ""

View File

@ -2,13 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
rocksdb_target_header = """
import os
rocksdb_target_header = """REPO_PATH = package_name() + "/"
TARGETS_PATH = os.path.dirname(__file__)
REPO_PATH = "rocksdb/src/"
BUCK_BINS = "buck-out/gen/" + REPO_PATH
TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh"
rocksdb_compiler_flags = [
"-fno-builtin-memcmp",
"-DROCKSDB_PLATFORM_POSIX",
@ -33,13 +32,13 @@ rocksdb_compiler_flags = [
]
rocksdb_external_deps = [
('bzip2', None, 'bz2'),
('snappy', None, "snappy"),
('zlib', None, 'z'),
('gflags', None, 'gflags'),
('lz4', None, 'lz4'),
('zstd', None),
('tbb', None),
("bzip2", None, "bz2"),
("snappy", None, "snappy"),
("zlib", None, "z"),
("gflags", None, "gflags"),
("lz4", None, "lz4"),
("zstd", None),
("tbb", None),
("numa", None, "numa"),
("googletest", None, "gtest"),
]
@ -53,18 +52,27 @@ rocksdb_preprocessor_flags = [
rocksdb_arch_preprocessor_flags = {
"x86_64": ["-DHAVE_SSE42"],
}
build_mode = read_config("fbcode", "build_mode")
is_opt_mode = build_mode.startswith("opt")
# -DNDEBUG is added by default in opt mode in fbcode. But adding it twice
# doesn't harm and avoid forgetting to add it.
if is_opt_mode:
rocksdb_compiler_flags.append("-DNDEBUG")
"""
library_template = """
cpp_library(
name = "%s",
headers = %s,
srcs = [%s],
deps = [%s],
preprocessor_flags = rocksdb_preprocessor_flags,
headers = %s,
arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
compiler_flags = rocksdb_compiler_flags,
preprocessor_flags = rocksdb_preprocessor_flags,
deps = [%s],
external_deps = rocksdb_external_deps,
)
"""
@ -73,21 +81,31 @@ binary_template = """
cpp_binary(
name = "%s",
srcs = [%s],
deps = [%s],
preprocessor_flags = rocksdb_preprocessor_flags,
arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
compiler_flags = rocksdb_compiler_flags,
preprocessor_flags = rocksdb_preprocessor_flags,
deps = [%s],
external_deps = rocksdb_external_deps,
)
"""
test_cfg_template = """ [
"%s",
"%s",
"%s",
],
"""
unittests_template = """
# [test_name, test_src, test_type]
ROCKS_TESTS = %s
ROCKS_TESTS = [
%s]
# Generate a test rule for each entry in ROCKS_TESTS
for test_cfg in ROCKS_TESTS:
# Do not build the tests in opt mode, since SyncPoint and other test code
# will not be included.
if not is_opt_mode:
for test_cfg in ROCKS_TESTS:
test_name = test_cfg[0]
test_cc = test_cfg[1]
ttype = "gtest" if test_cfg[2] == "parallel" else "simple"
@ -112,13 +130,13 @@ for test_cfg in ROCKS_TESTS:
custom_unittest(
name = "make_rocksdbjavastatic",
type = "simple",
command = ["internal_repo_rocksdb/make_rocksdbjavastatic.sh"],
type = "simple",
)
custom_unittest(
name = "make_rocksdb_lite_release",
type = "simple",
command = ["internal_repo_rocksdb/make_rocksdb_lite_release.sh"],
type = "simple",
)
"""

View File

@ -949,6 +949,10 @@ void ColumnFamilyData::InstallSuperVersion(
RecalculateWriteStallConditions(mutable_cf_options);
if (old_superversion != nullptr) {
if (old_superversion->mutable_cf_options.write_buffer_size !=
mutable_cf_options.write_buffer_size) {
mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
}
if (old_superversion->write_stall_condition !=
new_superversion->write_stall_condition) {
sv_context->PushWriteStallNotification(

View File

@ -182,7 +182,8 @@ void CompactionIterator::Next() {
void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
Slice* skip_until) {
if (compaction_filter_ != nullptr && ikey_.type == kTypeValue &&
if (compaction_filter_ != nullptr &&
(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex) &&
(visible_at_tip_ || ikey_.sequence > latest_snapshot_ ||
ignore_snapshots_)) {
// If the user has specified a compaction filter and the sequence
@ -192,11 +193,13 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
CompactionFilter::Decision filter;
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
CompactionFilter::ValueType value_type =
ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
: CompactionFilter::ValueType::kBlobIndex;
{
StopWatchNano timer(env_, true);
filter = compaction_filter_->FilterV2(
compaction_->level(), ikey_.user_key,
CompactionFilter::ValueType::kValue, value_,
compaction_->level(), ikey_.user_key, value_type, value_,
&compaction_filter_value_, compaction_filter_skip_until_.rep());
iter_stats_.total_filter_time +=
env_ != nullptr ? timer.ElapsedNanos() : 0;

View File

@ -143,7 +143,7 @@ class CompactionJobTest : public testing::Test {
}
void SetLastSequence(const SequenceNumber sequence_number) {
versions_->SetLastToBeWrittenSequence(sequence_number + 1);
versions_->SetLastAllocatedSequence(sequence_number + 1);
versions_->SetLastSequence(sequence_number + 1);
}

View File

@ -1517,6 +1517,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) {
ASSERT_GT(old_num_files, new_num_files);
}
TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
// regression test for #2833: groups of files whose user-keys overlap at the
// endpoints could be split by `DeleteFilesInRange`. This caused old data to
// reappear, either because a new version of the key was removed, or a range
// deletion was partially dropped. It could also cause non-overlapping
// invariant to be violated if the files dropped by DeleteFilesInRange were
// a subset of files that a range deletion spans.
const int kNumL0Files = 2;
const int kValSize = 8 << 10; // 8KB
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kNumL0Files;
options.target_file_size_base = 1 << 10; // 1KB
DestroyAndReopen(options);
// The snapshot prevents key 1 from having its old version dropped. The low
// `target_file_size_base` ensures two keys will be in each output file.
const Snapshot* snapshot = nullptr;
Random rnd(301);
// The value indicates which flush the key belonged to, which is enough
// for us to determine the keys' relative ages. After L0 flushes finish,
// files look like:
//
// File 0: 0 -> vals[0], 1 -> vals[0]
// File 1: 1 -> vals[1], 2 -> vals[1]
//
// Then L0->L1 compaction happens, which outputs keys as follows:
//
// File 0: 0 -> vals[0], 1 -> vals[1]
// File 1: 1 -> vals[0], 2 -> vals[1]
//
// DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
// would cause `1 -> vals[0]` (an older key) to reappear.
std::string vals[kNumL0Files];
for (int i = 0; i < kNumL0Files; ++i) {
vals[i] = RandomString(&rnd, kValSize);
Put(Key(i), vals[i]);
Put(Key(i + 1), vals[i]);
Flush();
if (i == 0) {
snapshot = db_->GetSnapshot();
}
}
dbfull()->TEST_WaitForCompact();
// Verify `DeleteFilesInRange` can't drop only file 0 which would cause
// "1 -> vals[0]" to reappear.
Slice begin = Key(0);
Slice end = Key(1);
ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
ASSERT_EQ(vals[1], Get(Key(1)));
db_->ReleaseSnapshot(snapshot);
}
TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
int32_t trivial_move = 0;
int32_t non_trivial_move = 0;

View File

@ -136,7 +136,8 @@ void DumpSupportInfo(Logger* logger) {
int64_t kDefaultLowPriThrottledRate = 2 * 1024 * 1024;
} // namespace
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
const bool seq_per_batch)
: env_(options.env),
dbname_(dbname),
initial_db_options_(SanitizeOptions(dbname, options)),
@ -185,18 +186,30 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
env_options_, immutable_db_options_)),
num_running_ingest_file_(0),
#ifndef ROCKSDB_LITE
wal_manager_(immutable_db_options_, env_options_),
wal_manager_(immutable_db_options_, env_options_, seq_per_batch),
#endif // ROCKSDB_LITE
event_logger_(immutable_db_options_.info_log.get()),
bg_work_paused_(0),
bg_compaction_paused_(0),
refitting_level_(false),
opened_successfully_(false),
concurrent_prepare_(options.concurrent_prepare),
two_write_queues_(options.two_write_queues),
manual_wal_flush_(options.manual_wal_flush),
seq_per_batch_(options.seq_per_batch),
// TODO(myabandeh): revise this when we change options.seq_per_batch
use_custom_gc_(options.seq_per_batch),
seq_per_batch_(seq_per_batch),
// When two_write_queues_ and seq_per_batch_ are both enabled we
// sometimes allocate a seq also to indicate the commit timestmamp of a
// transaction. In such cases last_sequence_ would not indicate the last
// visible sequence number in memtable and should not be used for
// snapshots. It should use last_allocated_sequence_ instaed but also
// needs other mechanisms to exclude the data that after last_sequence_
// and before last_allocated_sequence_ from the snapshot. In
// WritePreparedTxn this property is ensured since such data are not
// committed yet.
allocate_seq_only_for_data_(!(seq_per_batch && options.two_write_queues)),
// Since seq_per_batch_ is currently set only by WritePreparedTxn which
// requires a custom gc for compaction, we use that to set use_custom_gc_
// as well.
use_custom_gc_(seq_per_batch),
preserve_deletes_(options.preserve_deletes) {
env_->GetAbsolutePath(dbname, &db_absolute_path_);
@ -751,7 +764,7 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const {
}
SequenceNumber DBImpl::IncAndFetchSequenceNumber() {
return versions_->FetchAddLastToBeWrittenSequence(1ull) + 1ull;
return versions_->FetchAddLastAllocatedSequence(1ull) + 1ull;
}
bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
@ -977,9 +990,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
// super versipon because a flush happening in between may compact
// away data for the snapshot, but the snapshot is earlier than the
// data overwriting it, so users may see wrong results.
snapshot = concurrent_prepare_ && seq_per_batch_
? versions_->LastToBeWrittenSequence()
: versions_->LastSequence();
snapshot = allocate_seq_only_for_data_ ? versions_->LastSequence()
: versions_->LastAllocatedSequence();
}
TEST_SYNC_POINT("DBImpl::GetImpl:3");
TEST_SYNC_POINT("DBImpl::GetImpl:4");
@ -1070,9 +1082,8 @@ std::vector<Status> DBImpl::MultiGet(
snapshot = reinterpret_cast<const SnapshotImpl*>(
read_options.snapshot)->number_;
} else {
snapshot = concurrent_prepare_ && seq_per_batch_
? versions_->LastToBeWrittenSequence()
: versions_->LastSequence();
snapshot = allocate_seq_only_for_data_ ? versions_->LastSequence()
: versions_->LastAllocatedSequence();
}
for (auto mgd_iter : multiget_cf_data) {
mgd_iter.second->super_version =
@ -1478,8 +1489,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
read_callback);
#endif
} else {
// Note: no need to consider the special case of concurrent_prepare_ &&
// seq_per_batch_ since NewIterator is overridden in WritePreparedTxnDB
// Note: no need to consider the special case of
// allocate_seq_only_for_data_==false since NewIterator is overridden in
// WritePreparedTxnDB
auto snapshot = read_options.snapshot != nullptr
? read_options.snapshot->GetSequenceNumber()
: versions_->LastSequence();
@ -1595,8 +1607,9 @@ Status DBImpl::NewIterators(
}
#endif
} else {
// Note: no need to consider the special case of concurrent_prepare_ &&
// seq_per_batch_ since NewIterators is overridden in WritePreparedTxnDB
// Note: no need to consider the special case of
// allocate_seq_only_for_data_==false since NewIterators is overridden in
// WritePreparedTxnDB
auto snapshot = read_options.snapshot != nullptr
? read_options.snapshot->GetSequenceNumber()
: versions_->LastSequence();
@ -1630,9 +1643,9 @@ const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) {
delete s;
return nullptr;
}
auto snapshot_seq = concurrent_prepare_ && seq_per_batch_
? versions_->LastToBeWrittenSequence()
: versions_->LastSequence();
auto snapshot_seq = allocate_seq_only_for_data_
? versions_->LastSequence()
: versions_->LastAllocatedSequence();
return snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
}
@ -1643,9 +1656,9 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
snapshots_.Delete(casted_s);
uint64_t oldest_snapshot;
if (snapshots_.empty()) {
oldest_snapshot = concurrent_prepare_ && seq_per_batch_
? versions_->LastToBeWrittenSequence()
: versions_->LastSequence();
oldest_snapshot = allocate_seq_only_for_data_
? versions_->LastSequence()
: versions_->LastAllocatedSequence();
} else {
oldest_snapshot = snapshots_.oldest()->number_;
}
@ -1663,12 +1676,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
delete casted_s;
}
bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) {
bool DBImpl::HasActiveSnapshotInRange(SequenceNumber lower_bound,
SequenceNumber upper_bound) {
InstrumentedMutexLock l(&mutex_);
if (snapshots_.empty()) {
return false;
}
return (snapshots_.newest()->GetSequenceNumber() >= sn);
return snapshots_.HasSnapshotInRange(lower_bound, upper_bound);
}
#ifndef ROCKSDB_LITE
@ -2147,17 +2158,12 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
end_key = &end_storage;
}
vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1,
nullptr, false);
vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key,
&level_files, -1 /* hint_index */,
nullptr /* file_index */);
FileMetaData* level_file;
for (uint32_t j = 0; j < level_files.size(); j++) {
level_file = level_files[j];
if (((begin == nullptr) ||
(cfd->internal_comparator().user_comparator()->Compare(
level_file->smallest.user_key(), *begin) >= 0)) &&
((end == nullptr) ||
(cfd->internal_comparator().user_comparator()->Compare(
level_file->largest.user_key(), *end) <= 0))) {
if (level_file->being_compacted) {
continue;
}
@ -2167,7 +2173,6 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
level_file->being_compacted = true;
}
}
}
if (edit.GetDeletedFiles().empty()) {
job_context.Clean();
return Status::OK();
@ -2755,7 +2760,7 @@ Status DBImpl::IngestExternalFile(
WriteThread::Writer w;
write_thread_.EnterUnbatched(&w, &mutex_);
WriteThread::Writer nonmem_w;
if (concurrent_prepare_) {
if (two_write_queues_) {
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
}
@ -2798,7 +2803,7 @@ Status DBImpl::IngestExternalFile(
}
// Resume writes to the DB
if (concurrent_prepare_) {
if (two_write_queues_) {
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
}
write_thread_.ExitUnbatched(&w);

View File

@ -68,7 +68,8 @@ struct MemTableInfo;
class DBImpl : public DB {
public:
DBImpl(const DBOptions& options, const std::string& dbname);
DBImpl(const DBOptions& options, const std::string& dbname,
const bool seq_per_batch = false);
virtual ~DBImpl();
// Implementations of the DB interface
@ -220,14 +221,16 @@ class DBImpl : public DB {
virtual SequenceNumber GetLatestSequenceNumber() const override;
virtual SequenceNumber IncAndFetchSequenceNumber();
// Returns LastToBeWrittenSequence in concurrent_prepare_ && seq_per_batch_
// mode and LastSequence otherwise. This is useful when visiblility depends
// also on data written to the WAL but not to the memtable.
SequenceNumber TEST_GetLatestVisibleSequenceNumber() const;
// Returns LastSequence in allocate_seq_only_for_data_
// mode and LastAllocatedSequence otherwise. This is useful when visiblility
// depends also on data written to the WAL but not to the memtable.
SequenceNumber TEST_GetLastVisibleSequence() const;
virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
bool HasActiveSnapshotLaterThanSN(SequenceNumber sn);
// Whether there is an active snapshot in range [lower_bound, upper_bound).
bool HasActiveSnapshotInRange(SequenceNumber lower_bound,
SequenceNumber upper_bound);
#ifndef ROCKSDB_LITE
using DB::ResetStats;
@ -604,6 +607,12 @@ class DBImpl : public DB {
Status NewDB();
// This is to be used only by internal rocksdb classes.
static Status Open(const DBOptions& db_options, const std::string& name,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
const bool seq_per_batch);
protected:
Env* const env_;
const std::string dbname_;
@ -903,12 +912,12 @@ class DBImpl : public DB {
FileLock* db_lock_;
// In addition to mutex_, log_write_mutex_ protected writes to logs_ and
// logfile_number_. With concurrent_prepare it also protects alive_log_files_,
// logfile_number_. With two_write_queues it also protects alive_log_files_,
// and log_empty_. Refer to the definition of each variable below for more
// details.
InstrumentedMutex log_write_mutex_;
// State below is protected by mutex_
// With concurrent_prepare enabled, some of the variables that accessed during
// With two_write_queues enabled, some of the variables that accessed during
// WriteToWAL need different synchronization: log_empty_, alive_log_files_,
// logs_, logfile_number_. Refer to the definition of each variable below for
// more description.
@ -933,10 +942,10 @@ class DBImpl : public DB {
std::deque<uint64_t>
log_recycle_files; // a list of log files that we can recycle
bool log_dir_synced_;
// Without concurrent_prepare, read and writes to log_empty_ are protected by
// Without two_write_queues, read and writes to log_empty_ are protected by
// mutex_. Since it is currently updated/read only in write_thread_, it can be
// accessed from the same write_thread_ without any locks. With
// concurrent_prepare writes, where it can be updated in different threads,
// two_write_queues writes, where it can be updated in different threads,
// read and writes are protected by log_write_mutex_ instead. This is to avoid
// expesnive mutex_ lock during WAL write, which update log_empty_.
bool log_empty_;
@ -973,10 +982,10 @@ class DBImpl : public DB {
// true for some prefix of logs_
bool getting_synced = false;
};
// Without concurrent_prepare, read and writes to alive_log_files_ are
// Without two_write_queues, read and writes to alive_log_files_ are
// protected by mutex_. However since back() is never popped, and push_back()
// is done only from write_thread_, the same thread can access the item
// reffered by back() without mutex_. With concurrent_prepare_, writes
// reffered by back() without mutex_. With two_write_queues_, writes
// are protected by locking both mutex_ and log_write_mutex_, and reads must
// be under either mutex_ or log_write_mutex_.
std::deque<LogFileNumberSize> alive_log_files_;
@ -1001,7 +1010,7 @@ class DBImpl : public DB {
// memtable on normal writes and hence improving the throughput. Each new
// write of the state will replace the previous state entirely even if the
// keys in the two consecuitive states do not overlap.
// It is protected by log_write_mutex_ when concurrent_prepare_ is enabled.
// It is protected by log_write_mutex_ when two_write_queues_ is enabled.
// Otherwise only the heaad of write_thread_ can access it.
WriteBatch cached_recoverable_state_;
std::atomic<bool> cached_recoverable_state_empty_ = {true};
@ -1317,9 +1326,22 @@ class DBImpl : public DB {
// When set, we use a seprate queue for writes that dont write to memtable. In
// 2PC these are the writes at Prepare phase.
const bool concurrent_prepare_;
const bool two_write_queues_;
const bool manual_wal_flush_;
// Increase the sequence number after writing each batch, whether memtable is
// disabled for that or not. Otherwise the sequence number is increased after
// writing each key into memtable. This implies that when disable_memtable is
// set, the seq is not increased at all.
//
// Default: false
const bool seq_per_batch_;
// A sequence number is allocated only for data written to DB. Otherwise it
// could also be allocated for operational purposes such as commit timestamp
// of a transaction.
const bool allocate_seq_only_for_data_;
// It indicates that a customized gc algorithm must be used for
// flush/compaction and if it is not provided vis SnapshotChecker, we should
// disable gc to be safe.
const bool use_custom_gc_;
// Clients must periodically call SetPreserveDeletesSequenceNumber()

View File

@ -209,11 +209,11 @@ int DBImpl::TEST_BGFlushesAllowed() const {
return GetBGJobLimits().max_flushes;
}
SequenceNumber DBImpl::TEST_GetLatestVisibleSequenceNumber() const {
if (concurrent_prepare_ && seq_per_batch_) {
return versions_->LastToBeWrittenSequence();
} else {
SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
if (allocate_seq_only_for_data_) {
return versions_->LastSequence();
} else {
return versions_->LastAllocatedSequence();
}
}

View File

@ -252,11 +252,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
}
job_context->size_log_to_delete += earliest.size;
total_log_size_ -= earliest.size;
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Lock();
}
alive_log_files_.pop_front();
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
// Current log should always stay alive since it can't have

View File

@ -592,9 +592,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
// happen when we open and write to a corrupted DB, where sequence id
// will start from the last sequence id we recovered.
if (sequence == *next_sequence ||
// With seq_per_batch_, if previous run was with concurrent_prepare_
// then gap in the sequence numbers is expected by the commits
// without prepares.
// With seq_per_batch_, if previous run was with two_write_queues_
// then allocate_seq_only_for_data_ was disabled and a gap in the
// sequence numbers in the log is expected by the commits without
// prepares.
(seq_per_batch_ && sequence >= *next_sequence)) {
stop_replay_for_corruption = false;
}
@ -754,7 +755,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
auto last_sequence = *next_sequence - 1;
if ((*next_sequence != kMaxSequenceNumber) &&
(versions_->LastSequence() <= last_sequence)) {
versions_->SetLastToBeWrittenSequence(last_sequence);
versions_->SetLastAllocatedSequence(last_sequence);
versions_->SetLastSequence(last_sequence);
}
}
@ -845,13 +846,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
if (data_seen && !flushed) {
// Mark these as alive so they'll be considered for deletion later by
// FindObsoleteFiles()
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Lock();
}
for (auto log_number : log_numbers) {
alive_log_files_.push_back(LogFileNumberSize(log_number));
}
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
}
@ -966,6 +967,15 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
Status DB::Open(const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
const bool seq_per_batch = true;
return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
!seq_per_batch);
}
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
const bool seq_per_batch) {
Status s = SanitizeOptionsByTable(db_options, column_families);
if (!s.ok()) {
return s;
@ -985,7 +995,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
std::max(max_write_buffer_size, cf.options.write_buffer_size);
}
DBImpl* impl = new DBImpl(db_options, dbname);
DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch);
s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
if (s.ok()) {
for (auto db_path : impl->immutable_db_options_.db_paths) {
@ -1070,12 +1080,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
}
sv_context.Clean();
if (impl->concurrent_prepare_) {
if (impl->two_write_queues_) {
impl->log_write_mutex_.Lock();
}
impl->alive_log_files_.push_back(
DBImpl::LogFileNumberSize(impl->logfile_number_));
if (impl->concurrent_prepare_) {
if (impl->two_write_queues_) {
impl->log_write_mutex_.Unlock();
}
impl->DeleteObsoleteFiles();

View File

@ -67,7 +67,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if (write_options.sync && write_options.disableWAL) {
return Status::InvalidArgument("Sync writes has to enable WAL.");
}
if (concurrent_prepare_ && immutable_db_options_.enable_pipelined_write) {
if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
return Status::NotSupported(
"pipelined_writes is not compatible with concurrent prepares");
}
@ -87,7 +87,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
}
}
if (concurrent_prepare_ && disable_memtable) {
if (two_write_queues_ && disable_memtable) {
return WriteImplWALOnly(write_options, my_batch, callback, log_used,
log_ref, seq_used);
}
@ -154,7 +154,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
WriteThread::WriteGroup write_group;
bool in_parallel_group = false;
uint64_t last_sequence = kMaxSequenceNumber;
if (!concurrent_prepare_) {
if (!two_write_queues_) {
last_sequence = versions_->LastSequence();
}
@ -162,7 +162,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
bool need_log_sync = write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
if (!concurrent_prepare_ || !disable_memtable) {
if (!two_write_queues_ || !disable_memtable) {
// With concurrent writes we do preprocess only in the write thread that
// also does write to memtable to avoid sync issue on shared data structure
// with the other thread
@ -209,7 +209,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
}
size_t seq_inc = seq_per_batch_ ? write_group.size : total_count;
const bool concurrent_update = concurrent_prepare_;
const bool concurrent_update = two_write_queues_;
// Update stats while we are an exclusive group leader, so we know
// that nobody else can be writing to these particular stats.
// We're optimistic, updating the stats before we successfully
@ -237,7 +237,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
PERF_TIMER_STOP(write_pre_and_post_process_time);
if (!concurrent_prepare_) {
if (!two_write_queues_) {
if (status.ok() && !write_options.disableWAL) {
PERF_TIMER_GUARD(write_wal_time);
status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
@ -246,13 +246,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
} else {
if (status.ok() && !write_options.disableWAL) {
PERF_TIMER_GUARD(write_wal_time);
// LastToBeWrittenSequence is increased inside WriteToWAL under
// LastAllocatedSequence is increased inside WriteToWAL under
// wal_write_mutex_ to ensure ordered events in WAL
status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
seq_inc);
} else {
// Otherwise we inc seq number for memtable writes
last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
}
}
assert(last_sequence != kMaxSequenceNumber);
@ -310,9 +310,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
mutex_.Lock();
MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
mutex_.Unlock();
// Requesting sync with concurrent_prepare_ is expected to be very rare. We
// Requesting sync with two_write_queues_ is expected to be very rare. We
// hance provide a simple implementation that is not necessarily efficient.
if (concurrent_prepare_) {
if (two_write_queues_) {
if (manual_wal_flush_) {
status = FlushWAL(true);
} else {
@ -332,7 +332,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
versions_->SetLastSequence(last_sequence);
}
MemTableInsertStatusCheck(w.status);
write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
write_thread_.ExitAsBatchGroupLeader(write_group, status);
}
if (status.ok()) {
@ -532,7 +532,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_wal_time);
// LastToBeWrittenSequence is increased inside WriteToWAL under
// LastAllocatedSequence is increased inside WriteToWAL under
// wal_write_mutex_ to ensure ordered events in WAL
size_t seq_inc = seq_per_batch_ ? write_group.size : 0 /*total_count*/;
status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
@ -548,7 +548,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
}
}
if (status.ok() && write_options.sync) {
// Requesting sync with concurrent_prepare_ is expected to be very rare. We
// Requesting sync with two_write_queues_ is expected to be very rare. We
// hance provide a simple implementation that is not necessarily efficient.
if (manual_wal_flush_) {
status = FlushWAL(true);
@ -561,7 +561,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
if (!w.CallbackFailed()) {
WriteCallbackStatusCheck(status);
}
nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status);
if (status.ok()) {
status = w.FinalStatus();
}
@ -719,7 +719,7 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
return merged_batch;
}
// When concurrent_prepare_ is disabled, this function is called from the only
// When two_write_queues_ is disabled, this function is called from the only
// write thread. Otherwise this must be called holding log_write_mutex_.
Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
log::Writer* log_writer, uint64_t* log_used,
@ -828,7 +828,7 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
writer->log_used = logfile_number_;
}
}
*last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
*last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
auto sequence = *last_sequence + 1;
WriteBatchInternal::SetSequence(merged_batch, sequence);
@ -858,7 +858,7 @@ Status DBImpl::WriteRecoverableState() {
if (!cached_recoverable_state_empty_) {
bool dont_care_bool;
SequenceNumber next_seq;
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Lock();
}
SequenceNumber seq = versions_->LastSequence();
@ -869,7 +869,7 @@ Status DBImpl::WriteRecoverableState() {
false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool,
seq_per_batch_);
versions_->SetLastSequence(--next_seq);
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
if (status.ok()) {
@ -1109,7 +1109,7 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd,
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
mutex_.AssertHeld();
WriteThread::Writer nonmem_w;
if (concurrent_prepare_) {
if (two_write_queues_) {
// SwitchMemtable is a rare event. To simply the reasoning, we make sure
// that there is no concurrent thread writing to WAL.
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
@ -1135,11 +1135,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// Attempt to switch to a new memtable and trigger flush of old.
// Do this without holding the dbmutex lock.
assert(versions_->prev_log_number() == 0);
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Lock();
}
bool creating_new_log = !log_empty_;
if (concurrent_prepare_) {
if (two_write_queues_) {
log_write_mutex_.Unlock();
}
uint64_t recycle_log_number = 0;
@ -1224,7 +1224,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
assert(creating_new_log);
assert(!new_mem);
assert(!new_log);
if (concurrent_prepare_) {
if (two_write_queues_) {
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
}
return s;
@ -1264,7 +1264,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
cfd->SetMemtable(new_mem);
InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
mutable_cf_options);
if (concurrent_prepare_) {
if (two_write_queues_) {
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
}
return s;

View File

@ -376,6 +376,30 @@ TEST_F(DBSSTTest, RateLimitedDelete) {
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
Options options = CurrentOptions();
options.sst_file_manager.reset(
NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
Destroy(last_options_);
// Add some trash files to the db directory so the DB can clean them up
env_->CreateDirIfMissing(dbname_);
ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
// Reopen the DB and verify that it deletes existing trash files
ASSERT_OK(TryReopen(options));
sfm->WaitForEmptyTrash();
ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
}
// Create a DB with 2 db_paths, and generate multiple files in the 2
// db_paths using CompactRangeOptions, make sure that files that were
// deleted from first db_path were deleted using DeleteScheduler and

View File

@ -3354,11 +3354,23 @@ TEST_F(DBTest, DynamicMemtableOptions) {
{"write_buffer_size", "131072"},
}));
// The existing memtable is still 64KB in size, after it becomes immutable,
// the next memtable will be 128KB in size. Write 256KB total, we should
// have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
gen_l0_kb(256);
ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A)
// The existing memtable inflated 64KB->128KB when we invoked SetOptions().
// Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
gen_l0_kb(192);
ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
// Decrease buffer size below current usage
ASSERT_OK(dbfull()->SetOptions({
{"write_buffer_size", "65536"},
}));
// The existing memtable became eligible for flush when we reduced its
// capacity to 64KB. Two keys need to be added to trigger flush: first causes
// memtable to be marked full, second schedules the flush. Then we should have
// a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
gen_l0_kb(2);
ASSERT_EQ(NumTableFilesAtLevel(0), 2);
ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);

View File

@ -486,7 +486,7 @@ Options DBTestBase::GetOptions(
}
case kConcurrentWALWrites: {
// This options optimize 2PC commit path
options.concurrent_prepare = true;
options.two_write_queues = true;
options.manual_wal_flush = true;
break;
}

View File

@ -730,7 +730,7 @@ class RecoveryTestHelper {
batch.Put(key, value);
WriteBatchInternal::SetSequence(&batch, seq);
current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
versions->SetLastToBeWrittenSequence(seq);
versions->SetLastAllocatedSequence(seq);
versions->SetLastSequence(seq);
}
}

View File

@ -3,12 +3,18 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include <atomic>
#include <memory>
#include <thread>
#include <vector>
#include "db/db_test_util.h"
#include "db/write_batch_internal.h"
#include "db/write_thread.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "util/fault_injection_test_env.h"
#include "util/string_util.h"
#include "util/sync_point.h"
namespace rocksdb {
@ -17,7 +23,9 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
public:
DBWriteTest() : DBTestBase("/db_write_test") {}
void Open() { DBTestBase::Reopen(GetOptions(GetParam())); }
Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
void Open() { DBTestBase::Reopen(GetOptions()); }
};
// It is invalid to do sync write while disabling WAL.
@ -77,6 +85,47 @@ TEST_P(DBWriteTest, ReturnSeuqneceNumberMultiThreaded) {
}
}
TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
constexpr int kNumThreads = 5;
std::unique_ptr<FaultInjectionTestEnv> mock_env(
new FaultInjectionTestEnv(Env::Default()));
Options options = GetOptions();
options.env = mock_env.get();
Reopen(options);
std::atomic<int> ready_count{0};
std::atomic<int> leader_count{0};
std::vector<port::Thread> threads;
mock_env->SetFilesystemActive(false);
// Wait until all threads linked to write threads, to make sure
// all threads join the same batch group.
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
ready_count++;
auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
if (w->state == WriteThread::STATE_GROUP_LEADER) {
leader_count++;
while (ready_count < kNumThreads) {
// busy waiting
}
}
});
SyncPoint::GetInstance()->EnableProcessing();
for (int i = 0; i < kNumThreads; i++) {
threads.push_back(port::Thread(
[&](int index) {
// All threads should fail.
ASSERT_FALSE(Put("key" + ToString(index), "value").ok());
},
i));
}
for (int i = 0; i < kNumThreads; i++) {
threads[i].join();
}
ASSERT_EQ(1, leader_count);
// Close before mock_env destruct.
Close();
}
INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
testing::Values(DBTestBase::kDefault,
DBTestBase::kConcurrentWALWrites,

View File

@ -164,7 +164,7 @@ Status ExternalSstFileIngestionJob::Run() {
// if the dont overlap with any ranges since we have snapshots
force_global_seqno = true;
}
// It is safe to use this instead of LastToBeWrittenSequence since we are
// It is safe to use this instead of LastAllocatedSequence since we are
// the only active writer, and hence they are equal
const SequenceNumber last_seqno = versions_->LastSequence();
SuperVersion* super_version = cfd_->GetSuperVersion();
@ -199,7 +199,7 @@ Status ExternalSstFileIngestionJob::Run() {
}
if (consumed_seqno) {
versions_->SetLastToBeWrittenSequence(last_seqno + 1);
versions_->SetLastAllocatedSequence(last_seqno + 1);
versions_->SetLastSequence(last_seqno + 1);
}

View File

@ -39,10 +39,10 @@
namespace rocksdb {
MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions,
ImmutableMemTableOptions::ImmutableMemTableOptions(
const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options)
: write_buffer_size(mutable_cf_options.write_buffer_size),
arena_block_size(mutable_cf_options.arena_block_size),
: arena_block_size(mutable_cf_options.arena_block_size),
memtable_prefix_bloom_bits(
static_cast<uint32_t>(
static_cast<double>(mutable_cf_options.write_buffer_size) *
@ -83,6 +83,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
data_size_(0),
num_entries_(0),
num_deletes_(0),
write_buffer_size_(mutable_cf_options.write_buffer_size),
flush_in_progress_(false),
flush_completed_(false),
file_number_(0),
@ -136,6 +137,7 @@ size_t MemTable::ApproximateMemoryUsage() {
}
bool MemTable::ShouldFlushNow() const {
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
// In a lot of times, we cannot allocate arena blocks that exactly matches the
// buffer size. Thus we have to decide if we should over-allocate or
// under-allocate.
@ -153,16 +155,14 @@ bool MemTable::ShouldFlushNow() const {
// if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize <
moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
return false;
}
// if user keeps adding entries that exceeds moptions.write_buffer_size,
// we need to flush earlier even though we still have much available
// memory left.
if (allocated_memory > moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
// if user keeps adding entries that exceeds write_buffer_size, we need to
// flush earlier even though we still have much available memory left.
if (allocated_memory >
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
return true;
}
@ -265,7 +265,8 @@ class MemTableIterator : public InternalIterator {
comparator_(mem.comparator_),
valid_(false),
arena_mode_(arena != nullptr),
value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) {
value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support) {
if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {

View File

@ -36,11 +36,9 @@ class MemTableIterator;
class MergeContext;
class InternalIterator;
struct MemTableOptions {
explicit MemTableOptions(
const ImmutableCFOptions& ioptions,
struct ImmutableMemTableOptions {
explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options);
size_t write_buffer_size;
size_t arena_block_size;
uint32_t memtable_prefix_bloom_bits;
size_t memtable_huge_page_size;
@ -262,6 +260,18 @@ class MemTable {
return num_deletes_.load(std::memory_order_relaxed);
}
// Dynamically change the memtable's capacity. If set below the current usage,
// the next key added will trigger a flush. Can only increase size when
// memtable prefix bloom is disabled, since we can't easily allocate more
// space.
void UpdateWriteBufferSize(size_t new_write_buffer_size) {
if (prefix_bloom_ == nullptr ||
new_write_buffer_size < write_buffer_size_) {
write_buffer_size_.store(new_write_buffer_size,
std::memory_order_relaxed);
}
}
// Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; }
@ -350,7 +360,9 @@ class MemTable {
return comparator_.comparator;
}
const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
return &moptions_;
}
uint64_t ApproximateOldestKeyTime() const {
return oldest_key_time_.load(std::memory_order_relaxed);
@ -364,7 +376,7 @@ class MemTable {
friend class MemTableList;
KeyComparator comparator_;
const MemTableOptions moptions_;
const ImmutableMemTableOptions moptions_;
int refs_;
const size_t kArenaBlockSize;
AllocTracker mem_tracker_;
@ -378,6 +390,9 @@ class MemTable {
std::atomic<uint64_t> num_entries_;
std::atomic<uint64_t> num_deletes_;
// Dynamically changeable memtable option
std::atomic<size_t> write_buffer_size_;
// These are used to manage memtable flushes to storage
bool flush_in_progress_; // started the flush
bool flush_completed_; // finished the flush
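The write_buffer_size_ atomic and UpdateWriteBufferSize() added above make the memtable capacity adjustable at runtime. A minimal sketch of how a user would exercise this, assuming the value is plumbed through the existing DB::SetOptions() mutable-option path (an assumption; that wiring is not shown in this diff):

#include <cassert>
#include <string>
#include <unordered_map>
#include "rocksdb/db.h"

// Hypothetical helper: shrink the active memtable budget to 32 MB.
// Per the comment on UpdateWriteBufferSize(), if the new size is below the
// current usage the next key added will trigger a flush, and increases only
// take effect when the memtable prefix bloom filter is disabled.
void ShrinkWriteBuffer(rocksdb::DB* db) {
  rocksdb::Status s = db->SetOptions(db->DefaultColumnFamily(),
                                     {{"write_buffer_size", "33554432"}});
  assert(s.ok());
}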

View File

@ -546,7 +546,7 @@ class Repairer {
max_sequence = tables_[i].max_sequence;
}
}
vset_.SetLastToBeWrittenSequence(max_sequence);
vset_.SetLastAllocatedSequence(max_sequence);
vset_.SetLastSequence(max_sequence);
for (const auto& cf_id_and_tables : cf_id_to_tables) {

View File

@ -108,6 +108,22 @@ class SnapshotList {
return ret;
}
// Whether there is an active snapshot in range [lower_bound, upper_bound).
bool HasSnapshotInRange(SequenceNumber lower_bound,
SequenceNumber upper_bound) {
if (empty()) {
return false;
}
const SnapshotImpl* s = &list_;
while (s->next_ != &list_) {
if (s->next_->number_ >= lower_bound) {
return s->next_->number_ < upper_bound;
}
s = s->next_;
}
return false;
}
// get the sequence number of the most recent snapshot
SequenceNumber GetNewest() {
if (empty()) {
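To make the [lower_bound, upper_bound) semantics of HasSnapshotInRange() above concrete, here is an illustrative stand-alone version over a plain ordered set of snapshot sequence numbers; this is a sketch only, not the internal SnapshotList implementation:

#include <cstdint>
#include <set>

// Returns true iff some snapshot sequence number falls in [lo, hi).
bool HasSnapshotInRange(const std::set<uint64_t>& snapshots, uint64_t lo,
                        uint64_t hi) {
  auto it = snapshots.lower_bound(lo);  // first snapshot >= lo
  return it != snapshots.end() && *it < hi;
}

// Example: with snapshots {5, 12}, [6, 10) -> false and [10, 13) -> true.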

View File

@ -19,7 +19,8 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
const std::string& dir, const ImmutableDBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seq,
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions)
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
const bool seq_per_batch)
: dir_(dir),
options_(options),
read_options_(read_options),
@ -31,7 +32,8 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
currentFileIndex_(0),
currentBatchSeq_(0),
currentLastSeq_(0),
versions_(versions) {
versions_(versions),
seq_per_batch_(seq_per_batch) {
assert(files_ != nullptr);
assert(versions_ != nullptr);
@ -241,12 +243,12 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
}
startingSequenceNumber_ = expectedSeq;
// currentStatus_ will be set to Ok if reseek succeeds
// Note: this is still ok in seq_pre_batch_ && concurrent_preparep_ mode
// Note: this is still ok in seq_per_batch_ && two_write_queues_ mode
// that allows gaps in the WAL since it will still skip over the gap.
currentStatus_ = Status::NotFound("Gap in sequence numbers");
// In seq_per_batch mode, gaps in the seq are possible so the strict mode
// In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
// should be disabled
return SeekToStartSequence(currentFileIndex_, !options_->seq_per_batch);
return SeekToStartSequence(currentFileIndex_, !seq_per_batch_);
}
struct BatchCounter : public WriteBatch::Handler {
@ -284,7 +286,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
};
currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
if (options_->seq_per_batch) {
if (seq_per_batch_) {
BatchCounter counter(currentBatchSeq_);
batch->Iterate(&counter);
currentLastSeq_ = counter.sequence_;

View File

@ -62,7 +62,8 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
const std::string& dir, const ImmutableDBOptions* options,
const TransactionLogIterator::ReadOptions& read_options,
const EnvOptions& soptions, const SequenceNumber seqNum,
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
const bool seq_per_batch);
virtual bool Valid() override;
@ -103,7 +104,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
// Used only to get latest seq. num
// TODO(icanadi) can this be just a callback?
VersionSet const* const versions_;
const bool seq_per_batch_;
// Reads from transaction log only if the writebatch record has been written
bool RestrictedRead(Slice* record, std::string* scratch);
// Seeks to startingSequenceNumber reading from startFileIndex in files_.

View File

@ -1851,27 +1851,33 @@ void VersionStorageInfo::GetOverlappingInputs(
void VersionStorageInfo::GetCleanInputsWithinInterval(
int level, const InternalKey* begin, const InternalKey* end,
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
if (level >= num_non_empty_levels_) {
// this level is empty, no inputs within range
return;
}
inputs->clear();
Slice user_begin, user_end;
if (begin != nullptr) {
user_begin = begin->user_key();
}
if (end != nullptr) {
user_end = end->user_key();
}
if (file_index) {
*file_index = -1;
}
if (begin != nullptr && end != nullptr && level > 0) {
if (level >= num_non_empty_levels_ || level == 0 ||
level_files_brief_[level].num_files == 0) {
// this level is empty, no inputs within range
// also don't support clean input interval within L0
return;
}
Slice user_begin, user_end;
const auto& level_files = level_files_brief_[level];
if (begin == nullptr) {
user_begin = ExtractUserKey(level_files.files[0].smallest_key);
} else {
user_begin = begin->user_key();
}
if (end == nullptr) {
user_end = ExtractUserKey(
level_files.files[level_files.num_files - 1].largest_key);
} else {
user_end = end->user_key();
}
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
hint_index, file_index,
true /* within_interval */);
}
}
// Store in "*inputs" all files in "level" that overlap [begin,end]
@ -1934,8 +1940,8 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
} else {
ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
&start_index, &end_index);
}
assert(end_index >= start_index);
}
// insert overlapping files into vector
for (int i = start_index; i <= end_index; i++) {
inputs->push_back(files_[level][i]);
@ -2414,7 +2420,7 @@ VersionSet::VersionSet(const std::string& dbname,
manifest_file_number_(0), // Filled by Recover()
pending_manifest_file_number_(0),
last_sequence_(0),
last_to_be_written_sequence_(0),
last_allocated_sequence_(0),
prev_log_number_(0),
current_version_number_(0),
manifest_file_size_(0),
@ -2754,9 +2760,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
// updated the last_sequence_ yet. It is also possible that the log is
// expecting some new data that is not written yet. Since LastSequence is an
// upper bound on the sequence, it is ok to record
// last_to_be_written_sequence_ as the last sequence.
edit->SetLastSequence(db_options_->concurrent_prepare
? last_to_be_written_sequence_
// last_allocated_sequence_ as the last sequence.
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
: last_sequence_);
if (edit->is_column_family_drop_) {
// if we drop column family, we have to make sure to save max column family,
@ -2784,9 +2789,8 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
// updated the last_sequence_ yet. It is also possible that the log is
// expecting some new data that is not written yet. Since LastSequence is an
// upper bound on the sequence, it is ok to record
// last_to_be_written_sequence_ as the last sequence.
edit->SetLastSequence(db_options_->concurrent_prepare
? last_to_be_written_sequence_
// last_allocated_sequence_ as the last sequence.
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
: last_sequence_);
builder->Apply(edit);
@ -3077,7 +3081,7 @@ Status VersionSet::Recover(
manifest_file_size_ = current_manifest_file_size;
next_file_number_.store(next_file + 1);
last_to_be_written_sequence_ = last_sequence;
last_allocated_sequence_ = last_sequence;
last_sequence_ = last_sequence;
prev_log_number_ = previous_log_number;
@ -3448,7 +3452,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
}
next_file_number_.store(next_file + 1);
last_to_be_written_sequence_ = last_sequence;
last_allocated_sequence_ = last_sequence;
last_sequence_ = last_sequence;
prev_log_number_ = previous_log_number;

View File

@ -765,28 +765,27 @@ class VersionSet {
}
// Note: memory_order_acquire must be sufficient.
uint64_t LastToBeWrittenSequence() const {
return last_to_be_written_sequence_.load(std::memory_order_seq_cst);
uint64_t LastAllocatedSequence() const {
return last_allocated_sequence_.load(std::memory_order_seq_cst);
}
// Set the last sequence number to s.
void SetLastSequence(uint64_t s) {
assert(s >= last_sequence_);
// Last visible sequence must never exceed the last allocated sequence
assert(!db_options_->concurrent_prepare ||
s <= last_to_be_written_sequence_);
assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
last_sequence_.store(s, std::memory_order_release);
}
// Note: memory_order_release must be sufficient
void SetLastToBeWrittenSequence(uint64_t s) {
assert(s >= last_to_be_written_sequence_);
last_to_be_written_sequence_.store(s, std::memory_order_seq_cst);
void SetLastAllocatedSequence(uint64_t s) {
assert(s >= last_allocated_sequence_);
last_allocated_sequence_.store(s, std::memory_order_seq_cst);
}
// Note: memory_order_release must be sufficient
uint64_t FetchAddLastToBeWrittenSequence(uint64_t s) {
return last_to_be_written_sequence_.fetch_add(s, std::memory_order_seq_cst);
uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
}
// Mark the specified file number as used.
@ -894,8 +893,9 @@ class VersionSet {
uint64_t pending_manifest_file_number_;
// The last seq visible to reads
std::atomic<uint64_t> last_sequence_;
// The last seq with which a writer has written/will write.
std::atomic<uint64_t> last_to_be_written_sequence_;
// The last seq that has been allocated. It might or might not yet have
// appeared in the memtable.
std::atomic<uint64_t> last_allocated_sequence_;
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
// Opened lazily

View File

@ -115,7 +115,7 @@ Status WalManager::GetUpdatesSince(
}
iter->reset(new TransactionLogIteratorImpl(
db_options_.wal_dir, &db_options_, read_options, env_options_, seq,
std::move(wal_files), version_set));
std::move(wal_files), version_set, seq_per_batch_));
return (*iter)->status();
}

View File

@ -31,11 +31,12 @@ namespace rocksdb {
class WalManager {
public:
WalManager(const ImmutableDBOptions& db_options,
const EnvOptions& env_options)
const EnvOptions& env_options, const bool seq_per_batch = false)
: db_options_(db_options),
env_options_(env_options),
env_(db_options.env),
purge_wal_files_last_run_(0) {}
purge_wal_files_last_run_(0),
seq_per_batch_(seq_per_batch) {}
Status GetSortedWalFiles(VectorLogPtr& files);
@ -86,6 +87,8 @@ class WalManager {
// last time when PurgeObsoleteWALFiles ran.
uint64_t purge_wal_files_last_run_;
bool seq_per_batch_;
// obsolete files will be deleted every this seconds if ttl deletion is
// enabled and archive size_limit is disabled.
static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;

View File

@ -67,7 +67,7 @@ class WalManagerTest : public testing::Test {
batch.Put(key, value);
WriteBatchInternal::SetSequence(&batch, seq);
current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch));
versions_->SetLastToBeWrittenSequence(seq);
versions_->SetLastAllocatedSequence(seq);
versions_->SetLastSequence(seq);
}

View File

@ -1035,7 +1035,7 @@ class MemTableInserter : public WriteBatch::Handler {
}
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
auto* moptions = mem->GetImmutableMemTableOptions();
if (!moptions->inplace_update_support) {
mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
get_post_process_info(mem));
@ -1196,7 +1196,7 @@ class MemTableInserter : public WriteBatch::Handler {
}
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
auto* moptions = mem->GetImmutableMemTableOptions();
bool perform_merge = false;
// If we pass DB through and options.max_successive_merges is hit

View File

@ -136,9 +136,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
options.create_if_missing = true;
options.allow_concurrent_memtable_write = allow_parallel;
options.enable_pipelined_write = enable_pipelined_write;
options.concurrent_prepare = two_queues;
if (options.enable_pipelined_write &&
options.concurrent_prepare) {
options.two_write_queues = two_queues;
if (options.enable_pipelined_write && options.two_write_queues) {
// This combination is not supported
continue;
}

View File

@ -533,6 +533,11 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
Writer* last_writer = write_group.last_writer;
assert(leader->link_older == nullptr);
// Propagate memtable write error to the whole group.
if (status.ok() && !write_group.status.ok()) {
status = write_group.status;
}
if (enable_pipelined_write_) {
// Notify writers don't write to memtable to exit.
for (Writer* w = last_writer; w != leader;) {

View File

@ -36,6 +36,7 @@ class CompactionFilter {
enum ValueType {
kValue,
kMergeOperand,
kBlobIndex, // used internally by BlobDB.
};
enum class Decision {
@ -171,6 +172,8 @@ class CompactionFilter {
bool rv = FilterMergeOperand(level, key, existing_value);
return rv ? Decision::kRemove : Decision::kKeep;
}
case ValueType::kBlobIndex:
return Decision::kKeep;
}
assert(false);
return Decision::kKeep;

View File

@ -325,7 +325,8 @@ void CancelAllBackgroundWork(DB* db, bool wait = false);
// Delete files which are entirely in the given range
// Could leave some keys in the range which are in files which are not
// entirely in the range.
// entirely in the range. Also leaves L0 files regardless of whether they're
// in the range.
// Snapshots before the delete might not see the data in the given range.
Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end);
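Given the clarified contract above, a minimal usage sketch (key names are illustrative):

#include <cassert>
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

void DropRange(rocksdb::DB* db) {
  rocksdb::Slice begin("user_key_000000");
  rocksdb::Slice end("user_key_999999");
  // Only files lying entirely inside [begin, end] are removed; boundary
  // files and L0 files keep their keys until normal compaction runs.
  rocksdb::Status s =
      rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(), &begin, &end);
  assert(s.ok());
}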

View File

@ -905,22 +905,12 @@ struct DBOptions {
// allows the memtable writes not to lag behind other writes. It can be used
// to optimize MySQL 2PC in which only the commits, which are serial, write to
// memtable.
bool concurrent_prepare = false;
bool two_write_queues = false;
// If true WAL is not flushed automatically after each write. Instead it
// relies on manual invocation of FlushWAL to write the WAL buffer to its
// file.
bool manual_wal_flush = false;
// Increase the sequence number after writing each batch, whether memtable is
// disabled for that or not. Otherwise the sequence number is increased after
// writing each key into memtable. This implies that when memtable_disable is
// set, the seq is not increased at all.
//
// Default: false
// Note: This option is experimental and meant to be used only for internal
// projects.
bool seq_per_batch = false;
};
// Options to control the behavior of a database (passed to DB::Open)
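A minimal configuration sketch for the renamed option (illustrative values, not part of the diff); note that two_write_queues still cannot be combined with enable_pipelined_write, as the write_callback test change earlier in this diff also reflects:

#include "rocksdb/options.h"

rocksdb::Options MakeTwoQueueOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.two_write_queues = true;         // was concurrent_prepare
  options.enable_pipelined_write = false;  // unsupported together with the above
  options.manual_wal_flush = false;        // keep automatic WAL flushing
  return options;
}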

View File

@ -131,6 +131,11 @@ struct PerfContext {
// total number of SST table bloom misses
uint64_t bloom_sst_miss_count;
// Time spent waiting on key locks in transaction lock manager.
uint64_t key_lock_wait_time;
// number of times acquiring a lock was blocked by another transaction.
uint64_t key_lock_wait_count;
// Total time spent in Env filesystem operations. These are only populated
// when TimedEnv is used.
uint64_t env_new_sequential_file_nanos;
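A sketch of reading the two new counters, assuming the get_perf_context() accessor and SetPerfLevel() available in releases of this vintage:

#include <iostream>
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void ReportLockWaits() {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  // ... run a transactional workload here ...
  const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  std::cout << "key lock waits: " << ctx->key_lock_wait_count
            << ", time spent waiting: " << ctx->key_lock_wait_time << "\n";
}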

View File

@ -133,27 +133,6 @@ class PinnableSlice : public Slice, public Cleanable {
PinnableSlice(PinnableSlice&) = delete;
PinnableSlice& operator=(PinnableSlice&) = delete;
PinnableSlice(PinnableSlice&& other) { *this = std::move(other); }
PinnableSlice& operator=(PinnableSlice&& other) {
if (this != &other) {
// cleanup itself.
Reset();
Slice::operator=(other);
Cleanable::operator=(std::move(other));
pinned_ = other.pinned_;
if (!pinned_ && other.buf_ == &other.self_space_) {
self_space_ = std::move(other.self_space_);
buf_ = &self_space_;
data_ = buf_->data();
} else {
buf_ = other.buf_;
}
}
return *this;
}
inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
void* arg2) {
assert(!pinned_);

View File

@ -223,6 +223,76 @@ enum Tickers : uint32_t {
// Number of refill intervals where rate limiter's bytes are fully consumed.
NUMBER_RATE_LIMITER_DRAINS,
// Number of internal keys skipped by Iterator
NUMBER_ITER_SKIP,
// BlobDB specific stats
// # of Put/PutTTL/PutUntil to BlobDB.
BLOB_DB_NUM_PUT,
// # of Write to BlobDB.
BLOB_DB_NUM_WRITE,
// # of Get to BlobDB.
BLOB_DB_NUM_GET,
// # of MultiGet to BlobDB.
BLOB_DB_NUM_MULTIGET,
// # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
BLOB_DB_NUM_SEEK,
// # of Next to BlobDB iterator.
BLOB_DB_NUM_NEXT,
// # of Prev to BlobDB iterator.
BLOB_DB_NUM_PREV,
// # of keys written to BlobDB.
BLOB_DB_NUM_KEYS_WRITTEN,
// # of keys read from BlobDB.
BLOB_DB_NUM_KEYS_READ,
// # of bytes (key + value) written to BlobDB.
BLOB_DB_BYTES_WRITTEN,
// # of bytes (keys + value) read from BlobDB.
BLOB_DB_BYTES_READ,
// # of keys written by BlobDB as non-TTL inlined value.
BLOB_DB_WRITE_INLINED,
// # of keys written by BlobDB as TTL inlined value.
BLOB_DB_WRITE_INLINED_TTL,
// # of keys written by BlobDB as non-TTL blob value.
BLOB_DB_WRITE_BLOB,
// # of keys written by BlobDB as TTL blob value.
BLOB_DB_WRITE_BLOB_TTL,
// # of bytes written to blob file.
BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
// # of bytes read from blob file.
BLOB_DB_BLOB_FILE_BYTES_READ,
// # of times a blob file is synced.
BLOB_DB_BLOB_FILE_SYNCED,
// # of blob indexes evicted from the base DB by the BlobDB compaction
// filter because of expiration.
BLOB_DB_BLOB_INDEX_EXPIRED,
// # of blob files being garbage collected.
BLOB_DB_GC_NUM_FILES,
// # of blob files generated by garbage collection.
BLOB_DB_GC_NUM_NEW_FILES,
// # of BlobDB garbage collection failures.
BLOB_DB_GC_FAILURES,
// # of keys dropped by BlobDB garbage collection because they had been
// overwritten.
BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
// # of keys dropped by BlobDB garbage collection because of expiration.
BLOB_DB_GC_NUM_KEYS_EXPIRED,
// # of keys relocated to new blob file by garbage collection.
BLOB_DB_GC_NUM_KEYS_RELOCATED,
// # of bytes dropped by BlobDB garbage collection because they had been
// overwritten.
BLOB_DB_GC_BYTES_OVERWRITTEN,
// # of bytes dropped by BlobDB garbage collection because of expiration.
BLOB_DB_GC_BYTES_EXPIRED,
// # of bytes relocated to new blob file by garbage collection.
BLOB_DB_GC_BYTES_RELOCATED,
// # of blob files evicted because BlobDB is full.
BLOB_DB_FIFO_NUM_FILES_EVICTED,
// # of keys in the blob files evicted because BlobDB is full.
BLOB_DB_FIFO_NUM_KEYS_EVICTED,
// # of bytes in the blob files evicted because BlobDB is full.
BLOB_DB_FIFO_BYTES_EVICTED,
TICKER_ENUM_MAX
};
@ -328,6 +398,38 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
{NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
{NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
{BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
{BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
{BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
{BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
{BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
{BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
{BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
{BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
{BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
{BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
{BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
{BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
{BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
{BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
{BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file,bytes.read"},
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
{BLOB_DB_BLOB_INDEX_EXPIRED, "rocksdb.blobdb.blob.index.expired"},
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
{BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"},
{BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"},
{BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
{BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"},
{BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"},
{BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
};
/**
@ -379,6 +481,36 @@ enum Histograms : uint32_t {
// requests.
READ_NUM_MERGE_OPERANDS,
// BlobDB specific stats
// Size of keys written to BlobDB.
BLOB_DB_KEY_SIZE,
// Size of values written to BlobDB.
BLOB_DB_VALUE_SIZE,
// BlobDB Put/PutWithTTL/PutUntil/Write latency.
BLOB_DB_WRITE_MICROS,
// BlobDB Get latency.
BLOB_DB_GET_MICROS,
// BlobDB MultiGet latency.
BLOB_DB_MULTIGET_MICROS,
// BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
BLOB_DB_SEEK_MICROS,
// BlobDB Next latency.
BLOB_DB_NEXT_MICROS,
// BlobDB Prev latency.
BLOB_DB_PREV_MICROS,
// Blob file write latency.
BLOB_DB_BLOB_FILE_WRITE_MICROS,
// Blob file read latency.
BLOB_DB_BLOB_FILE_READ_MICROS,
// Blob file sync latency.
BLOB_DB_BLOB_FILE_SYNC_MICROS,
// BlobDB garbage collection time.
BLOB_DB_GC_MICROS,
// BlobDB compression time.
BLOB_DB_COMPRESSION_MICROS,
// BlobDB decompression time.
BLOB_DB_DECOMPRESSION_MICROS,
HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match
};
@ -414,6 +546,20 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
{DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
{READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
{BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
{BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
{BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
{BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
{BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
{BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
{BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
{BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
{BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
{BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
{BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
{BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"},
{BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
{BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
};
struct HistogramData {
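As a usage note for the BlobDB tickers and histograms added above: a sketch of reading them through the regular Statistics interface, assuming statistics were attached via Options::statistics (e.g. rocksdb::CreateDBStatistics()):

#include <iostream>
#include <memory>
#include "rocksdb/statistics.h"

void DumpBlobDbStats(const std::shared_ptr<rocksdb::Statistics>& stats) {
  std::cout << "blob puts: "
            << stats->getTickerCount(rocksdb::BLOB_DB_NUM_PUT) << "\n"
            << "blob file bytes written: "
            << stats->getTickerCount(rocksdb::BLOB_DB_BLOB_FILE_BYTES_WRITTEN)
            << "\n";
  rocksdb::HistogramData write_latency;
  stats->histogramData(rocksdb::BLOB_DB_WRITE_MICROS, &write_latency);
  std::cout << "blob write p99 (us): " << write_latency.percentile99 << "\n";
}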

View File

@ -6,7 +6,7 @@
#define ROCKSDB_MAJOR 5
#define ROCKSDB_MINOR 9
#define ROCKSDB_PATCH 0
#define ROCKSDB_PATCH 1
// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these

View File

@ -79,6 +79,8 @@ void PerfContext::Reset() {
bloom_memtable_miss_count = 0;
bloom_sst_hit_count = 0;
bloom_sst_miss_count = 0;
key_lock_wait_time = 0;
key_lock_wait_count = 0;
env_new_sequential_file_nanos = 0;
env_new_random_access_file_nanos = 0;
@ -158,6 +160,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const {
PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count);
PERF_CONTEXT_OUTPUT(bloom_sst_hit_count);
PERF_CONTEXT_OUTPUT(bloom_sst_miss_count);
PERF_CONTEXT_OUTPUT(key_lock_wait_time);
PERF_CONTEXT_OUTPUT(key_lock_wait_count);
PERF_CONTEXT_OUTPUT(env_new_sequential_file_nanos);
PERF_CONTEXT_OUTPUT(env_new_random_access_file_nanos);
PERF_CONTEXT_OUTPUT(env_new_writable_file_nanos);

View File

@ -85,9 +85,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
avoid_flush_during_recovery(options.avoid_flush_during_recovery),
allow_ingest_behind(options.allow_ingest_behind),
preserve_deletes(options.preserve_deletes),
concurrent_prepare(options.concurrent_prepare),
manual_wal_flush(options.manual_wal_flush),
seq_per_batch(options.seq_per_batch) {
two_write_queues(options.two_write_queues),
manual_wal_flush(options.manual_wal_flush) {
}
void ImmutableDBOptions::Dump(Logger* log) const {
@ -217,11 +216,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
allow_ingest_behind);
ROCKS_LOG_HEADER(log, " Options.preserve_deletes: %d",
preserve_deletes);
ROCKS_LOG_HEADER(log, " Options.concurrent_prepare: %d",
concurrent_prepare);
ROCKS_LOG_HEADER(log, " Options.two_write_queues: %d",
two_write_queues);
ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d",
manual_wal_flush);
ROCKS_LOG_HEADER(log, " Options.seq_per_batch: %d", seq_per_batch);
}
MutableDBOptions::MutableDBOptions()

View File

@ -77,9 +77,8 @@ struct ImmutableDBOptions {
bool avoid_flush_during_recovery;
bool allow_ingest_behind;
bool preserve_deletes;
bool concurrent_prepare;
bool two_write_queues;
bool manual_wal_flush;
bool seq_per_batch;
};
struct MutableDBOptions {

View File

@ -360,18 +360,18 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{offsetof(struct DBOptions, preserve_deletes), OptionType::kBoolean,
OptionVerificationType::kNormal, false,
offsetof(struct ImmutableDBOptions, preserve_deletes)}},
{"concurrent_prepare",
{offsetof(struct DBOptions, concurrent_prepare), OptionType::kBoolean,
{"concurrent_prepare", // Deprecated by two_write_queues
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}},
{"two_write_queues",
{offsetof(struct DBOptions, two_write_queues), OptionType::kBoolean,
OptionVerificationType::kNormal, false,
offsetof(struct ImmutableDBOptions, concurrent_prepare)}},
offsetof(struct ImmutableDBOptions, two_write_queues)}},
{"manual_wal_flush",
{offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean,
OptionVerificationType::kNormal, false,
offsetof(struct ImmutableDBOptions, manual_wal_flush)}},
{"seq_per_batch",
{offsetof(struct DBOptions, seq_per_batch), OptionType::kBoolean,
OptionVerificationType::kNormal, false,
offsetof(struct ImmutableDBOptions, seq_per_batch)}}};
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}};
// offset_of is used to get the offset of a class data member
// ex: offset_of(&ColumnFamilyOptions::num_levels)

View File

@ -284,6 +284,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"allow_ingest_behind=false;"
"preserve_deletes=false;"
"concurrent_prepare=false;"
"two_write_queues=false;"
"manual_wal_flush=false;"
"seq_per_batch=false;",
new_options));

15
src.mk
View File

@ -15,13 +15,13 @@ LIB_SOURCES = \
db/convenience.cc \
db/db_filesnapshot.cc \
db/db_impl.cc \
db/db_impl_write.cc \
db/db_impl_compaction_flush.cc \
db/db_impl_files.cc \
db/db_impl_open.cc \
db/db_impl_debug.cc \
db/db_impl_experimental.cc \
db/db_impl_files.cc \
db/db_impl_open.cc \
db/db_impl_readonly.cc \
db/db_impl_write.cc \
db/db_info_dumper.cc \
db/db_iter.cc \
db/dbformat.cc \
@ -155,9 +155,9 @@ LIB_SOURCES = \
utilities/blob_db/blob_db.cc \
utilities/blob_db/blob_db_impl.cc \
utilities/blob_db/blob_file.cc \
utilities/blob_db/blob_log_format.cc \
utilities/blob_db/blob_log_reader.cc \
utilities/blob_db/blob_log_writer.cc \
utilities/blob_db/blob_log_format.cc \
utilities/blob_db/ttl_extractor.cc \
utilities/cassandra/cassandra_compaction_filter.cc \
utilities/cassandra/format.cc \
@ -192,8 +192,8 @@ LIB_SOURCES = \
utilities/simulator_cache/sim_cache.cc \
utilities/spatialdb/spatial_db.cc \
utilities/table_properties_collectors/compact_on_deletion_collector.cc \
utilities/transactions/optimistic_transaction_db_impl.cc \
utilities/transactions/optimistic_transaction.cc \
utilities/transactions/optimistic_transaction_db_impl.cc \
utilities/transactions/pessimistic_transaction.cc \
utilities/transactions/pessimistic_transaction_db.cc \
utilities/transactions/snapshot_checker.cc \
@ -231,14 +231,14 @@ BENCH_LIB_SOURCES = \
tools/db_bench_tool.cc \
EXP_LIB_SOURCES = \
utilities/col_buf_encoder.cc \
utilities/col_buf_decoder.cc \
utilities/col_buf_encoder.cc \
utilities/column_aware_encoding_util.cc
TEST_LIB_SOURCES = \
db/db_test_util.cc \
util/testharness.cc \
util/testutil.cc \
db/db_test_util.cc \
utilities/cassandra/test_utils.cc \
MAIN_SOURCES = \
@ -338,7 +338,6 @@ MAIN_SOURCES = \
util/filelock_test.cc \
util/log_write_bench.cc \
util/rate_limiter_test.cc \
util/slice_test.cc \
util/slice_transform_test.cc \
util/timer_queue_test.cc \
util/thread_list_test.cc \

View File

@ -78,8 +78,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
// as partition size.
assert(table_opt.block_size_deviation <= 100);
auto partition_size = static_cast<uint32_t>(
table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation));
((table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation)) + 99) / 100);
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
return new PartitionedFilterBlockBuilder(
opt.prefix_extractor, table_opt.whole_key_filtering,
@ -296,7 +296,7 @@ struct BlockBasedTableBuilder::Rep {
file(f),
data_block(table_options.block_restart_interval,
table_options.use_delta_encoding),
range_del_block(port::kMaxInt32),
range_del_block(1), // TODO(andrewkr): restart_interval unnecessary
internal_prefix_transform(_ioptions.prefix_extractor),
compression_type(_compression_type),
compression_opts(_compression_opts),
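As a note on the partition-size fix in the first hunk of this file: the computation now divides by 100 and rounds up instead of leaving the percentage unscaled. A worked example with illustrative values:

#include <algorithm>
#include <cstdint>

// metadata_block_size = 4096, block_size_deviation = 10:
//   old: 4096 * (100 - 10)              = 368640  (the division by 100 was missing)
//   new: (4096 * (100 - 10) + 99) / 100 = 3687    (ceiling of 3686.4)
uint32_t PartitionSize(uint64_t metadata_block_size,
                       uint64_t block_size_deviation) {
  auto size = static_cast<uint32_t>(
      (metadata_block_size * (100 - block_size_deviation) + 99) / 100);
  return std::max(size, static_cast<uint32_t>(1));
}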

View File

@ -21,8 +21,7 @@
namespace rocksdb {
MetaIndexBuilder::MetaIndexBuilder()
: meta_index_block_(
new BlockBuilder(port::kMaxInt32 /* restart interval */)) {}
: meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
void MetaIndexBuilder::Add(const std::string& key,
const BlockHandle& handle) {
@ -39,8 +38,7 @@ Slice MetaIndexBuilder::Finish() {
}
PropertyBlockBuilder::PropertyBlockBuilder()
: properties_block_(
new BlockBuilder(port::kMaxInt32 /* restart interval */)) {}
: properties_block_(new BlockBuilder(1 /* restart interval */)) {}
void PropertyBlockBuilder::Add(const std::string& name,
const std::string& val) {

View File

@ -75,7 +75,8 @@ class PartitionedFilterBlockTest : public testing::Test {
auto partition_size =
filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2);
delete filter_bits_reader;
return partition_size + table_options_.block_size_deviation;
return partition_size +
partition_size * table_options_.block_size_deviation / 100;
}
int last_offset = 10;
@ -94,8 +95,10 @@ class PartitionedFilterBlockTest : public testing::Test {
PartitionedIndexBuilder* const p_index_builder) {
assert(table_options_.block_size_deviation <= 100);
auto partition_size = static_cast<uint32_t>(
table_options_.metadata_block_size *
( 100 - table_options_.block_size_deviation));
((table_options_.metadata_block_size *
(100 - table_options_.block_size_deviation)) +
99) /
100);
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
return new PartitionedFilterBlockBuilder(
nullptr, table_options_.whole_key_filtering,

View File

@ -148,6 +148,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
Status s;
if (DeleteScheduler::IsTrashFile(file_path)) {
// This is already a trash file
*trash_file = file_path;
return s;
}

View File

@ -1,70 +0,0 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "port/stack_trace.h"
#include "rocksdb/slice.h"
#include "util/testharness.h"
namespace rocksdb {
class SliceTest : public testing::Test {};
namespace {
void BumpCounter(void* arg1, void* arg2) {
(*reinterpret_cast<int*>(arg1))++;
}
} // anonymous namespace
TEST_F(SliceTest, PinnableSliceMoveConstruct) {
for (int i = 0; i < 3; i++) {
int orig_cleanup = 0;
int moved_cleanup = 0;
PinnableSlice* s1 = nullptr;
std::string external_storage;
switch (i) {
case 0:
s1 = new PinnableSlice();
*(s1->GetSelf()) = "foo";
s1->PinSelf();
s1->RegisterCleanup(BumpCounter, &moved_cleanup, nullptr);
break;
case 1:
s1 = new PinnableSlice(&external_storage);
*(s1->GetSelf()) = "foo";
s1->PinSelf();
s1->RegisterCleanup(BumpCounter, &moved_cleanup, nullptr);
break;
case 2:
s1 = new PinnableSlice();
s1->PinSlice("foo", BumpCounter, &moved_cleanup, nullptr);
break;
}
ASSERT_EQ("foo", s1->ToString());
PinnableSlice* s2 = new PinnableSlice();
s2->PinSelf("bar");
ASSERT_EQ("bar", s2->ToString());
s2->RegisterCleanup(BumpCounter, &orig_cleanup, nullptr);
*s2 = std::move(*s1);
ASSERT_EQ("foo", s2->ToString());
ASSERT_EQ(1, orig_cleanup);
ASSERT_EQ(0, moved_cleanup);
delete s1;
// ASAN will check if it will access storage of s1, which is deleted.
ASSERT_EQ("foo", s2->ToString());
ASSERT_EQ(1, orig_cleanup);
ASSERT_EQ(0, moved_cleanup);
delete s2;
ASSERT_EQ(1, orig_cleanup);
ASSERT_EQ(1, moved_cleanup);
}
}
} // namespace rocksdb
int main(int argc, char** argv) {
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View File

@ -0,0 +1,90 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#ifndef ROCKSDB_LITE
#include "monitoring/statistics.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "utilities/blob_db/blob_index.h"
namespace rocksdb {
namespace blob_db {
// CompactionFilter to delete expired blob index from base DB.
class BlobIndexCompactionFilter : public CompactionFilter {
public:
BlobIndexCompactionFilter(uint64_t current_time, Statistics* statistics)
: current_time_(current_time), statistics_(statistics) {}
virtual ~BlobIndexCompactionFilter() {
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED, expired_count_);
}
virtual const char* Name() const override {
return "BlobIndexCompactionFilter";
}
// Filter expired blob indexes regardless of snapshots.
virtual bool IgnoreSnapshots() const override { return true; }
virtual Decision FilterV2(int /*level*/, const Slice& /*key*/,
ValueType value_type, const Slice& value,
std::string* /*new_value*/,
std::string* /*skip_until*/) const override {
if (value_type != kBlobIndex) {
return Decision::kKeep;
}
BlobIndex blob_index;
Status s = blob_index.DecodeFrom(value);
if (!s.ok()) {
// Unable to decode blob index. Keeping the value.
return Decision::kKeep;
}
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
// Expired
expired_count_++;
return Decision::kRemove;
}
return Decision::kKeep;
}
private:
const uint64_t current_time_;
Statistics* statistics_;
// It is safe not to use std::atomic since the compaction filter, created
// from a compaction filter factory, will not be called from multiple threads.
mutable uint64_t expired_count_ = 0;
};
class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
public:
BlobIndexCompactionFilterFactory(Env* env, Statistics* statistics)
: env_(env), statistics_(statistics) {}
virtual const char* Name() const override {
return "BlobIndexCompactionFilterFactory";
}
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& /*context*/) override {
int64_t current_time = 0;
Status s = env_->GetCurrentTime(&current_time);
if (!s.ok()) {
return nullptr;
}
assert(current_time >= 0);
return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
static_cast<uint64_t>(current_time), statistics_));
}
private:
Env* env_;
Statistics* statistics_;
};
} // namespace blob_db
} // namespace rocksdb
#endif // ROCKSDB_LITE

View File

@ -26,6 +26,7 @@
#include "table/block_builder.h"
#include "util/file_reader_writer.h"
#include "util/filename.h"
#include "utilities/blob_db/blob_compaction_filter.h"
#include "utilities/blob_db/blob_db_impl.h"
namespace rocksdb {
@ -45,6 +46,11 @@ Status BlobDB::OpenAndLoad(const Options& options,
const BlobDBOptions& bdb_options,
const std::string& dbname, BlobDB** blob_db,
Options* changed_options) {
if (options.compaction_filter != nullptr ||
options.compaction_filter_factory != nullptr) {
return Status::NotSupported("Blob DB doesn't support compaction filter.");
}
*changed_options = options;
*blob_db = nullptr;
@ -57,12 +63,19 @@ Status BlobDB::OpenAndLoad(const Options& options,
{
MutexLock l(&listener_mutex);
all_blobdb_listeners.push_back(fblistener);
if (bdb_options.enable_garbage_collection) {
all_blobdb_listeners.push_back(ce_listener);
}
all_wal_filters.push_back(rw_filter);
}
changed_options->compaction_filter_factory.reset(
new BlobIndexCompactionFilterFactory(options.env,
options.statistics.get()));
changed_options->listeners.emplace_back(fblistener);
if (bdb_options.enable_garbage_collection) {
changed_options->listeners.emplace_back(ce_listener);
}
changed_options->wal_filter = rw_filter.get();
DBOptions db_options(*changed_options);
@ -71,7 +84,9 @@ Status BlobDB::OpenAndLoad(const Options& options,
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
fblistener->SetImplPtr(bdb);
if (bdb_options.enable_garbage_collection) {
ce_listener->SetImplPtr(bdb);
}
rw_filter->SetImplPtr(bdb);
Status s = bdb->OpenPhase1();
@ -106,6 +121,11 @@ Status BlobDB::Open(const DBOptions& db_options_input,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, BlobDB** blob_db,
bool no_base_db) {
if (column_families.size() != 1 ||
column_families[0].name != kDefaultColumnFamilyName) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
*blob_db = nullptr;
Status s;
@ -124,20 +144,36 @@ Status BlobDB::Open(const DBOptions& db_options_input,
ReconcileWalFilter_t rw_filter = std::make_shared<BlobReconcileWalFilter>();
db_options.listeners.emplace_back(fblistener);
if (bdb_options.enable_garbage_collection) {
db_options.listeners.emplace_back(ce_listener);
}
db_options.wal_filter = rw_filter.get();
{
MutexLock l(&listener_mutex);
all_blobdb_listeners.push_back(fblistener);
if (bdb_options.enable_garbage_collection) {
all_blobdb_listeners.push_back(ce_listener);
}
all_wal_filters.push_back(rw_filter);
}
ColumnFamilyOptions cf_options(column_families[0].options);
if (cf_options.compaction_filter != nullptr ||
cf_options.compaction_filter_factory != nullptr) {
return Status::NotSupported("Blob DB doesn't support compaction filter.");
}
cf_options.compaction_filter_factory.reset(
new BlobIndexCompactionFilterFactory(db_options.env,
db_options.statistics.get()));
ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options);
// we need to open blob db first so that recovery can happen
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
fblistener->SetImplPtr(bdb);
if (bdb_options.enable_garbage_collection) {
ce_listener->SetImplPtr(bdb);
}
rw_filter->SetImplPtr(bdb);
s = bdb->OpenPhase1();
@ -152,7 +188,7 @@ Status BlobDB::Open(const DBOptions& db_options_input,
}
DB* db = nullptr;
s = DB::Open(db_options, dbname, column_families, handles, &db);
s = DB::Open(db_options, dbname, {cf_descriptor}, handles, &db);
if (!s.ok()) {
delete bdb;
return s;
@ -190,6 +226,8 @@ void BlobDBOptions::Dump(Logger* log) const {
ttl_extractor.get());
ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d",
static_cast<int>(compression));
ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d",
enable_garbage_collection);
ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d",
disable_background_tasks);
}

View File

@ -71,7 +71,12 @@ struct BlobDBOptions {
// what compression to use for blobs
CompressionType compression = kNoCompression;
// Disable all background job.
// If enabled, blob DB periodically cleans up stale data by rewriting the
// remaining live data in blob files to new files. If garbage collection is
// not enabled, blob files will be cleaned up based on TTL.
bool enable_garbage_collection = false;
// Disable all background jobs. Used for tests only.
bool disable_background_tasks = false;
void Dump(Logger* log) const;
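A minimal sketch of opening a BlobDB instance with the new garbage-collection switch enabled (path and compression choice are illustrative; the header lives under utilities/ in the source tree rather than the public include directory):

#include "utilities/blob_db/blob_db.h"

rocksdb::Status OpenBlobDb(rocksdb::blob_db::BlobDB** blob_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::blob_db::BlobDBOptions bdb_options;
  bdb_options.enable_garbage_collection = true;  // rewrite live data out of stale files
  bdb_options.compression = rocksdb::kSnappyCompression;
  return rocksdb::blob_db::BlobDB::Open(options, bdb_options, "/tmp/blobdb",
                                        blob_db);
}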

View File

@ -14,6 +14,7 @@
#include "db/db_impl.h"
#include "db/write_batch_internal.h"
#include "monitoring/instrumented_mutex.h"
#include "monitoring/statistics.h"
#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
@ -30,6 +31,7 @@
#include "util/logging.h"
#include "util/mutexlock.h"
#include "util/random.h"
#include "util/stop_watch.h"
#include "util/sync_point.h"
#include "util/timer_queue.h"
#include "utilities/blob_db/blob_db_iterator.h"
@ -62,13 +64,14 @@ bool blobf_compare_ttl::operator()(const std::shared_ptr<BlobFile>& lhs,
if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
return false;
}
return lhs->BlobFileNumber() > rhs->BlobFileNumber();
return lhs->BlobFileNumber() < rhs->BlobFileNumber();
}
void EvictAllVersionsCompactionListener::InternalListener::OnCompaction(
int level, const Slice& key,
CompactionEventListener::CompactionListenerValueType value_type,
const Slice& existing_value, const SequenceNumber& sn, bool is_new) {
assert(impl_->bdb_options_.enable_garbage_collection);
if (!is_new &&
value_type ==
CompactionEventListener::CompactionListenerValueType::kValue) {
@ -105,19 +108,17 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
bdb_options_(blob_db_options),
db_options_(db_options),
env_options_(db_options),
statistics_(db_options_.statistics.get()),
dir_change_(false),
next_file_number_(1),
epoch_of_(0),
shutdown_(false),
current_epoch_(0),
open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0),
open_p1_done_(false),
debug_level_(0) {
debug_level_(0),
oldest_file_evicted_(false) {
blob_dir_ = (bdb_options_.path_relative)
? dbname + "/" + bdb_options_.blob_dir
: bdb_options_.blob_dir;
@ -161,17 +162,15 @@ BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
bdb_options_(blob_db_options),
db_options_(db->GetOptions()),
env_options_(db_->GetOptions()),
statistics_(db_options_.statistics.get()),
dir_change_(false),
next_file_number_(1),
epoch_of_(0),
shutdown_(false),
current_epoch_(0),
open_file_count_(0),
last_period_write_(0),
last_period_ampl_(0),
total_periods_write_(0),
total_periods_ampl_(0),
total_blob_space_(0) {
total_blob_space_(0),
oldest_file_evicted_(false) {
if (!bdb_options_.blob_dir.empty())
blob_dir_ = (bdb_options_.path_relative)
? db_->GetName() + "/" + bdb_options_.blob_dir
@ -211,19 +210,19 @@ void BlobDBImpl::StartBackgroundTasks() {
std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1));
tqueue_.add(kGCCheckPeriodMillisecs,
std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1));
if (bdb_options_.enable_garbage_collection) {
tqueue_.add(
kDeleteCheckPeriodMillisecs,
std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1));
tqueue_.add(
kDeleteCheckPeriodMillisecs,
std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1));
}
tqueue_.add(
kDeleteObsoleteFilesPeriodMillisecs,
std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
tqueue_.add(kSanityCheckPeriodMillisecs,
std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
tqueue_.add(kWriteAmplificationStatsPeriodMillisecs,
std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1));
tqueue_.add(kFSyncFilesPeriodMillisecs,
std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1));
tqueue_.add(
@ -325,6 +324,7 @@ Status BlobDBImpl::OpenAllFiles() {
continue;
}
bfptr->SetHasTTL(bfptr->header_.has_ttl);
bfptr->SetCompression(bfptr->header_.compression);
bfptr->header_valid_ = true;
std::shared_ptr<RandomAccessFileReader> ra_reader =
@ -484,8 +484,8 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
}
bfile->log_writer_ = std::make_shared<Writer>(
std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync,
db_options_.use_fsync, boffset);
std::move(fwriter), env_, statistics_, bfile->file_number_,
bdb_options_.bytes_per_sync, db_options_.use_fsync, boffset);
bfile->log_writer_->last_elem_type_ = et;
return s;
@ -562,6 +562,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFile() {
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
bfile->header_valid_ = true;
bfile->SetHasTTL(false);
bfile->SetCompression(bdb_options_.compression);
Status s = writer->WriteHeader(bfile->header_);
if (!s.ok()) {
@ -622,6 +623,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) {
;
bfile->header_valid_ = true;
bfile->SetHasTTL(true);
bfile->SetCompression(bdb_options_.compression);
bfile->file_size_ = BlobLogHeader::kSize;
// set the first value of the range, since that is
@ -657,8 +659,10 @@ Status BlobDBImpl::Delete(const WriteOptions& options, const Slice& key) {
SequenceNumber lsn = db_impl_->GetLatestSequenceNumber();
Status s = db_->Delete(options, key);
if (bdb_options_.enable_garbage_collection) {
// add deleted key to list of keys that have been deleted for book-keeping
delete_keys_q_.enqueue({DefaultColumnFamily(), key.ToString(), lsn});
}
return s;
}
@ -735,13 +739,23 @@ class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
};
Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
MutexLock l(&write_mutex_);
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_WRITE);
uint32_t default_cf_id =
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
// TODO(yiwu): In case there are multiple writers the latest sequence would
// not be the actual sequence we are writing. Need to get the sequence
// from the write batch after the DB write instead.
SequenceNumber current_seq = GetLatestSequenceNumber() + 1;
Status s;
BlobInserter blob_inserter(options, this, default_cf_id, current_seq);
Status s = updates->Iterate(&blob_inserter);
{
// Release write_mutex_ before DB write to avoid race condition with
// flush begin listener, which also requires write_mutex_ to sync
// blob files.
MutexLock l(&write_mutex_);
s = updates->Iterate(&blob_inserter);
}
if (!s.ok()) {
return s;
}
@ -749,7 +763,6 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
if (!s.ok()) {
return s;
}
assert(blob_inserter.sequence() == GetLatestSequenceNumber() + 1);
// add deleted key to list of keys that have been deleted for book-keeping
class DeleteBookkeeper : public WriteBatch::Handler {
@ -778,11 +791,13 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
SequenceNumber sequence_;
};
if (bdb_options_.enable_garbage_collection) {
// add deleted key to list of keys that have been deleted for book-keeping
DeleteBookkeeper delete_bookkeeper(this, current_seq);
updates->Iterate(&delete_bookkeeper);
s = updates->Iterate(&delete_bookkeeper);
}
return Status::OK();
return s;
}
Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
@ -836,20 +851,32 @@ Status BlobDBImpl::PutWithTTL(const WriteOptions& options,
Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
const Slice& value, uint64_t expiration) {
MutexLock l(&write_mutex_);
SequenceNumber sequence = GetLatestSequenceNumber() + 1;
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_PUT);
TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start");
Status s;
WriteBatch batch;
Status s = PutBlobValue(options, key, value, expiration, sequence, &batch);
{
// Release write_mutex_ before DB write to avoid race condition with
// flush begin listener, which also requires write_mutex_ to sync
// blob files.
MutexLock l(&write_mutex_);
// TODO(yiwu): In case there are multiple writers the latest sequence would
// not be the actual sequence we are writing. Need to get the sequence
// from the write batch after the DB write instead.
SequenceNumber sequence = GetLatestSequenceNumber() + 1;
s = PutBlobValue(options, key, value, expiration, sequence, &batch);
}
if (s.ok()) {
s = db_->Write(options, &batch);
}
TEST_SYNC_POINT("BlobDBImpl::PutUntil:Finish");
return s;
}
Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
const Slice& value, uint64_t expiration,
SequenceNumber sequence, WriteBatch* batch) {
TEST_SYNC_POINT("BlobDBImpl::PutBlobValue:Start");
Status s;
std::string index_entry;
uint32_t column_family_id =
@ -858,11 +885,13 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
if (expiration == kNoExpiration) {
// Put as normal value
s = batch->Put(key, value);
RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
} else {
// Inlined with TTL
BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
index_entry);
RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
}
} else {
std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration)
@ -872,6 +901,7 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
return Status::NotFound("Blob file not found");
}
assert(bfile->compression() == bdb_options_.compression);
std::string compression_output;
Slice value_compressed = GetCompressedSlice(value, &compression_output);
@ -880,6 +910,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration,
&index_entry);
if (expiration == kNoExpiration) {
RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
} else {
RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
}
if (s.ok()) {
bfile->ExtendSequenceRange(sequence);
@ -901,7 +936,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
}
}
TEST_SYNC_POINT("BlobDBImpl::PutBlobValue:Finish");
RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
MeasureTime(statistics_, BLOB_DB_KEY_SIZE, key.size());
MeasureTime(statistics_, BLOB_DB_VALUE_SIZE, value.size());
return s;
}
@ -910,6 +949,7 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
if (bdb_options_.compression == kNoCompression) {
return raw;
}
StopWatch compression_sw(env_, statistics_, BLOB_DB_COMPRESSION_MICROS);
CompressionType ct = bdb_options_.compression;
CompressionOptions compression_opts;
CompressBlock(raw, compression_opts, &ct, kBlockBasedTableVersionFormat,
@ -931,19 +971,74 @@ uint64_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value,
return has_expiration ? expiration : kNoExpiration;
}
std::shared_ptr<BlobFile> BlobDBImpl::GetOldestBlobFile() {
std::vector<std::shared_ptr<BlobFile>> blob_files;
CopyBlobFiles(&blob_files, [](const std::shared_ptr<BlobFile>& f) {
return !f->Obsolete() && f->Immutable();
});
blobf_compare_ttl compare;
return *std::min_element(blob_files.begin(), blob_files.end(), compare);
}
bool BlobDBImpl::EvictOldestBlobFile() {
auto oldest_file = GetOldestBlobFile();
if (oldest_file == nullptr) {
return false;
}
WriteLock wl(&mutex_);
// Double check the file is not obsolete by others
if (oldest_file_evicted_ == false && !oldest_file->Obsolete()) {
auto expiration_range = oldest_file->GetExpirationRange();
ROCKS_LOG_INFO(db_options_.info_log,
"Evict oldest blob file since DB out of space. Current "
"space used: %" PRIu64 ", blob dir size: %" PRIu64
", evicted blob file #%" PRIu64
" with expiration range (%" PRIu64 ", %" PRIu64 ").",
total_blob_space_.load(), bdb_options_.blob_dir_size,
oldest_file->BlobFileNumber(), expiration_range.first,
expiration_range.second);
oldest_file->MarkObsolete(oldest_file->GetSequenceRange().second);
obsolete_files_.push_back(oldest_file);
oldest_file_evicted_.store(true);
RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
oldest_file->BlobCount());
RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
oldest_file->GetFileSize());
return true;
}
return false;
}
Status BlobDBImpl::CheckSize(size_t blob_size) {
uint64_t new_space_util = total_blob_space_.load() + blob_size;
if (bdb_options_.blob_dir_size > 0) {
if (!bdb_options_.is_fifo &&
(new_space_util > bdb_options_.blob_dir_size)) {
return Status::NoSpace(
"Write failed, as writing it would exceed blob_dir_size limit.");
}
if (bdb_options_.is_fifo && !oldest_file_evicted_.load() &&
(new_space_util >
kEvictOldestFileAtSize * bdb_options_.blob_dir_size)) {
EvictOldestBlobFile();
}
}
return Status::OK();
}
Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
const std::string& headerbuf, const Slice& key,
const Slice& value, uint64_t expiration,
std::string* index_entry) {
auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size();
if (bdb_options_.blob_dir_size > 0 &&
(total_blob_space_.load() + size_put) > bdb_options_.blob_dir_size) {
if (!bdb_options_.is_fifo) {
return Status::NoSpace("Blob DB reached the maximum configured size.");
Status s = CheckSize(size_put);
if (!s.ok()) {
return s;
}
}
Status s;
uint64_t blob_offset = 0;
uint64_t key_offset = 0;
@ -968,7 +1063,6 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
bfile->blob_count_++;
bfile->file_size_ += size_put;
last_period_write_ += size_put;
total_blob_space_ += size_put;
if (expiration == kNoExpiration) {
@ -986,6 +1080,8 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
std::vector<Status> BlobDBImpl::MultiGet(
const ReadOptions& read_options,
const std::vector<Slice>& keys, std::vector<std::string>* values) {
StopWatch multiget_sw(env_, statistics_, BLOB_DB_MULTIGET_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_MULTIGET);
// Get a snapshot to avoid the blob file getting deleted between fetching
// the index entry and reading from the file.
ReadOptions ro(read_options);
@ -1089,7 +1185,12 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
char* buffer = &(*valueptr)[0];
Slice blob_value;
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, buffer);
{
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value,
buffer);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
}
if (!s.ok() || blob_value.size() != blob_index.size()) {
if (debug_level_ >= 2) {
ROCKS_LOG_ERROR(db_options_.info_log,
@ -1135,15 +1236,17 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
return Status::Corruption("Corruption. Blob CRC mismatch");
}
// TODO(yiwu): Should use compression flag in the blob file instead of
// current compression option.
if (bdb_options_.compression != kNoCompression) {
if (bfile->compression() != kNoCompression) {
BlockContents contents;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
{
StopWatch decompression_sw(env_, statistics_,
BLOB_DB_DECOMPRESSION_MICROS);
s = UncompressBlockContentsForCompressionType(
blob_value.data(), blob_value.size(), &contents,
kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
*(cfh->cfd()->ioptions()));
}
*(value->GetSelf()) = contents.data.ToString();
}
@ -1155,6 +1258,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
Status BlobDBImpl::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
StopWatch get_sw(env_, statistics_, BLOB_DB_GET_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_GET);
return GetImpl(read_options, column_family, key, value);
}
Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) {
if (column_family != DefaultColumnFamily()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
@ -1167,19 +1278,21 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
Status s;
bool is_blob_index = false;
s = db_impl_->GetImpl(ro, column_family, key, value, nullptr /*value_found*/,
nullptr /*read_callback*/, &is_blob_index);
s = db_impl_->GetImpl(ro, column_family, key, value,
nullptr /*value_found*/, nullptr /*read_callback*/,
&is_blob_index);
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
if (s.ok()) {
if (is_blob_index) {
PinnableSlice index_entry = std::move(*value);
if (s.ok() && is_blob_index) {
std::string index_entry = value->ToString();
value->Reset();
s = GetBlobValue(key, index_entry, value);
}
}
if (snapshot_created) {
db_->ReleaseSnapshot(ro.snapshot);
}
RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
return s;
}
@ -1254,32 +1367,18 @@ Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile) {
return CloseBlobFile(bfile);
}
bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked(
bool BlobDBImpl::VisibleToActiveSnapshot(
const std::shared_ptr<BlobFile>& bfile) {
assert(bfile->Obsolete());
SequenceNumber esn = bfile->GetSequenceRange().first;
// TODO(yiwu): Here we should instead check whether there is an active snapshot
// that lies between the first sequence in the file and the last sequence by
// the time the file finished being garbage collected.
bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn);
if (notok) {
ROCKS_LOG_INFO(db_options_.info_log,
"Could not delete file due to snapshot failure %s",
bfile->PathName().c_str());
return false;
} else {
ROCKS_LOG_INFO(db_options_.info_log,
"Will delete file due to snapshot success %s",
bfile->PathName().c_str());
return true;
}
SequenceNumber first_sequence = bfile->GetSequenceRange().first;
SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence();
return db_impl_->HasActiveSnapshotInRange(first_sequence, obsolete_sequence);
}
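// A hedged sketch: VisibleToActiveSnapshot() above defers to a DBImpl helper.
// The standalone function below only illustrates the intended range check;
// the name, signature, and exact boundary handling are assumptions, not the
// real DBImpl implementation.
#include <cstdint>
#include <vector>

using SequenceNumber = uint64_t;

// A blob file must be kept if some live snapshot was taken at or after the
// file's first write but before the file was marked obsolete.
bool HasActiveSnapshotInRangeSketch(
    const std::vector<SequenceNumber>& snapshot_seqs,
    SequenceNumber first_sequence, SequenceNumber obsolete_sequence) {
  for (SequenceNumber snapshot_seq : snapshot_seqs) {
    if (snapshot_seq >= first_sequence && snapshot_seq < obsolete_sequence) {
      return true;
    }
  }
  return false;
}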
bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
uint64_t blob_offset,
uint64_t blob_size) {
assert(bdb_options_.enable_garbage_collection);
(void)blob_offset;
std::shared_ptr<BlobFile> bfile;
{
@ -1302,6 +1401,7 @@ bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
}
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
assert(bdb_options_.enable_garbage_collection);
BlobIndex blob_index;
Status s = blob_index.DecodeFrom(index_entry);
if (!s.ok()) {
@ -1316,6 +1416,7 @@ bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
}
std::pair<bool, int64_t> BlobDBImpl::EvictCompacted(bool aborted) {
assert(bdb_options_.enable_garbage_collection);
if (aborted) return std::make_pair(false, -1);
override_packet_t packet;
@ -1339,6 +1440,7 @@ std::pair<bool, int64_t> BlobDBImpl::EvictCompacted(bool aborted) {
}
std::pair<bool, int64_t> BlobDBImpl::EvictDeletions(bool aborted) {
assert(bdb_options_.enable_garbage_collection);
if (aborted) return std::make_pair(false, -1);
ColumnFamilyHandle* last_cfh = nullptr;
@ -1478,35 +1580,6 @@ std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
return std::make_pair(true, -1);
}
// TODO(yiwu): correct the stats and expose it.
std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
if (aborted) return std::make_pair(false, -1);
WriteLock wl(&mutex_);
if (all_periods_write_.size() >= kWriteAmplificationStatsPeriods) {
total_periods_write_ -= (*all_periods_write_.begin());
total_periods_ampl_ = (*all_periods_ampl_.begin());
all_periods_write_.pop_front();
all_periods_ampl_.pop_front();
}
uint64_t val1 = last_period_write_.load();
uint64_t val2 = last_period_ampl_.load();
all_periods_write_.push_back(val1);
all_periods_ampl_.push_back(val2);
last_period_write_ = 0;
last_period_ampl_ = 0;
total_periods_write_ += val1;
total_periods_ampl_ += val2;
return std::make_pair(true, -1);
}
// Write callback for garbage collection to check if key has been updated
// since last read. Similar to how OptimisticTransaction works. See inline
// comment in GCFileAndUpdateLSM().
@ -1567,6 +1640,7 @@ class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
// DELETED in the LSM
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
GCStats* gc_stats) {
StopWatch gc_sw(env_, statistics_, BLOB_DB_GC_MICROS);
uint64_t now = EpochNow();
std::shared_ptr<Reader> reader =
@ -1648,7 +1722,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
ReadOptions(), cfh, record.key, &index_entry, nullptr /*value_found*/,
nullptr /*read_callback*/, &is_blob_index);
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB");
if (!get_status.ok() && !get_status.ok()) {
if (!get_status.ok() && !get_status.IsNotFound()) {
// error
s = get_status;
ROCKS_LOG_ERROR(db_options_.info_log,
@ -1659,6 +1733,8 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
if (get_status.IsNotFound() || !is_blob_index) {
// Either the key is deleted or updated with a newer version which is
// inlined in LSM.
gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
continue;
}
@ -1670,18 +1746,23 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
s.ToString().c_str());
break;
}
if (blob_index.file_number() != bfptr->BlobFileNumber() ||
if (blob_index.IsInlined() ||
blob_index.file_number() != bfptr->BlobFileNumber() ||
blob_index.offset() != blob_offset) {
// Key has been overwritten. Drop the blob record.
gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
continue;
}
GarbageCollectionWriteCallback callback(cfd, record.key, latest_seq);
// If key has expired, remove it from base DB.
// TODO(yiwu): Blob indexes will be removed by BlobIndexCompactionFilter.
// We can just drop the blob record.
if (no_relocation_ttl || (has_ttl && now >= record.expiration)) {
gc_stats->num_deletes++;
gc_stats->deleted_size += record.value_size;
gc_stats->num_keys_expired++;
gc_stats->bytes_expired += record.record_size();
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
WriteBatch delete_batch;
Status delete_status = delete_batch.Delete(record.key);
@ -1689,12 +1770,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
delete_status = db_impl_->WriteWithCallback(WriteOptions(),
&delete_batch, &callback);
}
if (delete_status.ok()) {
gc_stats->delete_succeeded++;
} else if (delete_status.IsBusy()) {
// The key is overwritten in the meanwhile. Drop the blob record.
gc_stats->overwritten_while_delete++;
} else {
if (!delete_status.ok() && !delete_status.IsBusy()) {
// We hit an error.
s = delete_status;
ROCKS_LOG_ERROR(db_options_.info_log,
@ -1717,7 +1793,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
std::string reason("GC of ");
reason += bfptr->PathName();
newfile = NewBlobFile(reason);
gc_stats->newfile = newfile;
new_writer = CheckOrCreateWriterLocked(newfile);
newfile->header_ = std::move(header);
@ -1739,9 +1814,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile));
}
gc_stats->num_relocate++;
std::string new_index_entry;
uint64_t new_blob_offset = 0;
uint64_t new_key_offset = 0;
// write the blob to the blob log.
@ -1765,10 +1838,14 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
&rewrite_batch, &callback);
}
if (rewrite_status.ok()) {
gc_stats->relocate_succeeded++;
newfile->ExtendSequenceRange(
WriteBatchInternal::Sequence(&rewrite_batch));
gc_stats->num_keys_relocated++;
gc_stats->bytes_relocated += record.record_size();
} else if (rewrite_status.IsBusy()) {
// The key is overwritten in the meanwhile. Drop the blob record.
gc_stats->overwritten_while_relocate++;
gc_stats->num_keys_overwritten++;
gc_stats->bytes_overwritten += record.record_size();
} else {
// We hit an error.
s = rewrite_status;
@ -1778,19 +1855,47 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
}
} // end of ReadRecord loop
if (s.ok()) {
SequenceNumber obsolete_sequence =
newfile == nullptr ? bfptr->GetSequenceRange().second + 1
: newfile->GetSequenceRange().second;
bfptr->MarkObsolete(obsolete_sequence);
if (!first_gc) {
WriteLock wl(&mutex_);
obsolete_files_.push_back(bfptr);
}
}
ROCKS_LOG_INFO(
db_options_.info_log,
"%s blob file %" PRIu64
". Total blob records: %" PRIu64 ", Deletes: %" PRIu64 "/%" PRIu64
" succeeded, Relocates: %" PRIu64 "/%" PRIu64 " succeeded.",
"%s blob file %" PRIu64 ". Total blob records: %" PRIu64
", Expired: %" PRIu64 " keys/%" PRIu64 " bytes, Overwritten: %" PRIu64
" keys/%" PRIu64 " bytes.",
s.ok() ? "Successfully garbage collected" : "Failed to garbage collect",
bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->delete_succeeded,
gc_stats->num_deletes, gc_stats->relocate_succeeded,
gc_stats->num_relocate);
bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->num_keys_expired,
gc_stats->bytes_expired, gc_stats->num_keys_overwritten,
gc_stats->bytes_overwritten, gc_stats->num_keys_relocated,
gc_stats->bytes_relocated);
RecordTick(statistics_, BLOB_DB_GC_NUM_FILES);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
gc_stats->num_keys_overwritten);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_EXPIRED,
gc_stats->num_keys_expired);
RecordTick(statistics_, BLOB_DB_GC_BYTES_OVERWRITTEN,
gc_stats->bytes_overwritten);
RecordTick(statistics_, BLOB_DB_GC_BYTES_EXPIRED, gc_stats->bytes_expired);
if (newfile != nullptr) {
total_blob_space_ += newfile->file_size_;
ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".",
newfile->BlobFileNumber());
RecordTick(statistics_, BLOB_DB_GC_NUM_NEW_FILES);
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
gc_stats->num_keys_relocated);
RecordTick(statistics_, BLOB_DB_GC_BYTES_RELOCATED,
gc_stats->bytes_relocated);
}
if (!s.ok()) {
RecordTick(statistics_, BLOB_DB_GC_FAILURES);
}
return s;
}
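// A hedged summary of the per-record decision made in the GC loop above when
// GCStats is updated; the enum and helper below are illustrative only and do
// not appear in the diff.
enum class GcAction { kDropOverwritten, kDropExpired, kRelocate };

inline GcAction ClassifyGcRecord(bool deleted_or_inlined_in_lsm,
                                 bool points_to_other_blob, bool expired) {
  if (deleted_or_inlined_in_lsm || points_to_other_blob) {
    return GcAction::kDropOverwritten;  // num_keys_overwritten, bytes_overwritten
  }
  if (expired) {
    return GcAction::kDropExpired;  // num_keys_expired, bytes_expired
  }
  return GcAction::kRelocate;  // num_keys_relocated, bytes_relocated
}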
@ -1844,11 +1949,13 @@ bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, uint64_t now,
ReadLock lockbfile_r(&bfile->mutex_);
if (bdb_options_.enable_garbage_collection) {
if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
kPartialExpirationPercentage) {
*reason = "deleted simple blobs beyond threshold";
return true;
}
}
// if we haven't reached limits of disk space, don't DELETE
if (bdb_options_.blob_dir_size == 0 ||
@ -1884,11 +1991,17 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
auto bfile = *iter;
{
ReadLock lockbfile_r(&bfile->mutex_);
if (!FileDeleteOk_SnapshotCheckLocked(bfile)) {
if (VisibleToActiveSnapshot(bfile)) {
ROCKS_LOG_INFO(db_options_.info_log,
"Could not delete file due to snapshot failure %s",
bfile->PathName().c_str());
++iter;
continue;
}
}
ROCKS_LOG_INFO(db_options_.info_log,
"Will delete file due to snapshot success %s",
bfile->PathName().c_str());
blob_files_.erase(bfile->BlobFileNumber());
Status s = env_->DeleteFile(bfile->PathName());
@ -1910,7 +2023,12 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
}
// directory change. Fsync
if (file_deleted) dir_ent_->Fsync();
if (file_deleted) {
dir_ent_->Fsync();
// reset oldest_file_evicted flag
oldest_file_evicted_.store(false);
}
// put files back into the obsolete list if, for some reason, the delete failed
if (!tobsolete.empty()) {
@ -1924,14 +2042,19 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
}
void BlobDBImpl::CopyBlobFiles(
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
std::function<bool(const std::shared_ptr<BlobFile>&)> predicate) {
ReadLock rl(&mutex_);
// take a copy
bfiles_copy->reserve(blob_files_.size());
for (auto const& p : blob_files_) {
bool pred_value = true;
if (predicate) {
pred_value = predicate(p.second);
}
if (pred_value) {
bfiles_copy->push_back(p.second);
}
}
}
void BlobDBImpl::FilterSubsetOfFiles(
@ -2008,8 +2131,6 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
FilterSubsetOfFiles(blob_files, &to_process, current_epoch_,
files_to_collect);
// in this collect the set of files, which became obsolete
std::vector<std::shared_ptr<BlobFile>> obsoletes;
for (auto bfile : to_process) {
GCStats gc_stats;
Status s = GCFileAndUpdateLSM(bfile, &gc_stats);
@ -2020,19 +2141,11 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
if (bfile->gc_once_after_open_.load()) {
WriteLock lockbfile_w(&bfile->mutex_);
bfile->deleted_size_ = gc_stats.deleted_size;
bfile->deleted_count_ = gc_stats.num_deletes;
bfile->deleted_size_ =
gc_stats.bytes_overwritten + gc_stats.bytes_expired;
bfile->deleted_count_ =
gc_stats.num_keys_overwritten + gc_stats.num_keys_expired;
bfile->gc_once_after_open_ = false;
} else {
obsoletes.push_back(bfile);
}
}
if (!obsoletes.empty()) {
WriteLock wl(&mutex_);
for (auto bfile : obsoletes) {
bfile->SetCanBeDeleted();
obsolete_files_.push_front(bfile);
}
}
@ -2054,7 +2167,7 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
auto* iter = db_impl_->NewIteratorImpl(
read_options, cfd, snapshot->GetSequenceNumber(),
nullptr /*read_callback*/, true /*allow_blob*/);
return new BlobDBIterator(own_snapshot, iter, this);
return new BlobDBIterator(own_snapshot, iter, this, env_, statistics_);
}
Status DestroyBlobDB(const std::string& dbname, const Options& options,
@ -2129,16 +2242,6 @@ Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr<BlobFile>& bfile,
}
void BlobDBImpl::TEST_RunGC() { RunGC(false /*abort*/); }
void BlobDBImpl::TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile) {
uint64_t number = bfile->BlobFileNumber();
assert(blob_files_.count(number) > 0);
bfile->SetCanBeDeleted();
{
WriteLock l(&mutex_);
obsolete_files_.push_back(bfile);
}
}
#endif // !NDEBUG
} // namespace blob_db


@ -24,6 +24,7 @@
#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/wal_filter.h"
#include "util/mpsc.h"
#include "util/mutexlock.h"
@ -135,16 +136,12 @@ struct blobf_compare_ttl {
struct GCStats {
uint64_t blob_count = 0;
uint64_t num_deletes = 0;
uint64_t deleted_size = 0;
uint64_t retry_delete = 0;
uint64_t delete_succeeded = 0;
uint64_t overwritten_while_delete = 0;
uint64_t num_relocate = 0;
uint64_t retry_relocate = 0;
uint64_t relocate_succeeded = 0;
uint64_t overwritten_while_relocate = 0;
std::shared_ptr<BlobFile> newfile = nullptr;
uint64_t num_keys_overwritten = 0;
uint64_t num_keys_expired = 0;
uint64_t num_keys_relocated = 0;
uint64_t bytes_overwritten = 0;
uint64_t bytes_expired = 0;
uint64_t bytes_relocated = 0;
};
/**
@ -178,10 +175,6 @@ class BlobDBImpl : public BlobDB {
// how many periods of stats do we keep.
static constexpr uint32_t kWriteAmplificationStatsPeriods = 24;
// what is the length of any period
static constexpr uint32_t kWriteAmplificationStatsPeriodMillisecs =
3600 * 1000;
// we will garbage collect blob files in
// which entire files have expired. However if the
// ttl_range of files is very large say a day, we
@ -205,6 +198,10 @@ class BlobDBImpl : public BlobDB {
// how often to schedule check seq files period
static constexpr uint32_t kCheckSeqFilesPeriodMillisecs = 10 * 1000;
// when should oldest file be evicted:
// on reaching 90% of blob_dir_size
static constexpr double kEvictOldestFileAtSize = 0.9;
using BlobDB::Put;
Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) override;
@ -275,8 +272,6 @@ class BlobDBImpl : public BlobDB {
void TEST_RunGC();
void TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile);
void TEST_DeleteObsoleteFiles();
#endif // !NDEBUG
@ -290,6 +285,10 @@ class BlobDBImpl : public BlobDB {
// Return true if a snapshot is created.
bool SetSnapshotIfNeeded(ReadOptions* read_options);
Status GetImpl(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value);
Status GetBlobValue(const Slice& key, const Slice& index_entry,
PinnableSlice* value);
@ -362,9 +361,6 @@ class BlobDBImpl : public BlobDB {
// efficiency
std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
// periodically print write amplification statistics
std::pair<bool, int64_t> WaStats(bool aborted);
// background task to do book-keeping of deleted keys
std::pair<bool, int64_t> EvictDeletions(bool aborted);
@ -407,6 +403,7 @@ class BlobDBImpl : public BlobDB {
// checks if there is no snapshot which is referencing the
// blobs
bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
@ -414,7 +411,9 @@ class BlobDBImpl : public BlobDB {
bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
uint64_t blob_offset, uint64_t blob_size);
void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
void CopyBlobFiles(
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
std::function<bool(const std::shared_ptr<BlobFile>&)> predicate = {});
void FilterSubsetOfFiles(
const std::vector<std::shared_ptr<BlobFile>>& blob_files,
@ -423,6 +422,12 @@ class BlobDBImpl : public BlobDB {
uint64_t EpochNow() { return env_->NowMicros() / 1000000; }
Status CheckSize(size_t blob_size);
std::shared_ptr<BlobFile> GetOldestBlobFile();
bool EvictOldestBlobFile();
// the base DB
DBImpl* db_impl_;
Env* env_;
@ -433,6 +438,9 @@ class BlobDBImpl : public BlobDB {
DBOptions db_options_;
EnvOptions env_options_;
// Raw pointer to statistics. db_options_ holds a shared_ptr for ownership.
Statistics* statistics_;
// name of the database directory
std::string dbname_;
@ -508,24 +516,14 @@ class BlobDBImpl : public BlobDB {
// counter is used to monitor and close excess RA files.
std::atomic<uint32_t> open_file_count_;
// should hold mutex to modify
// STATISTICS for WA of Blob Files due to GC
// collect by default 24 hourly periods
std::list<uint64_t> all_periods_write_;
std::list<uint64_t> all_periods_ampl_;
std::atomic<uint64_t> last_period_write_;
std::atomic<uint64_t> last_period_ampl_;
uint64_t total_periods_write_;
uint64_t total_periods_ampl_;
// total size of all blob files at a given time
std::atomic<uint64_t> total_blob_space_;
std::list<std::shared_ptr<BlobFile>> obsolete_files_;
bool open_p1_done_;
uint32_t debug_level_;
std::atomic<bool> oldest_file_evicted_;
};
} // namespace blob_db


@ -6,7 +6,9 @@
#pragma once
#ifndef ROCKSDB_LITE
#include "monitoring/statistics.h"
#include "rocksdb/iterator.h"
#include "util/stop_watch.h"
#include "utilities/blob_db/blob_db_impl.h"
namespace rocksdb {
@ -17,8 +19,12 @@ using rocksdb::ManagedSnapshot;
class BlobDBIterator : public Iterator {
public:
BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
BlobDBImpl* blob_db)
: snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {}
BlobDBImpl* blob_db, Env* env, Statistics* statistics)
: snapshot_(snapshot),
iter_(iter),
blob_db_(blob_db),
env_(env),
statistics_(statistics) {}
virtual ~BlobDBIterator() = default;
@ -37,33 +43,45 @@ class BlobDBIterator : public Iterator {
}
void SeekToFirst() override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekToFirst();
UpdateBlobValue();
}
void SeekToLast() override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekToLast();
UpdateBlobValue();
}
void Seek(const Slice& target) override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->Seek(target);
UpdateBlobValue();
}
void SeekForPrev(const Slice& target) override {
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
iter_->SeekForPrev(target);
UpdateBlobValue();
}
void Next() override {
assert(Valid());
StopWatch next_sw(env_, statistics_, BLOB_DB_NEXT_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_NEXT);
iter_->Next();
UpdateBlobValue();
}
void Prev() override {
assert(Valid());
StopWatch prev_sw(env_, statistics_, BLOB_DB_PREV_MICROS);
RecordTick(statistics_, BLOB_DB_NUM_PREV);
iter_->Prev();
UpdateBlobValue();
}
@ -96,6 +114,8 @@ class BlobDBIterator : public Iterator {
std::unique_ptr<ManagedSnapshot> snapshot_;
std::unique_ptr<ArenaWrappedDBIter> iter_;
BlobDBImpl* blob_db_;
Env* env_;
Statistics* statistics_;
Status status_;
PinnableSlice value_;
};


@ -47,10 +47,23 @@ class BlobDBTest : public testing::Test {
~BlobDBTest() { Destroy(); }
void Open(BlobDBOptions bdb_options = BlobDBOptions(),
Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(),
Options options = Options()) {
options.create_if_missing = true;
ASSERT_OK(BlobDB::Open(options, bdb_options, dbname_, &blob_db_));
return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
}
void Open(BlobDBOptions bdb_options = BlobDBOptions(),
Options options = Options()) {
ASSERT_OK(TryOpen(bdb_options, options));
}
void Reopen(BlobDBOptions bdb_options = BlobDBOptions(),
Options options = Options()) {
assert(blob_db_ != nullptr);
delete blob_db_;
blob_db_ = nullptr;
Open(bdb_options, options);
}
void Destroy() {
@ -63,6 +76,26 @@ class BlobDBTest : public testing::Test {
}
}
BlobDBImpl *blob_db_impl() {
return reinterpret_cast<BlobDBImpl *>(blob_db_);
}
Status Put(const Slice &key, const Slice &value) {
return blob_db_->Put(WriteOptions(), key, value);
}
void Delete(const std::string &key,
std::map<std::string, std::string> *data = nullptr) {
ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
if (data != nullptr) {
data->erase(key);
}
}
Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
}
void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
std::map<std::string, std::string> *data = nullptr) {
int len = rnd->Next() % kMaxBlobSize + 1;
@ -111,20 +144,24 @@ class BlobDBTest : public testing::Test {
}
}
void Delete(const std::string &key,
std::map<std::string, std::string> *data = nullptr) {
ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
if (data != nullptr) {
data->erase(key);
}
}
// Verify blob db contains expected data and nothing more.
void VerifyDB(const std::map<std::string, std::string> &data) {
VerifyDB(blob_db_, data);
}
void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
// Verify normal Get
auto* cfh = db->DefaultColumnFamily();
for (auto &p : data) {
PinnableSlice value_slice;
ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
ASSERT_EQ(p.second, value_slice.ToString());
std::string value;
ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value));
ASSERT_EQ(p.second, value);
}
// Verify iterators
Iterator *iter = db->NewIterator(ReadOptions());
iter->SeekToFirst();
for (auto &p : data) {
@ -223,8 +260,8 @@ TEST_F(BlobDBTest, PutWithTTL) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
ASSERT_EQ(data.size(), gc_stats.num_relocate);
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -253,8 +290,8 @@ TEST_F(BlobDBTest, PutUntil) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
ASSERT_EQ(data.size(), gc_stats.num_relocate);
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -286,8 +323,8 @@ TEST_F(BlobDBTest, TTLExtrator_NoTTL) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(0, gc_stats.num_deletes);
ASSERT_EQ(100, gc_stats.num_relocate);
ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(100, gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -333,8 +370,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractTTL) {
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
ASSERT_EQ(data.size(), gc_stats.num_relocate);
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -381,8 +418,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractExpiration) {
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
ASSERT_EQ(data.size(), gc_stats.num_relocate);
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -438,8 +475,8 @@ TEST_F(BlobDBTest, TTLExtractor_ChangeValue) {
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
GCStats gc_stats;
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
ASSERT_EQ(data.size(), gc_stats.num_relocate);
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -556,6 +593,24 @@ TEST_F(BlobDBTest, Compression) {
}
VerifyDB(data);
}
TEST_F(BlobDBTest, DecompressAfterReopen) {
Random rnd(301);
BlobDBOptions bdb_options;
bdb_options.min_blob_size = 0;
bdb_options.disable_background_tasks = true;
bdb_options.compression = CompressionType::kSnappyCompression;
Open(bdb_options);
std::map<std::string, std::string> data;
for (size_t i = 0; i < 100; i++) {
PutRandom("put-key" + ToString(i), &rnd, &data);
}
VerifyDB(data);
bdb_options.compression = CompressionType::kNoCompression;
Reopen(bdb_options);
VerifyDB(data);
}
#endif
TEST_F(BlobDBTest, MultipleWriters) {
@ -593,16 +648,14 @@ TEST_F(BlobDBTest, GCAfterOverwriteKeys) {
bdb_options.min_blob_size = 0;
bdb_options.disable_background_tasks = true;
Open(bdb_options);
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
DBImpl *db_impl = static_cast_with_check<DBImpl, DB>(blob_db_->GetBaseDB());
std::map<std::string, std::string> data;
for (int i = 0; i < 200; i++) {
PutRandom("key" + ToString(i), &rnd, &data);
}
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
// Test for data in SST
size_t new_keys = 0;
for (int i = 0; i < 100; i++) {
@ -620,10 +673,10 @@ TEST_F(BlobDBTest, GCAfterOverwriteKeys) {
}
}
GCStats gc_stats;
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(200, gc_stats.blob_count);
ASSERT_EQ(0, gc_stats.num_deletes);
ASSERT_EQ(200 - new_keys, gc_stats.num_relocate);
ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(200 - new_keys, gc_stats.num_keys_relocated);
VerifyDB(data);
}
@ -634,16 +687,14 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
bdb_options.disable_background_tasks = true;
Open(bdb_options);
ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v1"));
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
SyncPoint::GetInstance()->LoadDependency(
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
"BlobDBImpl::PutBlobValue:Start"},
{"BlobDBImpl::PutBlobValue:Finish",
"BlobDBImpl::PutUntil:Start"},
{"BlobDBImpl::PutUntil:Finish",
"BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"}});
SyncPoint::GetInstance()->EnableProcessing();
@ -651,12 +702,11 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
[this]() { ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v2")); });
GCStats gc_stats;
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(0, gc_stats.num_deletes);
ASSERT_EQ(1, gc_stats.num_relocate);
ASSERT_EQ(0, gc_stats.relocate_succeeded);
ASSERT_EQ(1, gc_stats.overwritten_while_relocate);
ASSERT_EQ(0, gc_stats.num_keys_expired);
ASSERT_EQ(1, gc_stats.num_keys_overwritten);
ASSERT_EQ(0, gc_stats.num_keys_relocated);
writer.join();
VerifyDB({{"foo", "v2"}});
}
@ -671,17 +721,15 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
Open(bdb_options, options);
mock_env_->set_current_time(100);
ASSERT_OK(blob_db_->PutUntil(WriteOptions(), "foo", "v1", 200));
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
mock_env_->set_current_time(300);
SyncPoint::GetInstance()->LoadDependency(
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
"BlobDBImpl::PutBlobValue:Start"},
{"BlobDBImpl::PutBlobValue:Finish",
"BlobDBImpl::PutUntil:Start"},
{"BlobDBImpl::PutUntil:Finish",
"BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"}});
SyncPoint::GetInstance()->EnableProcessing();
@ -690,22 +738,23 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
});
GCStats gc_stats;
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(1, gc_stats.num_deletes);
ASSERT_EQ(0, gc_stats.delete_succeeded);
ASSERT_EQ(1, gc_stats.overwritten_while_delete);
ASSERT_EQ(0, gc_stats.num_relocate);
ASSERT_EQ(1, gc_stats.num_keys_expired);
ASSERT_EQ(0, gc_stats.num_keys_relocated);
writer.join();
VerifyDB({{"foo", "v2"}});
}
TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
// This test is no longer valid since we now return an error when we go
// over the configured blob_dir_size.
// The test needs to be re-written later in such a way that writes continue
// after a GC happens.
TEST_F(BlobDBTest, DISABLED_GCOldestSimpleBlobFileWhenOutOfSpace) {
// Use mock env to stop wall clock.
Options options;
options.env = mock_env_.get();
BlobDBOptions bdb_options;
bdb_options.is_fifo = true;
bdb_options.blob_dir_size = 100;
bdb_options.blob_file_size = 100;
bdb_options.min_blob_size = 0;
@ -716,9 +765,7 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
for (int i = 0; i < 10; i++) {
ASSERT_OK(blob_db_->Put(WriteOptions(), "key" + ToString(i), value));
}
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(11, blob_files.size());
ASSERT_TRUE(blob_files[0]->HasTTL());
ASSERT_TRUE(blob_files[0]->Immutable());
@ -728,9 +775,9 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
ASSERT_TRUE(blob_files[i]->Immutable());
}
}
blob_db_impl->TEST_RunGC();
blob_db_impl()->TEST_RunGC();
// The oldest simple blob file (i.e. blob_files[1]) has been selected for GC.
auto obsolete_files = blob_db_impl->TEST_GetObsoleteFiles();
auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
ASSERT_EQ(1, obsolete_files.size());
ASSERT_EQ(blob_files[1]->BlobFileNumber(),
obsolete_files[0]->BlobFileNumber());
@ -744,13 +791,11 @@ TEST_F(BlobDBTest, ReadWhileGC) {
bdb_options.disable_background_tasks = true;
Open(bdb_options);
blob_db_->Put(WriteOptions(), "foo", "bar");
BlobDBImpl *blob_db_impl =
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
std::shared_ptr<BlobFile> bfile = blob_files[0];
uint64_t bfile_number = bfile->BlobFileNumber();
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(bfile));
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
switch (i) {
case 0:
@ -788,17 +833,15 @@ TEST_F(BlobDBTest, ReadWhileGC) {
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:1");
GCStats gc_stats;
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
ASSERT_EQ(1, gc_stats.blob_count);
ASSERT_EQ(1, gc_stats.num_relocate);
ASSERT_EQ(1, gc_stats.relocate_succeeded);
blob_db_impl->TEST_ObsoleteFile(blob_files[0]);
blob_db_impl->TEST_DeleteObsoleteFiles();
ASSERT_EQ(1, gc_stats.num_keys_relocated);
blob_db_impl()->TEST_DeleteObsoleteFiles();
// The file shouldn't be deleted
blob_files = blob_db_impl->TEST_GetBlobFiles();
blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(2, blob_files.size());
ASSERT_EQ(bfile_number, blob_files[0]->BlobFileNumber());
auto obsolete_files = blob_db_impl->TEST_GetObsoleteFiles();
auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
ASSERT_EQ(1, obsolete_files.size());
ASSERT_EQ(bfile_number, obsolete_files[0]->BlobFileNumber());
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:2");
@ -806,16 +849,86 @@ TEST_F(BlobDBTest, ReadWhileGC) {
SyncPoint::GetInstance()->DisableProcessing();
// The file is deleted this time
blob_db_impl->TEST_DeleteObsoleteFiles();
blob_files = blob_db_impl->TEST_GetBlobFiles();
blob_db_impl()->TEST_DeleteObsoleteFiles();
blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_NE(bfile_number, blob_files[0]->BlobFileNumber());
ASSERT_EQ(0, blob_db_impl->TEST_GetObsoleteFiles().size());
ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
VerifyDB({{"foo", "bar"}});
Destroy();
}
}
TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
BlobDBOptions bdb_options;
bdb_options.min_blob_size = 0;
bdb_options.disable_background_tasks = true;
// i = when to take snapshot
for (int i = 0; i < 4; i++) {
for (bool delete_key : {true, false}) {
const Snapshot *snapshot = nullptr;
Destroy();
Open(bdb_options);
// First file
ASSERT_OK(Put("key1", "value"));
if (i == 0) {
snapshot = blob_db_->GetSnapshot();
}
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
// Second file
ASSERT_OK(Put("key2", "value"));
if (i == 1) {
snapshot = blob_db_->GetSnapshot();
}
blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(2, blob_files.size());
auto bfile = blob_files[1];
ASSERT_FALSE(bfile->Immutable());
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
// Third file
ASSERT_OK(Put("key3", "value"));
if (i == 2) {
snapshot = blob_db_->GetSnapshot();
}
if (delete_key) {
Delete("key2");
}
GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
ASSERT_TRUE(bfile->Obsolete());
ASSERT_EQ(1, gc_stats.blob_count);
if (delete_key) {
ASSERT_EQ(0, gc_stats.num_keys_relocated);
ASSERT_EQ(bfile->GetSequenceRange().second + 1,
bfile->GetObsoleteSequence());
} else {
ASSERT_EQ(1, gc_stats.num_keys_relocated);
ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
bfile->GetObsoleteSequence());
}
if (i == 3) {
snapshot = blob_db_->GetSnapshot();
}
size_t num_files = delete_key ? 3 : 4;
ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size());
blob_db_impl()->TEST_DeleteObsoleteFiles();
if (i == 0 || i == 3 || (i == 2 && delete_key)) {
// The snapshot shouldn't see data in bfile
ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size());
blob_db_->ReleaseSnapshot(snapshot);
} else {
// The snapshot will see data in bfile, so the file shouldn't be deleted
ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size());
blob_db_->ReleaseSnapshot(snapshot);
blob_db_impl()->TEST_DeleteObsoleteFiles();
ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size());
}
}
}
}
TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
Options options;
options.env = mock_env_.get();
@ -949,6 +1062,41 @@ TEST_F(BlobDBTest, OutOfSpace) {
ASSERT_TRUE(s.IsNoSpace());
}
TEST_F(BlobDBTest, EvictOldestFileWhenCloseToSpaceLimit) {
// Use mock env to stop wall clock.
Options options;
BlobDBOptions bdb_options;
bdb_options.blob_dir_size = 270;
bdb_options.blob_file_size = 100;
bdb_options.disable_background_tasks = true;
bdb_options.is_fifo = true;
Open(bdb_options);
// Each stored blob has an overhead of 32 bytes currently.
// So a 100 byte blob should take up 132 bytes.
std::string value(100, 'v');
ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
auto blob_files = bdb_impl->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
// Adding another 100 byte blob would take the total size to 264 bytes
// (2*132), which is more than 90% of blob_dir_size. So, the oldest file
// should be evicted and put in obsolete files list.
ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
auto obsolete_files = bdb_impl->TEST_GetObsoleteFiles();
ASSERT_EQ(1, obsolete_files.size());
ASSERT_TRUE(obsolete_files[0]->Immutable());
ASSERT_EQ(blob_files[0]->BlobFileNumber(),
obsolete_files[0]->BlobFileNumber());
bdb_impl->TEST_DeleteObsoleteFiles();
obsolete_files = bdb_impl->TEST_GetObsoleteFiles();
ASSERT_TRUE(obsolete_files.empty());
}
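// A hedged sketch of the arithmetic this test exercises. Only
// kEvictOldestFileAtSize comes from blob_db_impl.h; the helper and the
// 32-byte overhead constant are illustrative assumptions.
#include <cstdint>

constexpr uint64_t kAssumedPerRecordOverhead = 32;
constexpr double kEvictOldestFileAtSize = 0.9;  // evict at 90% of blob_dir_size

inline bool ShouldEvictOldestFileSketch(uint64_t used_space,
                                        uint64_t value_size,
                                        uint64_t blob_dir_size) {
  uint64_t size_put = value_size + kAssumedPerRecordOverhead;
  return blob_dir_size > 0 &&
         used_space + size_put >
             static_cast<uint64_t>(kEvictOldestFileAtSize * blob_dir_size);
}
// With blob_dir_size = 270: the first 100-byte value occupies 132 bytes, and
// adding the second would reach 264 > 243 (90% of 270), so the oldest file is
// evicted, matching the assertions above.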
TEST_F(BlobDBTest, InlineSmallValues) {
constexpr uint64_t kMaxExpiration = 1000;
Random rnd(301);
@ -1018,6 +1166,95 @@ TEST_F(BlobDBTest, InlineSmallValues) {
ASSERT_EQ(last_ttl_seq, ttl_file->GetSequenceRange().second);
}
TEST_F(BlobDBTest, CompactionFilterNotSupported) {
class TestCompactionFilter : public CompactionFilter {
virtual const char *Name() const { return "TestCompactionFilter"; }
};
class TestCompactionFilterFactory : public CompactionFilterFactory {
virtual const char *Name() const { return "TestCompactionFilterFactory"; }
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context & /*context*/) {
return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
}
};
for (int i = 0; i < 2; i++) {
Options options;
if (i == 0) {
options.compaction_filter = new TestCompactionFilter();
} else {
options.compaction_filter_factory.reset(
new TestCompactionFilterFactory());
}
ASSERT_TRUE(TryOpen(BlobDBOptions(), options).IsNotSupported());
delete options.compaction_filter;
}
}
TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
constexpr size_t kNumKeys = 100;
constexpr size_t kNumPuts = 1000;
constexpr uint64_t kMaxExpiration = 1000;
constexpr uint64_t kCompactTime = 500;
constexpr uint64_t kMinBlobSize = 100;
Random rnd(301);
mock_env_->set_current_time(0);
BlobDBOptions bdb_options;
bdb_options.min_blob_size = kMinBlobSize;
bdb_options.disable_background_tasks = true;
Options options;
options.env = mock_env_.get();
Open(bdb_options, options);
std::map<std::string, std::string> data;
std::map<std::string, std::string> data_after_compact;
for (size_t i = 0; i < kNumPuts; i++) {
bool is_small_value = rnd.Next() % 2;
bool has_ttl = rnd.Next() % 2;
uint64_t expiration = rnd.Next() % kMaxExpiration;
int len = is_small_value ? 10 : 200;
std::string key = "key" + ToString(rnd.Next() % kNumKeys);
std::string value = test::RandomHumanReadableString(&rnd, len);
if (!has_ttl) {
if (is_small_value) {
std::string blob_entry;
BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
// Fake blob index with TTL. See what it will do.
ASSERT_GT(kMinBlobSize, blob_entry.size());
value = blob_entry;
}
ASSERT_OK(Put(key, value));
data_after_compact[key] = value;
} else {
ASSERT_OK(PutUntil(key, value, expiration));
if (expiration <= kCompactTime) {
data_after_compact.erase(key);
} else {
data_after_compact[key] = value;
}
}
data[key] = value;
}
VerifyDB(data);
mock_env_->set_current_time(kCompactTime);
// Take a snapshot before compaction. Make sure expired blob indexes are
// filtered regardless of the snapshot.
const Snapshot *snapshot = blob_db_->GetSnapshot();
// Issue manual compaction to trigger compaction filter.
ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(),
blob_db_->DefaultColumnFamily(), nullptr,
nullptr));
blob_db_->ReleaseSnapshot(snapshot);
// Verify expired blob indexes are filtered.
std::vector<KeyVersion> versions;
GetAllKeyVersions(blob_db_, "", "", &versions);
ASSERT_EQ(data_after_compact.size(), versions.size());
for (auto &version : versions) {
ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
}
VerifyDB(data_after_compact);
}
} // namespace blob_db
} // namespace rocksdb


@ -30,13 +30,14 @@ BlobFile::BlobFile()
: parent_(nullptr),
file_number_(0),
has_ttl_(false),
compression_(kNoCompression),
blob_count_(0),
gc_epoch_(-1),
file_size_(0),
deleted_count_(0),
deleted_size_(0),
closed_(false),
can_be_deleted_(false),
obsolete_(false),
gc_once_after_open_(false),
expiration_range_({0, 0}),
sequence_range_({kMaxSequenceNumber, 0}),
@ -49,13 +50,14 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
path_to_dir_(bdir),
file_number_(fn),
has_ttl_(false),
compression_(kNoCompression),
blob_count_(0),
gc_epoch_(-1),
file_size_(0),
deleted_count_(0),
deleted_size_(0),
closed_(false),
can_be_deleted_(false),
obsolete_(false),
gc_once_after_open_(false),
expiration_range_({0, 0}),
sequence_range_({kMaxSequenceNumber, 0}),
@ -64,7 +66,7 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
header_valid_(false) {}
BlobFile::~BlobFile() {
if (can_be_deleted_) {
if (obsolete_) {
std::string pn(PathName());
Status s = Env::Default()->DeleteFile(PathName());
if (!s.ok()) {
@ -98,8 +100,8 @@ std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
std::unique_ptr<SequentialFileReader> sfile_reader;
sfile_reader.reset(new SequentialFileReader(std::move(sfile)));
std::shared_ptr<Reader> log_reader =
std::make_shared<Reader>(db_options.info_log, std::move(sfile_reader));
std::shared_ptr<Reader> log_reader = std::make_shared<Reader>(
std::move(sfile_reader), db_options.env, db_options.statistics.get());
return log_reader;
}
@ -110,17 +112,21 @@ std::string BlobFile::DumpState() const {
"path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " gc_epoch: %" PRIu64
" file_size: %" PRIu64 " deleted_count: %" PRIu64
" deleted_size: %" PRIu64
" closed: %d can_be_deleted: %d expiration_range: (%" PRIu64
", %" PRIu64 ") sequence_range: (%" PRIu64 " %" PRIu64
"), writer: %d reader: %d",
" closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64
") sequence_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
path_to_dir_.c_str(), file_number_, blob_count_.load(),
gc_epoch_.load(), file_size_.load(), deleted_count_, deleted_size_,
closed_.load(), can_be_deleted_.load(), expiration_range_.first,
closed_.load(), obsolete_.load(), expiration_range_.first,
expiration_range_.second, sequence_range_.first,
sequence_range_.second, (!!log_writer_), (!!ra_file_reader_));
return str;
}
void BlobFile::MarkObsolete(SequenceNumber sequence) {
obsolete_sequence_ = sequence;
obsolete_.store(true);
}
bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
assert(last_fsync_ <= file_size_);
return (hard) ? file_size_ > last_fsync_


@ -41,6 +41,9 @@ class BlobFile {
// have TTL.
bool has_ttl_;
// Compression type of blobs in the file
CompressionType compression_;
// number of blobs in the file
std::atomic<uint64_t> blob_count_;
@ -63,8 +66,12 @@ class BlobFile {
std::atomic<bool> closed_;
// has a pass of garbage collection successfully finished on this file
// can_be_deleted_ still needs to do iterator/snapshot checks
std::atomic<bool> can_be_deleted_;
// obsolete_ still needs to do iterator/snapshot checks
std::atomic<bool> obsolete_;
// The last sequence number by the time the file was marked as obsolete.
// Data in this file is visible to a snapshot taken before the sequence.
SequenceNumber obsolete_sequence_;
// should this file been gc'd once to reconcile lost deletes/compactions
std::atomic<bool> gc_once_after_open_;
@ -91,6 +98,8 @@ class BlobFile {
bool header_valid_;
SequenceNumber garbage_collection_finish_sequence_;
public:
BlobFile();
@ -117,7 +126,19 @@ class BlobFile {
std::string DumpState() const;
// if the file has gone through GC and blobs have been relocated
bool Obsolete() const { return can_be_deleted_.load(); }
bool Obsolete() const {
assert(Immutable() || !obsolete_.load());
return obsolete_.load();
}
// Mark file as obsolete by garbage collection. The file is not visible to
// snapshots with sequence greater than or equal to the given sequence.
void MarkObsolete(SequenceNumber sequence);
SequenceNumber GetObsoleteSequence() const {
assert(Obsolete());
return obsolete_sequence_;
}
// if the file is not taking any more appends.
bool Immutable() const { return closed_.load(); }
@ -125,6 +146,8 @@ class BlobFile {
// we will assume this is atomic
bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
void Fsync();
uint64_t GetFileSize() const {
return file_size_.load(std::memory_order_acquire);
}
@ -153,9 +176,13 @@ class BlobFile {
void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
CompressionType compression() const { return compression_; }
void Fsync();
void SetCompression(CompressionType c) {
compression_ = c;
}
std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
private:
std::shared_ptr<Reader> OpenSequentialReader(
@ -183,8 +210,6 @@ class BlobFile {
void SetFileSize(uint64_t fs) { file_size_ = fs; }
void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
void SetCanBeDeleted() { can_be_deleted_ = true; }
};
} // namespace blob_db
} // namespace rocksdb


@ -111,6 +111,8 @@ struct BlobLogRecord {
std::string key_buf;
std::string value_buf;
uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
void EncodeHeaderTo(std::string* dst);
Status DecodeHeaderFrom(Slice src);


@ -9,22 +9,30 @@
#include <algorithm>
#include "monitoring/statistics.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
namespace rocksdb {
namespace blob_db {
Reader::Reader(std::shared_ptr<Logger> info_log,
unique_ptr<SequentialFileReader>&& _file)
: info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {}
Reader::Reader(unique_ptr<SequentialFileReader>&& file_reader, Env* env,
Statistics* statistics)
: file_(std::move(file_reader)),
env_(env),
statistics_(statistics),
buffer_(),
next_byte_(0) {}
Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
buf->reserve(size);
Status s = file_->Read(size, slice, &(*buf)[0]);
next_byte_ += size;
if (!s.ok()) {
return s;
}
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
if (slice->size() != size) {
return Status::Corruption("EOF reached while reading record");
}


@ -10,7 +10,9 @@
#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "utilities/blob_db/blob_log_format.h"
@ -37,17 +39,8 @@ class Reader {
// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-nullptr, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(std::shared_ptr<Logger> info_log,
std::unique_ptr<SequentialFileReader>&& file);
Reader(std::unique_ptr<SequentialFileReader>&& file_reader, Env* env,
Statistics* statistics);
~Reader() = default;
@ -68,17 +61,14 @@ class Reader {
Status ReadSlice(uint64_t size, Slice* slice, std::string* buf);
SequentialFileReader* file() { return file_.get(); }
void ResetNextByte() { next_byte_ = 0; }
uint64_t GetNextByte() const { return next_byte_; }
const SequentialFileReader* file_reader() const { return file_.get(); }
private:
std::shared_ptr<Logger> info_log_;
const std::unique_ptr<SequentialFileReader> file_;
Env* env_;
Statistics* statistics_;
std::string backing_store_;
Slice buffer_;


@ -8,17 +8,23 @@
#include <cstdint>
#include <string>
#include "monitoring/statistics.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "utilities/blob_db/blob_log_format.h"
namespace rocksdb {
namespace blob_db {
Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
uint64_t bpsync, bool use_fs, uint64_t boffset)
Writer::Writer(unique_ptr<WritableFileWriter>&& dest, Env* env,
Statistics* statistics, uint64_t log_number, uint64_t bpsync,
bool use_fs, uint64_t boffset)
: dest_(std::move(dest)),
env_(env),
statistics_(statistics),
log_number_(log_number),
block_offset_(boffset),
bytes_per_sync_(bpsync),
@ -26,7 +32,11 @@ Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
use_fsync_(use_fs),
last_elem_type_(kEtNone) {}
void Writer::Sync() { dest_->Sync(use_fsync_); }
void Writer::Sync() {
StopWatch sync_sw(env_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
dest_->Sync(use_fsync_);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
}
Status Writer::WriteHeader(BlobLogHeader& header) {
assert(block_offset_ == 0);
@ -40,6 +50,8 @@ Status Writer::WriteHeader(BlobLogHeader& header) {
s = dest_->Flush();
}
last_elem_type_ = kEtFileHdr;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogHeader::kSize);
return s;
}
@ -58,6 +70,8 @@ Status Writer::AppendFooter(BlobLogFooter& footer) {
}
last_elem_type_ = kEtFileFooter;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogFooter::kSize);
return s;
}
@ -98,6 +112,7 @@ void Writer::ConstructBlobHeader(std::string* buf, const Slice& key,
Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
const Slice& key, const Slice& val,
uint64_t* key_offset, uint64_t* blob_offset) {
StopWatch write_sw(env_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
Status s = dest_->Append(Slice(headerbuf));
if (s.ok()) {
s = dest_->Append(key);
@ -113,6 +128,8 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
*blob_offset = *key_offset + key.size();
block_offset_ = *blob_offset + val.size();
last_elem_type_ = kEtRecord;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogRecord::kHeaderSize + key.size() + val.size());
return s;
}


@ -10,7 +10,9 @@
#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "utilities/blob_db/blob_log_format.h"
@ -34,9 +36,9 @@ class Writer {
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
uint64_t log_number, uint64_t bpsync, bool use_fsync,
uint64_t boffset = 0);
Writer(std::unique_ptr<WritableFileWriter>&& dest, Env* env,
Statistics* statistics, uint64_t log_number, uint64_t bpsync,
bool use_fsync, uint64_t boffset = 0);
~Writer() = default;
@ -75,6 +77,8 @@ class Writer {
private:
std::unique_ptr<WritableFileWriter> dest_;
Env* env_;
Statistics* statistics_;
uint64_t log_number_;
uint64_t block_offset_; // Current offset in block
uint64_t bytes_per_sync_;


@ -189,12 +189,11 @@ Status TransactionDB::Open(
std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
std::vector<size_t> compaction_enabled_cf_indices;
DBOptions db_options_2pc = db_options;
if (txn_db_options.write_policy == WRITE_PREPARED) {
db_options_2pc.seq_per_batch = true;
}
PrepareWrap(&db_options_2pc, &column_families_copy,
&compaction_enabled_cf_indices);
s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db);
const bool use_seq_per_batch = txn_db_options.write_policy == WRITE_PREPARED;
s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db,
use_seq_per_batch);
if (s.ok()) {
s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
dbptr);


@ -20,6 +20,7 @@
#include <string>
#include <vector>
#include "monitoring/perf_context_imp.h"
#include "rocksdb/slice.h"
#include "rocksdb/utilities/transaction_db_mutex.h"
#include "util/cast_util.h"
@ -347,6 +348,8 @@ Status TransactionLockMgr::AcquireWithTimeout(
&expire_time_hint, &wait_ids);
if (!result.ok() && timeout != 0) {
PERF_TIMER_GUARD(key_lock_wait_time);
PERF_COUNTER_ADD(key_lock_wait_count, 1);
// If we weren't able to acquire the lock, we will keep retrying as long
// as the timeout allows.
bool timed_out = false;


@ -19,6 +19,7 @@
#include "db/db_impl.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "table/mock_table.h"
@ -187,15 +188,18 @@ TEST_P(TransactionTest, WaitingTxn) {
ASSERT_EQ(cf_id, 0);
});
get_perf_context()->Reset();
// lock key in default cf
s = txn1->GetForUpdate(read_options, "foo", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "bar");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
// lock key in cfa
s = txn1->GetForUpdate(read_options, cfa, "foo", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "bar");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);
auto lock_data = db->GetLockStatusData();
// Locked keys exist in both column families.
@ -231,6 +235,8 @@ TEST_P(TransactionTest, WaitingTxn) {
s = txn2->GetForUpdate(read_options, "foo", &value);
ASSERT_TRUE(s.IsTimedOut());
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1);
ASSERT_GE(get_perf_context()->key_lock_wait_time, 0);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
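// A hedged, self-contained usage sketch of the two counters asserted above.
// The DB path, timeout, and key are illustrative; only the perf context
// fields and the transaction API calls come from the diff and public headers.
#include <cassert>
#include <string>

#include "rocksdb/perf_context.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

static void LockWaitCountersSketch() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::TransactionDB* db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, rocksdb::TransactionDBOptions(), "/tmp/lock_wait_sketch", &db);
  assert(s.ok());

  rocksdb::WriteOptions write_options;
  rocksdb::ReadOptions read_options;
  rocksdb::TransactionOptions txn_options;
  txn_options.lock_timeout = 10;  // milliseconds; keep the blocked read short

  rocksdb::Transaction* txn1 = db->BeginTransaction(write_options);
  rocksdb::Transaction* txn2 = db->BeginTransaction(write_options, txn_options);

  std::string value;
  txn1->GetForUpdate(read_options, "foo", &value);  // txn1 now holds the lock

  rocksdb::get_perf_context()->Reset();
  s = txn2->GetForUpdate(read_options, "foo", &value);  // blocks, then times out
  assert(s.IsTimedOut());
  assert(rocksdb::get_perf_context()->key_lock_wait_count >= 1);
  // key_lock_wait_time needs a timing-enabled PerfLevel to be non-zero.

  delete txn2;
  delete txn1;
  delete db;
}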
@ -4856,12 +4862,12 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
auto seq = db_impl->GetLatestSequenceNumber();
exp_seq = seq;
txn_t0(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4874,16 +4880,16 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
// Doing it twice might detect some bugs
txn_t0(1);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
txn_t1(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4895,12 +4901,12 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
}
txn_t3(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4912,16 +4918,16 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
}
txn_t0(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
txn_t2(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {


@ -54,7 +54,7 @@ class TransactionTest : public ::testing::TestWithParam<
options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
env = new FaultInjectionTestEnv(Env::Default());
options.env = env;
options.concurrent_prepare = std::get<1>(GetParam());
options.two_write_queues = std::get<1>(GetParam());
dbname = test::TmpDir() + "/transaction_testdb";
DestroyDB(dbname, options);
@ -113,11 +113,10 @@ class TransactionTest : public ::testing::TestWithParam<
std::vector<ColumnFamilyHandle*> handles;
DB* root_db;
Options options_copy(options);
if (txn_db_options.write_policy == WRITE_PREPARED) {
options_copy.seq_per_batch = true;
}
Status s =
DB::Open(options_copy, dbname, column_families, &handles, &root_db);
const bool use_seq_per_batch =
txn_db_options.write_policy == WRITE_PREPARED;
Status s = DBImpl::Open(options_copy, dbname, column_families, &handles,
&root_db, use_seq_per_batch);
if (s.ok()) {
assert(handles.size() == 1);
s = TransactionDB::WrapStackableDB(
@@ -144,7 +143,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}
@@ -169,7 +168,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}
@@ -197,7 +196,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}
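Restating the exp_seq bookkeeping performed by the txn_t helpers above: with seq_per_batch each committed write batch consumes one sequence number, and when two_write_queues is set the commit itself consumes one more. A minimal sketch of that arithmetic; the helper and its parameters are illustrative and not part of the test harness:

#include <cstdint>

// Illustrative only: expected sequence-number advance under seq_per_batch.
uint64_t ExpectedSeqAdvance(uint64_t num_batches, uint64_t num_commits,
                            bool two_write_queues) {
  // One sequence number is consumed per write batch...
  uint64_t advance = num_batches;
  // ...plus one per commit when commits go through the second write queue.
  if (two_write_queues) {
    advance += num_commits;
  }
  return advance;
}

For a single-batch transaction this reduces to the two exp_seq++ increments in the branch above.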

View File

@@ -625,7 +625,7 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n);
}
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
auto seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
auto seq = db_impl->TEST_GetLastVisibleSequence();
exp_seq = seq;
// This is increased before writing the batch for commit
commit_writes = 0;
@@ -693,17 +693,17 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
for (auto& t : threads) {
t.join();
}
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// In this case none of the above scheduling tricks to deterministically
// form merged batches works because the writes go to separate queues.
// This would result in different write groups in each run of the test. We
// still keep the test since, although non-deterministic and hard to debug,
// it is still useful to have.
// TODO(myabandeh): Add a deterministic unit test for concurrent_prepare
// TODO(myabandeh): Add a deterministic unit test for two_write_queues
}
// Check if memtable inserts advanced seq number as expected
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
@@ -1258,7 +1258,7 @@ TEST_P(WritePreparedTransactionTest, DisableGCDuringRecoveryTest) {
VerifyKeys({{"foo", v}});
seq++; // one for the key/value
KeyVersion kv = {"foo", v, seq, kTypeValue};
if (options.concurrent_prepare) {
if (options.two_write_queues) {
seq++; // one for the commit
}
versions.emplace_back(kv);
@@ -1306,10 +1306,10 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) {
auto add_key = [&](std::function<Status()> func) {
ASSERT_OK(func());
expected_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
snapshots.push_back(db->GetSnapshot());
};
@@ -1397,7 +1397,7 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
ASSERT_OK(txn1->Commit());
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
delete txn1;
// Take a snapshot to avoid keys getting evicted before compaction.
const Snapshot* snapshot1 = db->GetSnapshot();
@@ -1410,24 +1410,24 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
// txn2 commit after snapshot2 and it is not visible.
const Snapshot* snapshot2 = db->GetSnapshot();
ASSERT_OK(txn2->Commit());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
delete txn2;
// Take a snapshot to avoid keys getting evicted before compaction.
const Snapshot* snapshot3 = db->GetSnapshot();
ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2"));
expected_seq++; // 1 for write
SequenceNumber seq1 = expected_seq;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2"));
expected_seq++; // 1 for write
SequenceNumber seq2 = expected_seq;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
ASSERT_OK(db->Flush(FlushOptions()));
db->ReleaseSnapshot(snapshot1);
db->ReleaseSnapshot(snapshot3);

View File

@@ -90,7 +90,7 @@ Status WritePreparedTxn::CommitWithoutPrepareInternal() {
}
SequenceNumber WritePreparedTxn::GetACommitSeqNumber(SequenceNumber prep_seq) {
if (db_impl_->immutable_db_options().concurrent_prepare) {
if (db_impl_->immutable_db_options().two_write_queues) {
return db_impl_->IncAndFetchSequenceNumber();
} else {
return prep_seq;

View File

@@ -46,7 +46,7 @@ class WritePreparedTxn : public PessimisticTransaction {
virtual ~WritePreparedTxn() {}
// To make WAL commit markers visible, the snapshot will be based on the last
// seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq in the
// seq in the WAL, LastAllocatedSequence, as opposed to the last seq in the
// memtable.
using Transaction::Get;
virtual Status Get(const ReadOptions& options,
@@ -54,7 +54,7 @@ class WritePreparedTxn : public PessimisticTransaction {
PinnableSlice* value) override;
// To make WAL commit markers visible, the snapshot will be based on the last
// seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq in the
// seq in the WAL, LastAllocatedSequence, as opposed to the last seq in the
// memtable.
using Transaction::GetIterator;
virtual Iterator* GetIterator(const ReadOptions& options) override;
@@ -76,7 +76,7 @@ class WritePreparedTxn : public PessimisticTransaction {
// commit entails writing only a commit marker in the WAL. The sequence number
// of the commit marker is then the commit timestamp of the transaction. To
// make the commit timestamp visible to readers, their snapshot is based on
// the last seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq
// the last seq in the WAL, LastAllocatedSequence, as opposed to the last seq
// in the memtable.
Status CommitInternal() override;
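The comment block above captures the core of the WritePrepared read path: data reaches the memtable at prepare time with a prepare sequence, and it only becomes readable once a commit marker, whose sequence number serves as the commit timestamp, is covered by the reader's snapshot; hence the snapshot tracks the last sequence in the WAL rather than the last sequence in the memtable. A toy illustration of that visibility rule, not the actual WritePreparedTxnDB code:

#include <cstdint>

// Toy model only (not WritePreparedTxnDB::IsInSnapshot): a version written at
// prepare time becomes visible once its commit marker's sequence number
// (its commit timestamp) is covered by the reader's snapshot.
struct ToyVersion {
  uint64_t prepare_seq;  // sequence the data was written with at prepare time
  uint64_t commit_seq;   // 0 while the transaction is still uncommitted
};

bool VisibleToSnapshot(const ToyVersion& v, uint64_t snapshot_seq) {
  return v.commit_seq != 0 && v.commit_seq <= snapshot_seq;
}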