Retrieve ZLib from the archive (#9782)

Do not rely on ADL when invoking std::max_element (#9608)
Summary: Certain STLs use raw pointers and ADL does not work for them. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9608 Reviewed By: ajkr Differential Revision: D34583012 Pulled By: riversand963 fbshipit-source-id: 7de6bbc8a080c3e7243ce0d758fe83f1663168aa
2022-04-04 18:09:07 -07:00 · 2022-03-29 12:54:19 -07:00 · 2022-03-29 12:18:28 -07:00 · 2022-03-29 12:17:49 -07:00 · 2022-03-29 12:14:13 -07:00 · 2022-03-29 12:13:30 -07:00
60 changed files with 1589 additions and 428 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -321,6 +321,7 @@ if(NOT MSVC)
  set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
 endif()

+if (NOT PORTABLE OR FORCE_SSE42)
  CHECK_CXX_SOURCE_COMPILES("
 #include <cstdint>
 #include <nmmintrin.h>
@ -339,6 +340,7 @@ if(HAVE_SSE42)
  elseif(FORCE_SSE42)
    message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
  endif()
+endif()

 # Check if -latomic is required or not
 if (NOT MSVC)
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,4 +1,35 @@
 # Rocksdb Change Log
+## 6.29.5 (03/29/2022)
+### Bug Fixes
+* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
+* Fixed a race condition when mmaping a WritableFile on POSIX.
+* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
+* Fixed a heap use-after-free race with DropColumnFamily.
+* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722).
+
+## 6.29.4 (03/22/2022)
+### Bug Fixes
+* Fixed a bug caused by a race among flush, incoming writes, and taking snapshots. Queries to snapshots created under this race condition can return incorrect results, e.g. resurfacing deleted data.
+* Fixed a bug that DisableManualCompaction may assert when disabling an unscheduled manual compaction.
+* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() is performed.
+* Fixed a race condition when disabling and re-enabling manual compaction.
+* Fixed a race condition when canceling a manual compaction with `DisableManualCompaction`. Also, DB close can cancel the manual compaction thread.
+* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
+* Fixed a read-after-free bug in `DB::GetMergeOperands()`.
+* Fixed NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, NUM_DATA_BLOCKS_READ_PER_LEVEL, and NUM_SST_READ_PER_LEVEL stats to be reported once per MultiGet batch per level.
+
+## 6.29.3 (02/17/2022)
+### Bug Fixes
+* Fix a data loss bug for 2PC write-committed transaction caused by concurrent transaction commit and memtable switch (#9571).
+
+## 6.29.2 (02/15/2022)
+### Performance Improvements
+* DisableManualCompaction() doesn't have to wait for a scheduled manual compaction to be executed in the thread pool in order to cancel the job.
+
+## 6.29.1 (01/31/2022)
+### Bug Fixes
+* Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
+
 ## 6.29.0 (01/21/2022)
 Note: The next release will be major release 7.0. See https://github.com/facebook/rocksdb/issues/9390 for more info.
 ### Public API change
--- a/39
+++ b/39
@ -324,8 +324,8 @@ ifneq ($(MACHINE), arm64)
 # linking with jemalloc (as it won't be arm64-compatible) and remove some other options
 # set during platform detection
 DISABLE_JEMALLOC=1
-PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CFLAGS))
-PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CXXFLAGS))
+PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
+PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
 endif
 endif
 endif
@ -2088,7 +2088,7 @@ SHA256_CMD = sha256sum

 ZLIB_VER ?= 1.2.11
 ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1
-ZLIB_DOWNLOAD_BASE ?= http://zlib.net
+ZLIB_DOWNLOAD_BASE ?= https://zlib.net/fossils
 BZIP2_VER ?= 1.0.8
 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
 BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2
@ -2106,7 +2106,9 @@ CURL_SSL_OPTS ?= --tlsv1
 ifeq ($(PLATFORM), OS_MACOSX)
 ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB)))
 ifeq ($(MACHINE),arm64)
-	ROCKSDBJNILIB = librocksdbjni-osx-aarch64.jnilib
+	ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib
+else ifeq ($(MACHINE),x86_64)
+	ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib
 else
 	ROCKSDBJNILIB = librocksdbjni-osx.jnilib
 endif
@ -2237,15 +2239,20 @@ endif
 	$(MAKE) rocksdbjavastatic_deps
 	$(MAKE) rocksdbjavastatic_libobjects
 	$(MAKE) rocksdbjavastatic_javalib
-	$(MAKE) rocksdbjavastatic_jar
+	$(MAKE) rocksdbjava_jar

 rocksdbjavastaticosx: rocksdbjavastaticosx_archs
-	mv java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx.jnilib
-	mv java/target/librocksdbjni-osx-arm64.jnilib java/target/librocksdbjni-osx-aarch64.jnilib
+	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1

 rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs
-	lipo -create -output ./java/target/$(ROCKSDBJNILIB) java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx-arm64.jnilib
-	$(MAKE) rocksdbjavastatic_jar
+	cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib
+	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib
+	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1

 rocksdbjavastaticosx_archs: 
 	$(MAKE) rocksdbjavastaticosx_arch_x86_64
@ -2279,28 +2286,32 @@ rocksdbjavastatic_javalib:
 		strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \
 	fi

-rocksdbjavastatic_jar:
+rocksdbjava_jar:
 	cd java; $(JAR_CMD)  -cf target/$(ROCKSDB_JAR) HISTORY*.md
 	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
 	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
-	cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) *
-	cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
 	openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+
+rocksdbjava_javadocs_jar:
+	cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) *
 	openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1
+
+rocksdbjava_sources_jar:
+	cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
 	openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1

 rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS)

 rocksdbjavastatic_libobjects: $(LIB_OBJECTS)

-rocksdbjavastaticrelease: rocksdbjavastaticosx
+rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar
 	cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl
 	cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
 	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
 	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
 	openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1

-rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl
+rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar
 	cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
 	cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
 	cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@ -58,6 +58,7 @@ Status ArenaWrappedDBIter::Refresh() {
  uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
+  while (true) {
    if (sv_number_ != cur_sv_number) {
      Env* env = db_iter_->env();
      db_iter_->~DBIter();
@ -79,9 +80,33 @@ Status ArenaWrappedDBIter::Refresh() {
          read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
          latest_seq, /* allow_unprepared_value */ true);
      SetIterUnderDBIter(internal_iter);
+      break;
    } else {
-    db_iter_->set_sequence(db_impl_->GetLatestSequenceNumber());
+      SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+      // Refresh range-tombstones in MemTable
+      if (!read_options_.ignore_range_deletions) {
+        SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+        ReadRangeDelAggregator* range_del_agg =
+            db_iter_->GetRangeDelAggregator();
+        std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+        range_del_iter.reset(
+            sv->mem->NewRangeTombstoneIterator(read_options_, latest_seq));
+        range_del_agg->AddTombstones(std::move(range_del_iter));
+        cfd_->ReturnThreadLocalSuperVersion(sv);
+      }
+      // Refresh latest sequence number
+      db_iter_->set_sequence(latest_seq);
      db_iter_->set_valid(false);
+      // Check again if the latest super version number is changed
+      uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+      if (latest_sv_number != cur_sv_number) {
+        // If the super version number is changed after refreshing,
+        // fallback to Re-Init the InternalIterator
+        cur_sv_number = latest_sv_number;
+        continue;
+      }
+      break;
+    }
  }
  return Status::OK();
 }
--- a/db/c_test.c
+++ b/db/c_test.c
@ -7,12 +7,13 @@

 #ifndef ROCKSDB_LITE  // Lite does not support C API

-#include "rocksdb/c.h"
-
+#include <assert.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
+
+#include "rocksdb/c.h"
 #ifndef OS_WIN
 #include <unistd.h>
 #endif
@ -89,10 +90,8 @@ static void CheckEqual(const char* expected, const char* v, size_t n) {
    // ok
    return;
  } else {
-    fprintf(stderr, "%s: expected '%s', got '%s'\n",
-            phase,
-            (expected ? expected : "(null)"),
-            (v ? v : "(null"));
+    fprintf(stderr, "%s: expected '%s', got '%s'\n", phase,
+            (expected ? expected : "(null)"), (v ? v : "(null)"));
    abort();
  }
 }
@ -1019,7 +1018,36 @@ int main(int argc, char** argv) {
    CheckGet(db, roptions, "foo", NULL);
    rocksdb_release_snapshot(db, snap);
  }
-
+  StartPhase("snapshot_with_memtable_inplace_update");
+  {
+    rocksdb_close(db);
+    const rocksdb_snapshot_t* snap = NULL;
+    const char* s_key = "foo_snap";
+    const char* value1 = "hello_s1";
+    const char* value2 = "hello_s2";
+    rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+    rocksdb_options_set_inplace_update_support(options, 1);
+    rocksdb_options_set_error_if_exists(options, 0);
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_put(db, woptions, s_key, 8, value1, 8, &err);
+    snap = rocksdb_create_snapshot(db);
+    assert(snap != NULL);
+    rocksdb_put(db, woptions, s_key, 8, value2, 8, &err);
+    CheckNoError(err);
+    rocksdb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", NULL);
+    // snapshot syntax is invalid, because of inplace update supported is set
+    CheckGet(db, roptions, s_key, value2);
+    // restore the data and options
+    rocksdb_delete(db, woptions, s_key, 8, &err);
+    CheckGet(db, roptions, s_key, NULL);
+    rocksdb_release_snapshot(db, snap);
+    rocksdb_readoptions_set_snapshot(roptions, NULL);
+    rocksdb_options_set_inplace_update_support(options, 0);
+    rocksdb_options_set_allow_concurrent_memtable_write(options, 1);
+    rocksdb_options_set_error_if_exists(options, 1);
+  }
  StartPhase("repair");
  {
    // If we do not compact here, then the lazy deletion of
--- a/db/column_family.cc
+++ b/db/column_family.cc
@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
  return new_cfd;
 }

-// REQUIRES: DB mutex held
-void ColumnFamilySet::FreeDeadColumnFamilies() {
-  autovector<ColumnFamilyData*> to_delete;
-  for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
-    if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
-      to_delete.push_back(cfd);
-    }
-  }
-  for (auto cfd : to_delete) {
-    // this is very rare, so it's not a problem that we do it under a mutex
-    delete cfd;
-  }
-}
-
 // under a DB mutex AND from a write thread
 void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
  auto cfd_iter = column_family_data_.find(cfd->GetID());
--- a/db/column_family.h
+++ b/db/column_family.h
@ -519,9 +519,10 @@ class ColumnFamilyData {
  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }

+  static const uint32_t kDummyColumnFamilyDataId;
+
 private:
  friend class ColumnFamilySet;
-  static const uint32_t kDummyColumnFamilyDataId;
  ColumnFamilyData(uint32_t id, const std::string& name,
                   Version* dummy_versions, Cache* table_cache,
                   WriteBufferManager* write_buffer_manager,
@ -627,10 +628,8 @@ class ColumnFamilyData {
 // held and it needs to be executed from the write thread. SetDropped() also
 // guarantees that it will be called only from single-threaded LogAndApply(),
 // but this condition is not that important.
-// * Iteration -- hold DB mutex, but you can release it in the body of
-// iteration. If you release DB mutex in body, reference the column
-// family before the mutex and unreference after you unlock, since the column
-// family might get dropped when the DB mutex is released
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+// body of the iteration, wrap in a RefedColumnFamilySet.
 // * GetDefault() -- thread safe
 // * GetColumnFamily() -- either inside of DB mutex or from a write thread
 // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
@ -642,17 +641,12 @@ class ColumnFamilySet {
   public:
    explicit iterator(ColumnFamilyData* cfd)
        : current_(cfd) {}
+    // NOTE: minimum operators for for-loop iteration
    iterator& operator++() {
-      // dropped column families might still be included in this iteration
-      // (we're only removing them when client drops the last reference to the
-      // column family).
-      // dummy is never dead, so this will never be infinite
-      do {
      current_ = current_->next_;
-      } while (current_->refs_.load(std::memory_order_relaxed) == 0);
      return *this;
    }
-    bool operator!=(const iterator& other) {
+    bool operator!=(const iterator& other) const {
      return this->current_ != other.current_;
    }
    ColumnFamilyData* operator*() { return current_; }
@ -691,10 +685,6 @@ class ColumnFamilySet {
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

-  // REQUIRES: DB mutex held
-  // Don't call while iterating over ColumnFamilySet
-  void FreeDeadColumnFamilies();
-
  Cache* get_table_cache() { return table_cache_; }

  WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
@ -737,6 +727,55 @@ class ColumnFamilySet {
  std::string db_session_id_;
 };

+// Wraps a ColumnFamilySet so the DB mutex can be released inside the body of
+// a range-based for loop: the iterator calls Ref() on the current cfd and
+// UnrefAndTryDelete() when it advances or is destroyed, to prevent a
+// concurrent CF drop from destroying it. The dummy cfd is never refed.
+class RefedColumnFamilySet {
+ public:
+  explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
+
+  class iterator {
+   public:
+    explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
+      MaybeRef(*wrapped_);  // hold a ref on the cfd this iterator starts at
+    }
+    ~iterator() { MaybeUnref(*wrapped_); }  // release the ref on the current cfd
+    inline void MaybeRef(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {  // skip the dummy cfd
+        cfd->Ref();
+      }
+    }
+    inline void MaybeUnref(ColumnFamilyData* cfd) {
+      if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {  // skip the dummy cfd
+        cfd->UnrefAndTryDelete();
+      }
+    }
+    // NOTE: minimum operators for for-loop iteration. NOTE(review): no copy operations are declared here, so a copied iterator would Unref without a matching Ref -- confirm iterators are never copied.
+    inline iterator& operator++() {
+      ColumnFamilyData* old = *wrapped_;
+      ++wrapped_;
+      // Unref (and possibly free) the old cfd only after its next_ was read
+      MaybeUnref(old);
+      MaybeRef(*wrapped_);
+      return *this;
+    }
+    inline bool operator!=(const iterator& other) const {
+      return this->wrapped_ != other.wrapped_;
+    }
+    inline ColumnFamilyData* operator*() { return *wrapped_; }
+
+   private:
+    ColumnFamilySet::iterator wrapped_;  // underlying non-refing iterator
+  };
+
+  iterator begin() { return iterator(wrapped_->begin()); }
+  iterator end() { return iterator(wrapped_->end()); }
+
+ private:
+  ColumnFamilySet* wrapped_;  // the set being iterated; stored as a raw pointer
+};
+
 // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
 // memtables of different column families (specified by ID in the write batch)
 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@ -1228,7 +1228,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  }
 #endif  // !ROCKSDB_LITE

-  uint64_t prev_cpu_micros = db_options_.clock->CPUNanos() / 1000;
+  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();

  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();

@ -1572,7 +1572,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  }

  sub_compact->compaction_job_stats.cpu_micros =
-      db_options_.clock->CPUNanos() / 1000 - prev_cpu_micros;
+      db_options_.clock->CPUMicros() - prev_cpu_micros;

  if (measure_io_stats_) {
    sub_compact->compaction_job_stats.file_write_nanos +=
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@ -1938,8 +1938,9 @@ TEST_F(DBBasicTest, MultiGetStats) {
  int total_keys = 2000;
  std::vector<std::string> keys_str(total_keys);
  std::vector<Slice> keys(total_keys);
-  std::vector<PinnableSlice> values(total_keys);
-  std::vector<Status> s(total_keys);
+  static size_t kMultiGetBatchSize = 100;
+  std::vector<PinnableSlice> values(kMultiGetBatchSize);
+  std::vector<Status> s(kMultiGetBatchSize);
  ReadOptions read_opts;

  Random rnd(309);
@ -1976,15 +1977,16 @@ TEST_F(DBBasicTest, MultiGetStats) {
    }
  }
  ASSERT_OK(Flush(1));
+  MoveFilesToLevel(1, 1);
  Close();

  ReopenWithColumnFamilies({"default", "pikachu"}, options);
  ASSERT_OK(options.statistics->Reset());

-  db_->MultiGet(read_opts, handles_[1], total_keys, keys.data(), values.data(),
-                s.data(), false);
+  db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
+                values.data(), s.data(), false);

-  ASSERT_EQ(values.size(), total_keys);
+  ASSERT_EQ(values.size(), kMultiGetBatchSize);
  HistogramData hist_data_blocks;
  HistogramData hist_index_and_filter_blocks;
  HistogramData hist_sst;
@ -1996,16 +1998,16 @@ TEST_F(DBBasicTest, MultiGetStats) {
  options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);

  // Maximum number of blocks read from a file system in a level.
-  ASSERT_GT(hist_data_blocks.max, 0);
+  ASSERT_EQ(hist_data_blocks.max, 32);
  ASSERT_GT(hist_index_and_filter_blocks.max, 0);
  // Maximum number of sst files read from file system in a level.
-  ASSERT_GT(hist_sst.max, 0);
+  ASSERT_EQ(hist_sst.max, 2);

  // Minimun number of blocks read in a level.
-  ASSERT_EQ(hist_data_blocks.min, 3);
+  ASSERT_EQ(hist_data_blocks.min, 4);
  ASSERT_GT(hist_index_and_filter_blocks.min, 0);
  // Minimun number of sst files read in a level.
-  ASSERT_GT(hist_sst.max, 0);
+  ASSERT_EQ(hist_sst.min, 1);
 }

 // Test class for batched MultiGet with prefix extractor
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@ -1502,6 +1502,63 @@ TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
  ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
 }

+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {  // MultiGet and Get must agree on DeleteRange results with memtable whole-key filtering on
+  Options options = CurrentOptions();
+  options.memtable_prefix_bloom_size_ratio = 0.015;  // non-zero ratio, as required for the memtable Bloom filter
+  options.memtable_whole_key_filtering = true;
+  Reopen(options);
+  std::string key1("AA");
+  std::string key2("BB");
+  std::string key3("CC");
+  std::string key4("DD");
+  std::string key_not("EE");  // never written
+  std::string value1("Value1");
+  std::string value2("Value2");
+  std::string value3("Value3");
+  std::string value4("Value4");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key2, value2, WriteOptions()));
+  ASSERT_OK(Flush());  // key1 and key2 are flushed; later writes come after this flush
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+  const Snapshot* snapshot = db_->GetSnapshot();  // taken before key4 and the DeleteRange below
+  ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+  // Delete key2 and key3 ("BB" and "CC" fall between "BA" and "CZ")
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+  // Read without snapshot: the deleted keys must not be returned
+  auto results = MultiGet({key_not, key1, key2, key3, key4});
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], "NOT_FOUND");
+  ASSERT_EQ(results[3], "NOT_FOUND");
+  ASSERT_EQ(results[4], value4);
+
+  // Also check that single-key Get returns the same answers
+  ASSERT_EQ(Get(key1), value1);
+  ASSERT_EQ(Get(key2), "NOT_FOUND");
+  ASSERT_EQ(Get(key3), "NOT_FOUND");
+  ASSERT_EQ(Get(key4), value4);
+
+  // Read with snapshot: key2/key3 are still expected, key4 is not
+  results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+  ASSERT_EQ(results[0], "NOT_FOUND");
+  ASSERT_EQ(results[1], value1);
+  ASSERT_EQ(results[2], value2);
+  ASSERT_EQ(results[3], value3);
+  ASSERT_EQ(results[4], "NOT_FOUND");
+
+  // Also check that single-key Get returns the same answers at the snapshot
+  ASSERT_EQ(Get(key1, snapshot), value1);
+  ASSERT_EQ(Get(key2, snapshot), value2);
+  ASSERT_EQ(Get(key3, snapshot), value3);
+  ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
  constexpr size_t kPrefixSize = 8;
  const std::string kKey = "key";
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@ -6881,6 +6881,319 @@ TEST_F(DBCompactionTest, FIFOWarm) {
  Destroy(options);
 }

+TEST_F(DBCompactionTest, DisableMultiManualCompaction) {  // two concurrent manual compactions must both return Incomplete after DisableManualCompaction()
+  const int kNumL0Files = 10;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Generate 2 levels of files to make sure the manual compaction is not skipped
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {  // flush on every odd i
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), "value"));
+    if (i % 2) {  // flush on every odd i
+      ASSERT_OK(Flush());
+    }
+  }
+  MoveFilesToLevel(1);
+
+  // Block the compaction queue with a sleeping task at LOW priority
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  port::Thread compact_thread1([&]() {  // non-exclusive manual compaction over Key(0)..Key(3)
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(0);
+    std::string end_str = Key(3);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  port::Thread compact_thread2([&]() {  // non-exclusive manual compaction over Key(4)..Key(7)
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = false;
+    std::string begin_str = Key(4);
+    std::string end_str = Key(7);
+    Slice b = begin_str;
+    Slice e = end_str;
+    auto s = db_->CompactRange(cro, &b, &e);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  // Disabling manual compaction should cancel both manual compactions, and
+  // both CompactRange() calls should return Incomplete.
+  db_->DisableManualCompaction();
+
+  compact_thread1.join();
+  compact_thread2.join();
+
+  sleeping_task_low.WakeUp();  // unblock the queue only after both threads have returned
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {  // cancel a manual compaction whose background job has just started
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Generate files, but stay below the auto compaction trigger
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  // Make sure the manual compaction background job has started but has not yet
+  // set its status to in_progress, then cancel the manual compaction; this
+  // must not result in a segfault.
+  SyncPoint::GetInstance()->LoadDependency(  // presumably each pair makes the second point wait for the first -- confirm against SyncPoint
+      {{"DBImpl::BGWorkCompaction",
+        "DBCompactionTest::DisableJustStartedManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);  // null bounds: no key range restriction is passed
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableJustStartedManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {  // cancel a manual compaction that is already marked in progress
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(  // presumably each pair makes the second point wait for the first -- confirm against SyncPoint
+      {{"DBImpl::BackgroundCompaction:InProgress",
+        "DBCompactionTest::DisableInProgressManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "CompactionJob::Run():Start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Generate files, but stay below the auto compaction trigger
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());  // the canceled compaction must report Incomplete
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableInProgressManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {  // cancel a scheduled manual compaction while the compaction queue is blocked
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue with a sleeping task at LOW priority
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but stay below the auto compaction trigger
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger an auto compaction, which is scheduled after
+  // the manual compaction. 4 more files are needed because the existing files
+  // are pending compaction.
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));  // all 6 files are still in L0
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_EQ("0,1", FilesPerLevel(0));  // expect no L0 files and a single L1 file once the queue drains
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {  // close the DB while a canceled manual compaction is still queued
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue with a sleeping task at LOW priority
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but stay below the auto compaction trigger
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger an auto compaction, which is scheduled after
+  // the manual compaction. 4 more files are needed because the existing files
+  // are pending compaction.
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));  // all 6 files are still in L0
+
+  db_->DisableManualCompaction();
+
+  // CompactRange should return before the compaction has the chance to run
+  compact_thread.join();
+
+  // Try to close the DB while the manual compaction is canceled but still in
+  // the queue, and an auto-triggered compaction is also in the queue.
+  auto s = db_->Close();
+  ASSERT_OK(s);
+
+  sleeping_task_low.WakeUp();  // the blocking task is released only after Close() has returned
+  sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {  // Close() itself must cancel a queued manual compaction
+  const int kNumL0Files = 4;
+
+  SyncPoint::GetInstance()->LoadDependency(  // NOTE(review): reuses the sync point name of DisableManualCompactionThreadQueueFullDBClose; it matches the TEST_SYNC_POINT below
+      {{"DBImpl::RunManualCompaction:Scheduled",
+        "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+        "PreDisableManualCompaction"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // Block the compaction queue with a sleeping task at LOW priority
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  // Generate files, but stay below the auto compaction trigger
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+      "PreDisableManualCompaction");
+
+  // Generate more files to trigger an auto compaction, which is scheduled after
+  // the manual compaction. 4 more files are needed because the existing files
+  // are pending compaction.
+  for (int i = 0; i < kNumL0Files; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));  // all 6 files are still in L0
+
+  // Close DB with manual compaction and auto triggered compaction in the queue.
+  auto s = db_->Close();  // no DisableManualCompaction() call precedes this Close()
+  ASSERT_OK(s);
+
+  // The manual compaction thread should return with Incomplete().
+  compact_thread.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
+
 TEST_F(DBCompactionTest,
       DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
  // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
    }
    mutex_.Lock();
  } else {
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
      if (cfd->IsDropped()) {
        continue;
      }
-      cfd->Ref();
      mutex_.Unlock();
      status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
      TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
      TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
      mutex_.Lock();
-      cfd->UnrefAndTryDelete();
      if (!status.ok() && !status.IsColumnFamilyDropped()) {
        break;
      } else if (status.IsColumnFamilyDropped()) {
@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
      }
    }
  }
-  versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
  return status;
 }

--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@ -676,6 +676,7 @@ class TestFlushListener : public EventListener {
  ~TestFlushListener() override {
    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
  }
+
  void OnTableFileCreated(const TableFileCreationInfo& info) override {
    // remember the info for later checking the FlushJobInfo.
    prev_fc_info_ = info;
@ -1999,6 +2000,61 @@ TEST_P(DBFlushTestBlobError, FlushError) {
 }

 #ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {  // snapshot taken in OnFlushBegin must keep seeing the delete
+  class SimpleTestFlushListener : public EventListener {
+   public:
+    explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+    ~SimpleTestFlushListener() override {}
+
+    void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+      ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
+      ASSERT_OK(db->Delete(WriteOptions(), "foo"));  // tombstone written from inside the flush-begin callback
+      snapshot_ = db->GetSnapshot();  // taken after the Delete, before the Put below
+      ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));  // newer write, issued after snapshot_ was taken
+
+      auto* dbimpl = static_cast_with_check<DBImpl>(db);
+      assert(dbimpl);
+
+      ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+      auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+      assert(cfhi);
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));  // switch memtable from within the callback
+    }
+
+    DBFlushTest* test_ = nullptr;
+    const Snapshot* snapshot_ = nullptr;  // set in OnFlushBegin; released by the test body
+  };
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  auto* listener = new SimpleTestFlushListener(this);
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+  ManagedSnapshot snapshot_guard(db_);  // NOTE(review): presumably holds a snapshot for this scope - confirm
+
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));  // expected to invoke OnFlushBegin above
+
+  const Snapshot* snapshot = listener->snapshot_;
+  assert(snapshot);
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+
+  // Reading at the listener's snapshot (taken after Delete) must not see "foo".
+  {
+    std::string value;
+    Status s = db_->Get(read_opts, "foo", &value);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  db_->ReleaseSnapshot(snapshot);  // release the snapshot acquired by the listener
+}
+
 TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@ -383,15 +383,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
      s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
      mutex_.Lock();
    } else {
-      for (auto cfd : *versions_->GetColumnFamilySet()) {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
        if (cfd->IsDropped()) {
          continue;
        }
-        cfd->Ref();
-        mutex_.Unlock();
+        InstrumentedMutexUnlock u(&mutex_);
        s = FlushMemTable(cfd, flush_opts, context.flush_reason);
-        mutex_.Lock();
-        cfd->UnrefAndTryDelete();
        if (!s.ok()) {
          break;
        }
@ -406,14 +403,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {

  JobContext job_context(0);
  FindObsoleteFiles(&job_context, true);
-  if (s.ok()) {
-    s = error_handler_.ClearBGError();
-  } else {
-    // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
-    // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
-    // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
-    error_handler_.GetRecoveryError().PermitUncheckedError();
-  }
  mutex_.Unlock();

  job_context.manifest_file_number = 1;
@ -434,11 +423,31 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
            immutable_db_options_.info_log,
            "DB resume requested but could not enable file deletions [%s]",
            s.ToString().c_str());
+        assert(false);
      }
    }
-    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
  }
+
  mutex_.Lock();
+  if (s.ok()) {
+    // This will notify and unblock threads waiting for error recovery to
+    // finish. Those previously waiting threads can now proceed, which may
+    // include closing the db.
+    s = error_handler_.ClearBGError();
+  } else {
+    // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+    // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+    // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+    error_handler_.GetRecoveryError().PermitUncheckedError();
+  }
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+                   s.ToString().c_str());
+  }
+
  // Check for shutdown again before scheduling further compactions,
  // since we released and re-acquired the lock above
  if (shutdown_initiated_) {
@ -491,18 +500,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
      s.PermitUncheckedError();  //**TODO: What to do on error?
      mutex_.Lock();
    } else {
-      for (auto cfd : *versions_->GetColumnFamilySet()) {
+      for (auto cfd : versions_->GetRefedColumnFamilySet()) {
        if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
-          cfd->Ref();
-          mutex_.Unlock();
+          InstrumentedMutexUnlock u(&mutex_);
          Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
          s.PermitUncheckedError();  //**TODO: What to do on error?
-          mutex_.Lock();
-          cfd->UnrefAndTryDelete();
        }
      }
    }
-    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
  }

  shutting_down_.store(true, std::memory_order_release);
@ -533,10 +538,19 @@ Status DBImpl::CloseHelper() {
  // marker. After this we do a variant of the waiting and unschedule work
  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
  CancelAllBackgroundWork(false);
+
+  // Cancel manual compaction if there's any
+  if (HasPendingManualCompaction()) {
+    DisableManualCompaction();
+  }
  mutex_.Lock();
-  env_->UnSchedule(this, Env::Priority::BOTTOM);
-  env_->UnSchedule(this, Env::Priority::LOW);
-  env_->UnSchedule(this, Env::Priority::HIGH);
+  // Unschedule all tasks for this DB
+  for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+  }
+
  Status ret = Status::OK();

  // Wait for background work to finish
@ -956,19 +970,14 @@ void DBImpl::DumpStats() {
  TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
  {
    InstrumentedMutexLock l(&mutex_);
-    for (auto cfd : *versions_->GetColumnFamilySet()) {
+    for (auto cfd : versions_->GetRefedColumnFamilySet()) {
      if (cfd->initialized()) {
        // Release DB mutex for gathering cache entry stats. Pass over all
        // column families for this first so that other stats are dumped
        // near-atomically.
-        // Get a ref before unlocking
-        cfd->Ref();
-        {
        InstrumentedMutexUnlock u(&mutex_);
        cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
      }
-        cfd->UnrefAndTryDelete();
-      }
    }

    const std::string* property = &DB::Properties::kDBStats;
@ -1891,11 +1900,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
      return s;
    }
  }
+  PinnedIteratorsManager pinned_iters_mgr;
  if (!done) {
    PERF_TIMER_GUARD(get_from_output_files_time);
    sv->current->Get(
        read_options, lkey, get_impl_options.value, timestamp, &s,
-        &merge_context, &max_covering_tombstone_seq,
+        &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
        get_impl_options.get_value ? get_impl_options.value_found : nullptr,
        nullptr, nullptr,
        get_impl_options.get_value ? get_impl_options.callback : nullptr,
@ -2072,9 +2082,11 @@ std::vector<Status> DBImpl::MultiGet(
    if (!done) {
      PinnableSlice pinnable_val;
      PERF_TIMER_GUARD(get_from_output_files_time);
-      super_version->current->Get(
-          read_options, lkey, &pinnable_val, timestamp, &s, &merge_context,
-          &max_covering_tombstone_seq, /*value_found=*/nullptr,
+      PinnedIteratorsManager pinned_iters_mgr;
+      super_version->current->Get(read_options, lkey, &pinnable_val, timestamp,
+                                  &s, &merge_context,
+                                  &max_covering_tombstone_seq,
+                                  &pinned_iters_mgr, /*value_found=*/nullptr,
                                  /*key_exists=*/nullptr,
                                  /*seq=*/nullptr, read_callback);
      value->assign(pinnable_val.data(), pinnable_val.size());
@ -3148,6 +3160,12 @@ bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
 }  //  namespace

 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  if (s == nullptr) {
+    // DBImpl::GetSnapshot() can return nullptr when snapshots are not
+    // supported, i.e. when inplace_update_support is enabled. There is
+    // nothing to release in that case.
+    return;
+  }
  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
  {
    InstrumentedMutexLock l(&mutex_);
@ -3427,15 +3445,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
    // Needs mutex to protect the list of column families.
    InstrumentedMutexLock l(&mutex_);
    uint64_t value;
-    for (auto* cfd : *versions_->GetColumnFamilySet()) {
+    for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
      if (!cfd->initialized()) {
        continue;
      }
-      cfd->Ref();
      ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
      // GetIntPropertyInternal may release db mutex and re-acquire it.
      mutex_.AssertHeld();
-      cfd->UnrefAndTryDelete();
      if (ret) {
        sum += value;
      } else {
@ -4539,10 +4555,12 @@ Status DBImpl::GetLatestSequenceForKey(
  // SST files if cache_only=true?
  if (!cache_only) {
    // Check tables
+    PinnedIteratorsManager pinned_iters_mgr;
    sv->current->Get(read_options, lkey, /*value=*/nullptr, timestamp, &s,
                     &merge_context, &max_covering_tombstone_seq,
-                     nullptr /* value_found */, found_record_for_key, seq,
-                     nullptr /*read_callback*/, is_blob_index);
+                     &pinned_iters_mgr, nullptr /* value_found */,
+                     found_record_for_key, seq, nullptr /*read_callback*/,
+                     is_blob_index);

    if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
      // unexpected error reading SST files
@ -5024,6 +5042,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
    }
  }

+  // TODO: simplify using GetRefedColumnFamilySet?
  std::vector<ColumnFamilyData*> cfd_list;
  {
    InstrumentedMutexLock l(&mutex_);
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@ -1492,19 +1492,31 @@ class DBImpl : public DB {

  // Information for a manual compaction
  struct ManualCompactionState {
+    ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+                          int _output_level, uint32_t _output_path_id,
+                          bool _exclusive, bool _disallow_trivial_move,
+                          std::atomic<bool>* _canceled)
+        : cfd(_cfd),
+          input_level(_input_level),
+          output_level(_output_level),
+          output_path_id(_output_path_id),
+          exclusive(_exclusive),
+          disallow_trivial_move(_disallow_trivial_move),
+          canceled(_canceled) {}
+
    ColumnFamilyData* cfd;
    int input_level;
    int output_level;
    uint32_t output_path_id;
    Status status;
-    bool done;
-    bool in_progress;             // compaction request being processed?
-    bool incomplete;              // only part of requested range compacted
+    bool done = false;
+    bool in_progress = false;    // compaction request being processed?
+    bool incomplete = false;     // only part of requested range compacted
    bool exclusive;              // current behavior of only one manual
    bool disallow_trivial_move;  // Force actual compaction to run
-    const InternalKey* begin;     // nullptr means beginning of key range
-    const InternalKey* end;       // nullptr means end of key range
-    InternalKey* manual_end;      // how far we are compacting
+    const InternalKey* begin = nullptr;  // nullptr means beginning of key range
+    const InternalKey* end = nullptr;    // nullptr means end of key range
+    InternalKey* manual_end = nullptr;   // how far we are compacting
    InternalKey tmp_storage;      // Used to keep track of compaction progress
    InternalKey tmp_storage1;     // Used to keep track of compaction progress
    std::atomic<bool>* canceled;  // Compaction canceled by the user?
@ -1711,6 +1723,25 @@ class DBImpl : public DB {
    }
  }

+  // TaskType identifies tasks in the thread-pool. Currently it only
+  // differentiates manual compactions, which can be unscheduled from the
+  // thread-pool separately from other tasks.
+  enum class TaskType : uint8_t {
+    kDefault = 0,  // offset 0: the resulting task tag is the DBImpl address itself
+    kManualCompaction = 1,  // tag passed when scheduling manual compactions
+    kCount = 2,  // number of task types; used as a loop bound, not a real type
+  };
+
+  // A task tag is used to identify this DB's tasks in the thread-pool; it is
+  // the DBImpl object address plus the task type value (as a byte offset).
+  inline void* GetTaskTag(TaskType type) {
+    return GetTaskTag(static_cast<uint8_t>(type));
+  }
+
+  inline void* GetTaskTag(uint8_t type) {
+    return static_cast<uint8_t*>(static_cast<void*>(this)) + type;  // byte arithmetic on `this`
+  }
+
  // REQUIRES: mutex locked and in write thread.
  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);

@ -1729,7 +1760,8 @@ class DBImpl : public DB {
                         WriteBatch** to_be_cached_state);

  IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
-                      uint64_t* log_used, uint64_t* log_size);
+                      uint64_t* log_used, uint64_t* log_size,
+                      bool with_db_mutex = false, bool with_log_mutex = false);

  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
                      log::Writer* log_writer, uint64_t* log_used,
@ -2055,12 +2087,15 @@ class DBImpl : public DB {
  bool persistent_stats_cfd_exists_ = true;

  // Without two_write_queues, read and writes to alive_log_files_ are
-  // protected by mutex_. However since back() is never popped, and push_back()
-  // is done only from write_thread_, the same thread can access the item
-  // reffered by back() without mutex_. With two_write_queues_, writes
+  // protected by mutex_. With two_write_queues_, writes
  // are protected by locking both mutex_ and log_write_mutex_, and reads must
  // be under either mutex_ or log_write_mutex_.
  std::deque<LogFileNumberSize> alive_log_files_;
+  // Caching the result of `alive_log_files_.back()` so that we do not have to
+  // call `alive_log_files_.back()` in the write thread (WriteToWAL()) which
+  // requires locking db mutex if log_mutex_ is not already held in
+  // two-write-queues mode.
+  std::deque<LogFileNumberSize>::reverse_iterator alive_log_files_tail_;
  // Log files that aren't fully synced, and the current log file.
  // Synchronization:
  //  - push_back() is done from write_thread_ with locked mutex_ and
@ -2385,11 +2420,10 @@ extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
 // will not depend on any WAL file. nullptr means no memtable is being flushed.
 // The function is only applicable to 2pc mode.
 extern uint64_t FindMinPrepLogReferencedByMemTable(
-    VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
-    const autovector<MemTable*>& memtables_to_flush);
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
 // For atomic flush.
 extern uint64_t FindMinPrepLogReferencedByMemTable(
-    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    VersionSet* vset,
    const autovector<const autovector<MemTable*>*>& memtables_to_flush);

 // Fix user-supplied options to be reasonable
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@ -170,6 +170,7 @@ Status DBImpl::FlushMemTableToOutputFile(
  const bool needs_to_sync_closed_wals =
      logfile_number_ > 0 &&
      versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
  // If needs_to_sync_closed_wals is true, we need to record the current
  // maximum memtable ID of this column family so that a later PickMemtables()
  // call will not pick memtables whose IDs are higher. This is due to the fact
@ -177,9 +178,33 @@ Status DBImpl::FlushMemTableToOutputFile(
  // happen for this column family in the meantime. The newly created memtables
  // have their data backed by unsynced WALs, thus they cannot be included in
  // this flush job.
+  // Another reason why we must record the current maximum memtable ID of this
+  // column family: SyncClosedLogs() may release db mutex, thus it's possible
+  // for application to continue to insert into memtables increasing db's
+  // sequence number. The application may take a snapshot, but this snapshot is
+  // not included in `snapshot_seqs` which will be passed to flush job because
+  // `snapshot_seqs` has already been computed before this function starts.
+  // Recording the max memtable ID ensures that the flush job does not flush
+  // a memtable without knowing such snapshot(s).
  uint64_t max_memtable_id = needs_to_sync_closed_wals
                                 ? cfd->imm()->GetLatestMemTableID()
                                 : port::kMaxUint64;
+
+  // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+  // existing memtables of the column family when PickMemTable() is called
+  // later. Although we won't call SyncClosedLogs() in this case, we may still
+  // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+  // releases and re-acquires the db mutex. In the meantime, the application
+  // can still insert into the memtables and increase the db's sequence number.
+  // The application can take a snapshot, hoping that the latest visible state
+  // to this snapshot is preserved. This is hard to guarantee since the db mutex
+  // is not held. This newly-created snapshot is not included in `snapshot_seqs`
+  // and the flush job is unaware of its presence. Consequently, the flush job
+  // may drop certain keys when generating the L0, causing incorrect data to be
+  // returned for snapshot read using this snapshot.
+  // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+  // picking so that no new snapshot can be taken between the two functions.
+
  FlushJob flush_job(
      dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
      file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
@ -192,11 +217,6 @@ Status DBImpl::FlushMemTableToOutputFile(
      &blob_callback_);
  FileMetaData file_meta;

-#ifndef ROCKSDB_LITE
-  // may temporarily unlock and lock the mutex.
-  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
-#endif  // ROCKSDB_LITE
-
  Status s;
  bool need_cancel = false;
  IOStatus log_io_s = IOStatus::OK();
@ -221,6 +241,12 @@ Status DBImpl::FlushMemTableToOutputFile(
  }
  TEST_SYNC_POINT_CALLBACK(
      "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+#ifndef ROCKSDB_LITE
+  // may temporarily unlock and lock the mutex.
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif  // ROCKSDB_LITE
+
  bool switched_to_mempurge = false;
  // Within flush_job.Run, rocksdb may call event listener to notify
  // file creation and deletion.
@ -1752,21 +1778,16 @@ Status DBImpl::RunManualCompaction(
         input_level >= 0);

  InternalKey begin_storage, end_storage;
-  CompactionArg* ca;
+  CompactionArg* ca = nullptr;

  bool scheduled = false;
+  bool unscheduled = false;
+  Env::Priority thread_pool_priority = Env::Priority::TOTAL;
  bool manual_conflict = false;
-  ManualCompactionState manual;
-  manual.cfd = cfd;
-  manual.input_level = input_level;
-  manual.output_level = output_level;
-  manual.output_path_id = compact_range_options.target_path_id;
-  manual.done = false;
-  manual.in_progress = false;
-  manual.incomplete = false;
-  manual.exclusive = exclusive;
-  manual.disallow_trivial_move = disallow_trivial_move;
-  manual.canceled = compact_range_options.canceled;
+
+  ManualCompactionState manual(
+      cfd, input_level, output_level, compact_range_options.target_path_id,
+      exclusive, disallow_trivial_move, compact_range_options.canceled);
  // For universal compaction, we enforce every manual compaction to compact
  // all files.
  if (begin == nullptr ||
@ -1871,6 +1892,23 @@ Status DBImpl::RunManualCompaction(
      assert(!exclusive || !manual_conflict);
      // Running either this or some other manual compaction
      bg_cv_.Wait();
+      if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
+        assert(thread_pool_priority != Env::Priority::TOTAL);
+        // unschedule all manual compactions
+        auto unscheduled_task_num = env_->UnSchedule(
+            GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
+        if (unscheduled_task_num > 0) {
+          ROCKS_LOG_INFO(
+              immutable_db_options_.info_log,
+              "[%s] Unscheduled %d number of manual compactions from the "
+              "thread-pool",
+              cfd->GetName().c_str(), unscheduled_task_num);
+          // it may unschedule other manual compactions, notify others.
+          bg_cv_.SignalAll();
+        }
+        unscheduled = true;
+        TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
+      }
      if (scheduled && manual.incomplete == true) {
        assert(!manual.in_progress);
        scheduled = false;
@ -1898,15 +1936,20 @@ Status DBImpl::RunManualCompaction(
        bg_bottom_compaction_scheduled_++;
        ca->compaction_pri_ = Env::Priority::BOTTOM;
        env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
-                       Env::Priority::BOTTOM, this,
+                       Env::Priority::BOTTOM,
+                       GetTaskTag(TaskType::kManualCompaction),
                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::BOTTOM;
      } else {
        bg_compaction_scheduled_++;
        ca->compaction_pri_ = Env::Priority::LOW;
-        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+                       GetTaskTag(TaskType::kManualCompaction),
                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::LOW;
      }
      scheduled = true;
+      TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
    }
  }

@ -1914,6 +1957,13 @@ Status DBImpl::RunManualCompaction(
  assert(!manual.in_progress);
  assert(HasPendingManualCompaction());
  RemoveManualCompaction(&manual);
+  // If the manual job was unscheduled, try to schedule other jobs in case
+  // there is any unscheduled compaction job that was blocked by an exclusive
+  // manual compaction.
+  if (manual.status.IsIncomplete() &&
+      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+    MaybeScheduleFlushOrCompaction();
+  }
  bg_cv_.SignalAll();
  return manual.status;
 }
@ -2641,7 +2691,15 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) {
  CompactionArg ca = *(ca_ptr);
  delete reinterpret_cast<CompactionArg*>(arg);
  if (ca.prepicked_compaction != nullptr) {
+    // if it's a manual compaction, set status to ManualCompactionPaused
+    if (ca.prepicked_compaction->manual_compaction_state) {
+      ca.prepicked_compaction->manual_compaction_state->done = true;
+      ca.prepicked_compaction->manual_compaction_state->status =
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
    if (ca.prepicked_compaction->compaction != nullptr) {
+      ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
      delete ca.prepicked_compaction->compaction;
    }
    delete ca.prepicked_compaction;
@ -2880,6 +2938,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
      immutable_db_options_.clock->SleepForMicroseconds(1000000);
      mutex_.Lock();
    } else if (s.IsManualCompactionPaused()) {
+      assert(prepicked_compaction);
      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
      assert(m);
      ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
@ -2888,9 +2947,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,

    ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);

-    // If compaction failed, we want to delete all temporary files that we might
-    // have created (they might not be all recorded in job_context in case of a
-    // failure). Thus, we force full scan in FindObsoleteFiles()
+    // If compaction failed, we want to delete all temporary files that we
+    // might have created (they might not be all recorded in job_context in
+    // case of a failure). Thus, we force full scan in FindObsoleteFiles()
    FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
                                        !s.IsManualCompactionPaused() &&
                                        !s.IsColumnFamilyDropped() &&
@ -2917,6 +2976,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,

    assert(num_running_compactions_ > 0);
    num_running_compactions_--;
+
    if (bg_thread_pri == Env::Priority::LOW) {
      bg_compaction_scheduled_--;
    } else {
@ -2924,8 +2984,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
      bg_bottom_compaction_scheduled_--;
    }

-    versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
-
    // See if there's more work to be done
    MaybeScheduleFlushOrCompaction();

@ -2935,7 +2993,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
      // must be done before we potentially signal the DB close process to
      // proceed below.
      prepicked_compaction->task_token.reset();
-      ;
    }

    if (made_progress ||
@ -3022,6 +3079,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
    manual_compaction->in_progress = true;
  }

+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
  std::unique_ptr<TaskLimiterToken> task_token;

  // InternalKey manual_end_storage;
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@ -262,8 +262,7 @@ size_t DBImpl::TEST_LogsWithPrepSize() {

 uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
  autovector<MemTable*> empty_list;
-  return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr,
-                                            empty_list);
+  return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
 }

 Status DBImpl::TEST_GetLatestMutableCFOptions(
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@ -23,11 +23,7 @@
 namespace ROCKSDB_NAMESPACE {

 uint64_t DBImpl::MinLogNumberToKeep() {
-  if (allow_2pc()) {
-    return versions_->min_log_number_to_keep_2pc();
-  } else {
-    return versions_->MinLogNumberWithUnflushedData();
-  }
+  return versions_->min_log_number_to_keep();  // single value; no longer branches on allow_2pc()
 }

 uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
@ -224,7 +220,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
    }

    // Add log files in wal_dir
-
    if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
      std::vector<std::string> log_files;
      Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files);
@ -234,6 +229,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
            log_file, immutable_db_options_.wal_dir);
      }
    }
+
    // Add info log files in db_log_dir
    if (!immutable_db_options_.db_log_dir.empty() &&
        immutable_db_options_.db_log_dir != dbname_) {
@ -670,8 +666,7 @@ void DBImpl::DeleteObsoleteFiles() {
 }

 uint64_t FindMinPrepLogReferencedByMemTable(
-    VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
-    const autovector<MemTable*>& memtables_to_flush) {
+    VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
  uint64_t min_log = 0;

  // we must look through the memtables for two phase transactions
@ -679,7 +674,7 @@ uint64_t FindMinPrepLogReferencedByMemTable(
  std::unordered_set<MemTable*> memtables_to_flush_set(
      memtables_to_flush.begin(), memtables_to_flush.end());
  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
-    if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+    if (loop_cfd->IsDropped()) {
      continue;
    }

@ -701,18 +696,16 @@ uint64_t FindMinPrepLogReferencedByMemTable(
 }

 uint64_t FindMinPrepLogReferencedByMemTable(
-    VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+    VersionSet* vset,
    const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
  uint64_t min_log = 0;

-  std::unordered_set<ColumnFamilyData*> cfds_to_flush_set(cfds_to_flush.begin(),
-                                                          cfds_to_flush.end());
  std::unordered_set<MemTable*> memtables_to_flush_set;
  for (const autovector<MemTable*>* memtables : memtables_to_flush) {
    memtables_to_flush_set.insert(memtables->begin(), memtables->end());
  }
  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
-    if (loop_cfd->IsDropped() || cfds_to_flush_set.count(loop_cfd)) {
+    if (loop_cfd->IsDropped()) {
      continue;
    }

@ -828,8 +821,8 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
    min_log_number_to_keep = min_log_in_prep_heap;
  }

-  uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
-      vset, &cfd_to_flush, memtables_to_flush);
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);

  if (min_log_refed_by_mem != 0 &&
      min_log_refed_by_mem < min_log_number_to_keep) {
@ -859,8 +852,8 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
    min_log_number_to_keep = min_log_in_prep_heap;
  }

-  uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(
-      vset, cfds_to_flush, memtables_to_flush);
+  uint64_t min_log_refed_by_mem =
+      FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);

  if (min_log_refed_by_mem != 0 &&
      min_log_refed_by_mem < min_log_number_to_keep) {
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@ -864,6 +864,11 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
  bool flushed = false;
  uint64_t corrupted_wal_number = kMaxSequenceNumber;
  uint64_t min_wal_number = MinLogNumberToKeep();
+  if (!allow_2pc()) {
+    // In non-2pc mode, we skip WALs that do not back unflushed data.
+    min_wal_number =
+        std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
+  }
  for (auto wal_number : wal_numbers) {
    if (wal_number < min_wal_number) {
      ROCKS_LOG_INFO(immutable_db_options_.info_log,
@ -1268,9 +1273,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
      }

      std::unique_ptr<VersionEdit> wal_deletion;
+      if (flushed) {
+        wal_deletion = std::unique_ptr<VersionEdit>(new VersionEdit());
        if (immutable_db_options_.track_and_verify_wals_in_manifest) {
-        wal_deletion.reset(new VersionEdit);
          wal_deletion->DeleteWalsBefore(max_wal_number + 1);
+        }
+        if (!allow_2pc()) {
+          // In non-2pc mode, flushing the memtables of the column families
+          // means we can advance min_log_number_to_keep.
+          wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1);
+        }
        edit_lists.back().push_back(wal_deletion.get());
      }

@ -1349,7 +1361,14 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
  // FindObsoleteFiles()
  total_log_size_ = 0;
  log_empty_ = false;
+  uint64_t min_wal_with_unflushed_data =
+      versions_->MinLogNumberWithUnflushedData();
  for (auto wal_number : wal_numbers) {
+    if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+      // In non-2pc mode, the WAL files not backing unflushed data are not
+      // alive, thus should not be added to the alive_log_files_.
+      continue;
+    }
    // We preallocate space for wals, but then after a crash and restart, those
    // preallocated space are not needed anymore. It is likely only the last
    // log has such preallocated space, so we only truncate for the last log.
@ -1362,6 +1381,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
    total_log_size_ += log.size;
    alive_log_files_.push_back(log);
  }
+  alive_log_files_tail_ = alive_log_files_.rbegin();
  if (two_write_queues_) {
    log_write_mutex_.Unlock();
  }
@ -1371,6 +1391,12 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
 Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                           MemTable* mem, VersionEdit* edit) {
  mutex_.AssertHeld();
+  assert(cfd);
+  assert(cfd->imm());
+  // The immutable memtable list must be empty.
+  assert(std::numeric_limits<uint64_t>::max() ==
+         cfd->imm()->GetEarliestMemTableID());
+
  const uint64_t start_micros = immutable_db_options_.clock->NowMicros();

  FileMetaData meta;
@ -1699,6 +1725,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
      }
      impl->alive_log_files_.push_back(
          DBImpl::LogFileNumberSize(impl->logfile_number_));
+      impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin();
      if (impl->two_write_queues_) {
        impl->log_write_mutex_.Unlock();
      }
@ -1719,7 +1746,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
        WriteOptions write_options;
        uint64_t log_used, log_size;
        log::Writer* log_writer = impl->logs_.back().writer;
-        s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
+        s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+                             /*with_db_mutex=*/true);
        if (s.ok()) {
          // Need to fsync, otherwise it might get lost after a power reset.
          s = impl->FlushWAL(false);
--- a/db/db_impl/db_impl_readonly.cc
+++ b/db/db_impl/db_impl_readonly.cc
@ -58,9 +58,10 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options,
    RecordTick(stats_, MEMTABLE_HIT);
  } else {
    PERF_TIMER_GUARD(get_from_output_files_time);
+    PinnedIteratorsManager pinned_iters_mgr;
    super_version->current->Get(read_options, lkey, pinnable_val,
                                /*timestamp=*/nullptr, &s, &merge_context,
-                                &max_covering_tombstone_seq);
+                                &max_covering_tombstone_seq, &pinned_iters_mgr);
    RecordTick(stats_, MEMTABLE_MISS);
  }
  RecordTick(stats_, NUMBER_KEYS_READ);
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@ -377,9 +377,10 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
  }
  if (!done) {
    PERF_TIMER_GUARD(get_from_output_files_time);
+    PinnedIteratorsManager pinned_iters_mgr;
    super_version->current->Get(read_options, lkey, pinnable_val,
                                /*timestamp=*/nullptr, &s, &merge_context,
-                                &max_covering_tombstone_seq);
+                                &max_covering_tombstone_seq, &pinned_iters_mgr);
    RecordTick(stats_, MEMTABLE_MISS);
  }
  {
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@ -1085,8 +1085,18 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
 // write thread. Otherwise this must be called holding log_write_mutex_.
 IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                            log::Writer* log_writer, uint64_t* log_used,
-                            uint64_t* log_size) {
+                            uint64_t* log_size,
+                            bool with_db_mutex, bool with_log_mutex) {
  assert(log_size != nullptr);
+
+  // Assert mutex explicitly.
+  if (with_db_mutex) {
+    mutex_.AssertHeld();
+  } else if (two_write_queues_) {
+    log_write_mutex_.AssertHeld();
+    assert(with_log_mutex);
+  }
+
  Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
  *log_size = log_entry.size();
  // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
@ -1109,9 +1119,12 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
    *log_used = logfile_number_;
  }
  total_log_size_ += log_entry.size();
-  // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
-  // since alive_log_files_ might be modified concurrently
-  alive_log_files_.back().AddSize(log_entry.size());
+  if (with_db_mutex || with_log_mutex) {
+    assert(alive_log_files_tail_ == alive_log_files_.rbegin());
+    assert(alive_log_files_tail_ != alive_log_files_.rend());
+  }
+  LogFileNumberSize& last_alive_log = *alive_log_files_tail_;
+  last_alive_log.AddSize(*log_size);
  log_empty_ = false;
  return io_s;
 }
@ -1121,6 +1134,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                            bool need_log_sync, bool need_log_dir_sync,
                            SequenceNumber sequence) {
  IOStatus io_s;
+  assert(!two_write_queues_);
  assert(!write_group.leader->disable_wal);
  // Same holds for all in the batch group
  size_t write_with_wal = 0;
@ -1208,6 +1222,7 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
    SequenceNumber* last_sequence, size_t seq_inc) {
  IOStatus io_s;

+  assert(two_write_queues_ || immutable_db_options_.unordered_write);
  assert(!write_group.leader->disable_wal);
  // Same holds for all in the batch group
  WriteBatch tmp_batch;
@ -1232,7 +1247,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(

  log::Writer* log_writer = logs_.back().writer;
  uint64_t log_size;
-  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                    /*with_db_mutex=*/false, /*with_log_mutex=*/true);
  if (to_be_cached_state) {
    cached_recoverable_state_ = *to_be_cached_state;
    cached_recoverable_state_empty_ = false;
@ -1886,6 +1902,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
      log_dir_synced_ = false;
      logs_.emplace_back(logfile_number_, new_log);
      alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+      alive_log_files_tail_ = alive_log_files_.rbegin();
    }
    log_write_mutex_.Unlock();
  }
--- a/db/db_inplace_update_test.cc
+++ b/db/db_inplace_update_test.cc
@ -169,6 +169,36 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
  } while (ChangeCompactOptions());
 }
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) {
+  do {
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.allow_concurrent_memtable_write = false;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size, and
+    // run GetSnapshot and ReleaseSnapshot
+    int numValues = 2;
+    for (int i = numValues; i > 0; i--) {
+      const Snapshot* s = db_->GetSnapshot();
+      ASSERT_EQ(nullptr, s);
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+      // release s (nullptr)
+      db_->ReleaseSnapshot(s);
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/db/db_merge_operand_test.cc
+++ b/db/db_merge_operand_test.cc
@ -47,6 +47,45 @@ class DBMergeOperandTest : public DBTestBase {
      : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
 };

+TEST_F(DBMergeOperandTest, MergeOperandReadAfterFreeBug) {
+  // There was a bug of reading merge operands after they are mistakenly freed
+  // in DB::GetMergeOperands, which surfaces when the block cache is full.
+  // See PR#9507 for more.
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.env = env_;
+  BlockBasedTableOptions table_options;
+
+  // Small cache to simulate cache full
+  table_options.block_cache = NewLRUCache(1);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Reopen(options);
+  int num_records = 4;
+  int number_of_operands = 0;
+  std::vector<PinnableSlice> values(num_records);
+  GetMergeOperandsOptions merge_operands_info;
+  merge_operands_info.expected_max_number_of_operands = num_records;
+
+  ASSERT_OK(Merge("k1", "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v3"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Merge("k1", "v4"));
+
+  ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                  "k1", values.data(), &merge_operands_info,
+                                  &number_of_operands));
+  ASSERT_EQ(number_of_operands, 4);
+  ASSERT_EQ(values[0].ToString(), "v1");
+  ASSERT_EQ(values[1].ToString(), "v2");
+  ASSERT_EQ(values[2].ToString(), "v3");
+  ASSERT_EQ(values[3].ToString(), "v4");
+}
+
 TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
  Options options;
  options.create_if_missing = true;
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@ -1724,6 +1724,34 @@ TEST_F(DBRangeDelTest, OverlappedKeys) {
  ASSERT_EQ(0, NumTableFilesAtLevel(1));
 }

+TEST_F(DBRangeDelTest, IteratorRefresh) {
+  // Refreshing an iterator after a range tombstone is added should cause the
+  // deleted range of keys to disappear.
+  for (bool sv_changed : {false, true}) {
+    ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+    ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+    auto* iter = db_->NewIterator(ReadOptions());
+    ASSERT_OK(iter->status());
+
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "key2", "key3"));
+
+    if (sv_changed) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(iter->Refresh());
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    ASSERT_EQ("key1", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+
+    delete iter;
+  }
+}
+
 #endif  // ROCKSDB_LITE

 }  // namespace ROCKSDB_NAMESPACE
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@ -1481,6 +1481,93 @@ TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
  ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
 }

+TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.track_and_verify_wals_in_manifest = true;
+  // The following makes sure there are at least two bg flush threads.
+  options.max_background_jobs = 8;
+
+  const std::string cf1_name("cf1");
+  CreateAndReopenWithCF({cf1_name}, options);
+  assert(handles_.size() == 2);
+
+  {
+    dbfull()->TEST_LockMutex();
+    ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
+    dbfull()->TEST_UnlockMutex();
+  }
+
+  ASSERT_OK(dbfull()->PauseBackgroundWork());
+
+  ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1]));
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+  ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0]));
+
+  bool called = false;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  // This callback will be called when the first bg flush thread reaches the
+  // point before entering the MANIFEST write queue after flushing the SST
+  // file.
+  // The purpose of the sync points here is to ensure both bg flush threads
+  // finish computing `min_wal_number_to_keep` before any of them updates the
+  // `log_number` for the column family that's being flushed.
+  SyncPoint::GetInstance()->SetCallBack(
+      "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
+      [&](void* /*arg*/) {
+        dbfull()->mutex()->AssertHeld();
+        if (!called) {
+          // We are the first bg flush thread in the MANIFEST write queue.
+          // We set up the dependency between sync points for two threads that
+          // will be executing the same code.
+          // For the interleaving of events, see
+          // https://github.com/facebook/rocksdb/pull/9715.
+          // bg flush thread1 will release the db mutex while in the MANIFEST
+          // write queue. In the meantime, bg flush thread2 locks db mutex and
+          // computes the min_wal_number_to_keep (before thread1 writes to
+          // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins
+          // the MANIFEST write queue afterwards and bg flush thread1 proceeds
+          // with writing to MANIFEST.
+          called = true;
+          SyncPoint::GetInstance()->LoadDependency({
+              {"VersionSet::LogAndApply:WriteManifestStart",
+               "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
+              {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
+               "VersionSet::LogAndApply:WriteManifest"},
+          });
+        } else {
+          // The other bg flush thread is already in the MANIFEST write
+          // queue, and we are behind it.
+          TEST_SYNC_POINT(
+              "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(dbfull()->ContinueBackgroundWork());
+
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+  ASSERT_TRUE(called);
+
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  DB* db1 = nullptr;
+  Status s = DB::OpenForReadOnly(options, dbname_, &db1);
+  ASSERT_OK(s);
+  assert(db1);
+  delete db1;
+}
+
 // Test scope:
 // - We expect to open data store under all circumstances
 // - We expect only data upto the point where the first error was encountered
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@ -95,8 +95,9 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
    jwriter << "cf_name" << cf_name << "job" << job_id << "event"
            << "table_file_creation"
            << "file_number" << fd.GetNumber() << "file_size"
-            << fd.GetFileSize() << "file_checksum" << file_checksum
-            << "file_checksum_func_name" << file_checksum_func_name;
+            << fd.GetFileSize() << "file_checksum"
+            << Slice(file_checksum).ToString(true) << "file_checksum_func_name"
+            << file_checksum_func_name;

    // table_properties
    {
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@ -773,9 +773,10 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
        const std::vector<FileMetaData*>& level_files =
            vstorage->LevelFiles(lvl);
        const SequenceNumber level_largest_seqno =
-            (*max_element(level_files.begin(), level_files.end(),
+            (*std::max_element(level_files.begin(), level_files.end(),
                               [](FileMetaData* f1, FileMetaData* f2) {
-                            return f1->fd.largest_seqno < f2->fd.largest_seqno;
+                                 return f1->fd.largest_seqno <
+                                        f2->fd.largest_seqno;
                               }))
                ->fd.largest_seqno;
        // should only assign seqno to current level's largest seqno when
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@ -351,7 +351,7 @@ Status FlushJob::MemPurge() {

  // Measure purging time.
  const uint64_t start_micros = clock_->NowMicros();
-  const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
+  const uint64_t start_cpu_micros = clock_->CPUMicros();

  MemTable* new_mem = nullptr;
  // For performance/log investigation purposes:
@ -603,7 +603,7 @@ Status FlushJob::MemPurge() {
    TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
  }
  const uint64_t micros = clock_->NowMicros() - start_micros;
-  const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
+  const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
  ROCKS_LOG_INFO(db_options_.info_log,
                 "[%s] [JOB %d] Mempurge lasted %" PRIu64
                 " microseconds, and %" PRIu64
@ -789,7 +789,7 @@ Status FlushJob::WriteLevel0Table() {
      ThreadStatus::STAGE_FLUSH_WRITE_L0);
  db_mutex_->AssertHeld();
  const uint64_t start_micros = clock_->NowMicros();
-  const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000;
+  const uint64_t start_cpu_micros = clock_->CPUMicros();
  Status s;

  std::vector<BlobFileAddition> blob_file_additions;
@ -976,7 +976,7 @@ Status FlushJob::WriteLevel0Table() {
  // Note that here we treat flush as level 0 compaction in internal stats
  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
  const uint64_t micros = clock_->NowMicros() - start_micros;
-  const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros;
+  const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
  stats.micros = micros;
  stats.cpu_micros = cpu_micros;

--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -33,6 +33,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
@ -447,11 +448,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
    return nullptr;
  }
+  return NewRangeTombstoneIteratorInternal(read_options, read_seq);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+    const ReadOptions& read_options, SequenceNumber read_seq) {
  auto* unfragmented_iter = new MemTableIterator(
      *this, read_options, nullptr /* arena */, true /* use_range_del_table */);
-  if (unfragmented_iter == nullptr) {
-    return nullptr;
-  }
  auto fragmented_tombstone_list =
      std::make_shared<FragmentedRangeTombstoneList>(
          std::unique_ptr<InternalIterator>(unfragmented_iter),
@ -960,53 +963,58 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  }
  PERF_TIMER_GUARD(get_from_memtable_time);

+  // For now, memtable Bloom filter is effectively disabled if there are any
+  // range tombstones. This is the simplest way to ensure range tombstones are
+  // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+  bool no_range_del = read_options.ignore_range_deletions ||
+                      is_range_del_table_empty_.load(std::memory_order_relaxed);
  MultiGetRange temp_range(*range, range->begin(), range->end());
-  if (bloom_filter_) {
-    std::array<Slice*, MultiGetContext::MAX_BATCH_SIZE> keys;
-    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match = {{true}};
-    autovector<Slice, MultiGetContext::MAX_BATCH_SIZE> prefixes;
+  if (bloom_filter_ && no_range_del) {
+    bool whole_key =
+        !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+    std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+    std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+    std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
    int num_keys = 0;
    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (!prefix_extractor_) {
-        keys[num_keys++] = &iter->ukey_without_ts;
+      if (whole_key) {
+        bloom_keys[num_keys] = iter->ukey_without_ts;
+        range_indexes[num_keys++] = iter.index();
      } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
-        prefixes.emplace_back(
-            prefix_extractor_->Transform(iter->ukey_without_ts));
-        keys[num_keys++] = &prefixes.back();
-      }
-    }
-    bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]);
-    int idx = 0;
-    for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-      if (prefix_extractor_ &&
-          !prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+        bloom_keys[num_keys] =
+            prefix_extractor_->Transform(iter->ukey_without_ts);
+        range_indexes[num_keys++] = iter.index();
+      } else {
+        // TODO: consider not counting these as Bloom hits to more closely
+        // match bloom_sst_hit_count
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
-        continue;
      }
-      if (!may_match[idx]) {
-        temp_range.SkipKey(iter);
+    }
+    bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+    for (int i = 0; i < num_keys; ++i) {
+      if (!may_match[i]) {
+        temp_range.SkipIndex(range_indexes[i]);
        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
      } else {
        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
      }
-      idx++;
    }
  }
  for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
-    SequenceNumber seq = kMaxSequenceNumber;
    bool found_final_value{false};
    bool merge_in_progress = iter->s->IsMergeInProgress();
+    if (!no_range_del) {
      std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
-        NewRangeTombstoneIterator(
+          NewRangeTombstoneIteratorInternal(
              read_options, GetInternalKeySeqno(iter->lkey->internal_key())));
-    if (range_del_iter != nullptr) {
      iter->max_covering_tombstone_seq = std::max(
          iter->max_covering_tombstone_seq,
          range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key()));
    }
+    SequenceNumber dummy_seq;
    GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
                 callback, &iter->is_blob_index, iter->value->GetSelf(),
-                 iter->timestamp, iter->s, &(iter->merge_context), &seq,
+                 iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq,
                 &found_final_value, &merge_in_progress);

    if (!found_final_value && merge_in_progress) {
--- a/db/memtable.h
+++ b/db/memtable.h
@ -600,6 +600,10 @@ class MemTable {
                    std::string* value, std::string* timestamp, Status* s,
                    MergeContext* merge_context, SequenceNumber* seq,
                    bool* found_final_value, bool* merge_in_progress);
+
+  // Always returns non-null and assumes certain pre-checks are done
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+      const ReadOptions& read_options, SequenceNumber read_seq);
 };

 extern const char* EncodeKey(std::string* scratch, const Slice& target);
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@ -494,8 +494,8 @@ Status MemTableList::TryInstallMemtableFlushResults(
    // TODO(myabandeh): Not sure how batch_count could be 0 here.
    if (batch_count > 0) {
      uint64_t min_wal_number_to_keep = 0;
-      if (vset->db_options()->allow_2pc) {
      assert(edit_list.size() > 0);
+      if (vset->db_options()->allow_2pc) {
        // Note that if mempurge is successful, the edit_list will
        // not be applicable (contains info of new min_log number to keep,
        // and level 0 file path of SST file created during normal flush,
@ -506,21 +506,24 @@ Status MemTableList::TryInstallMemtableFlushResults(

        // We piggyback the information of earliest log file to keep in the
        // manifest entry for the last file flushed.
-        edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
-      }
-
-      std::unique_ptr<VersionEdit> wal_deletion;
-      if (vset->db_options()->track_and_verify_wals_in_manifest) {
-        if (!vset->db_options()->allow_2pc) {
+      } else {
        min_wal_number_to_keep =
            PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
      }
+      edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
+
+      std::unique_ptr<VersionEdit> wal_deletion;
+      if (vset->db_options()->track_and_verify_wals_in_manifest) {
        if (min_wal_number_to_keep >
            vset->GetWalSet().GetMinWalNumberToKeep()) {
          wal_deletion.reset(new VersionEdit);
          wal_deletion->DeleteWalsBefore(min_wal_number_to_keep);
          edit_list.push_back(wal_deletion.get());
        }
+        TEST_SYNC_POINT_CALLBACK(
+            "MemTableList::TryInstallMemtableFlushResults:"
+            "AfterComputeMinWalToKeep",
+            nullptr);
      }

      const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
@ -805,15 +808,14 @@ Status InstallMemtableAtomicFlushResults(
  if (vset->db_options()->allow_2pc) {
    min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
        vset, cfds, edit_lists, mems_list, prep_tracker);
-    edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
-  }
-
-  std::unique_ptr<VersionEdit> wal_deletion;
-  if (vset->db_options()->track_and_verify_wals_in_manifest) {
-    if (!vset->db_options()->allow_2pc) {
+  } else {
    min_wal_number_to_keep =
        PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
  }
+  edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep);
+
+  std::unique_ptr<VersionEdit> wal_deletion;
+  if (vset->db_options()->track_and_verify_wals_in_manifest) {
    if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
      wal_deletion.reset(new VersionEdit);
      wal_deletion->DeleteWalsBefore(min_wal_number_to_keep);
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -114,16 +114,19 @@ Status TableCache::GetTableReader(
  if (s.ok()) {
    s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
  }
+  if (s.ok()) {
    RecordTick(ioptions_.stats, NO_FILE_OPENS);
-  if (s.IsPathNotFound()) {
+  } else if (s.IsPathNotFound()) {
    fname = Rocks2LevelTableFileName(fname);
    s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
    if (s.ok()) {
      s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
                                            nullptr);
    }
+    if (s.ok()) {
      RecordTick(ioptions_.stats, NO_FILE_OPENS);
    }
+  }

  if (s.ok()) {
    if (!sequential_mode && ioptions_.advise_random_on_open) {
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@ -394,7 +394,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
  if (s->ok()) {
    version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
        version_edit_params_.max_column_family_);
-    version_set_->MarkMinLogNumberToKeep2PC(
+    version_set_->MarkMinLogNumberToKeep(
        version_edit_params_.min_log_number_to_keep_);
    version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
    version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
@ -970,12 +970,11 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
  fprintf(stdout,
          "next_file_number %" PRIu64 " last_sequence %" PRIu64
          "  prev_log_number %" PRIu64 " max_column_family %" PRIu32
-          " min_log_number_to_keep "
-          "%" PRIu64 "\n",
+          " min_log_number_to_keep %" PRIu64 "\n",
          version_set_->current_next_file_number(),
          version_set_->LastSequence(), version_set_->prev_log_number(),
          version_set_->column_family_set_->GetMaxColumnFamily(),
-          version_set_->min_log_number_to_keep_2pc());
+          version_set_->min_log_number_to_keep());
 }

 }  // namespace ROCKSDB_NAMESPACE
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -1965,7 +1965,8 @@ void Version::MultiGetBlob(
 void Version::Get(const ReadOptions& read_options, const LookupKey& k,
                  PinnableSlice* value, std::string* timestamp, Status* status,
                  MergeContext* merge_context,
-                  SequenceNumber* max_covering_tombstone_seq, bool* value_found,
+                  SequenceNumber* max_covering_tombstone_seq,
+                  PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
                  bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
                  bool* is_blob, bool do_merge) {
  Slice ikey = k.internal_key();
@ -1978,7 +1979,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
    *key_exists = true;
  }

-  PinnedIteratorsManager pinned_iters_mgr;
  uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
  if (vset_ && vset_->block_cache_tracer_ &&
      vset_->block_cache_tracer_->is_tracing_enabled()) {
@ -1992,17 +1992,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
  bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
  BlobFetcher blob_fetcher(this, read_options);

+  assert(pinned_iters_mgr);
  GetContext get_context(
      user_comparator(), merge_operator_, info_log_, db_statistics_,
      status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
      do_merge ? value : nullptr, do_merge ? timestamp : nullptr, value_found,
      merge_context, do_merge, max_covering_tombstone_seq, clock_, seq,
-      merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use,
+      merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
      tracing_get_id, &blob_fetcher);

  // Pin blocks that we read to hold merge operands
  if (merge_operator_) {
-    pinned_iters_mgr.StartPinning();
+    pinned_iters_mgr->StartPinning();
  }

  FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
@ -2188,12 +2189,31 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
  MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
  // blob_file => [[blob_idx, it], ...]
  std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
+  int level = -1;

  while (f != nullptr) {
    MultiGetRange file_range = fp.CurrentFileRange();
    bool timer_enabled =
        GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
        get_perf_context()->per_level_perf_context_enabled;
+
+    // Report MultiGet stats per level. `level` holds the level of the file
+    // handled by the previous iteration and is -1 before the first file.
+    const int hit_level = static_cast<int>(fp.GetHitFileLevel());
+    if (level >= 0 && level != hit_level) {
+      // Dump the stats if the search has moved to the next level and
+      // reset for next level.
+      RecordInHistogram(db_statistics_,
+                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                        num_index_read + num_filter_read);
+      RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
+                        num_data_read);
+      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+      num_filter_read = 0;
+      num_index_read = 0;
+      num_data_read = 0;
+      num_sst_read = 0;
+    }
+    // Track the level unconditionally. Updating it only inside the branch
+    // above would leave it at -1 forever (the branch requires level >= 0),
+    // so no per-level sample would ever be recorded inside the loop.
+    level = hit_level;
+
    StopWatchNano timer(clock_, timer_enabled /* auto_start */);
    s = table_cache_->MultiGet(
        read_options, *internal_comparator(), *f->file_metadata, &file_range,
@ -2238,6 +2258,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
      num_filter_read += get_context.get_context_stats_.num_filter_read;
      num_data_read += get_context.get_context_stats_.num_data_read;
      num_sst_read += get_context.get_context_stats_.num_sst_read;
+      // Reset these stats since they're specific to a level
+      get_context.get_context_stats_.num_index_read = 0;
+      get_context.get_context_stats_.num_filter_read = 0;
+      get_context.get_context_stats_.num_data_read = 0;
+      get_context.get_context_stats_.num_sst_read = 0;

      // report the counters before returning
      if (get_context.State() != GetContext::kNotFound &&
@ -2314,22 +2339,6 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
      }
    }

-    // Report MultiGet stats per level.
-    if (fp.IsHitFileLastInLevel()) {
-      // Dump the stats if this is the last file of this level and reset for
-      // next level.
-      RecordInHistogram(db_statistics_,
-                        NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
-                        num_index_read + num_filter_read);
-      RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
-                        num_data_read);
-      RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
-      num_filter_read = 0;
-      num_index_read = 0;
-      num_data_read = 0;
-      num_sst_read = 0;
-    }
-
    RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
    if (!s.ok() || file_picker_range.empty()) {
      break;
@ -2337,6 +2346,13 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
    f = fp.GetNextFile();
  }

+  // Dump stats for most recent level
+  RecordInHistogram(db_statistics_, NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+                    num_index_read + num_filter_read);
+  RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
+                    num_data_read);
+  RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+
  if (s.ok() && !blob_rqs.empty()) {
    MultiGetBlob(read_options, keys_with_blobs_range, blob_rqs);
  }
@ -4097,7 +4113,7 @@ void VersionSet::Reset() {
  }
  db_id_.clear();
  next_file_number_.store(2);
-  min_log_number_to_keep_2pc_.store(0);
+  min_log_number_to_keep_.store(0);
  manifest_file_number_ = 0;
  options_file_number_ = 0;
  pending_manifest_file_number_ = 0;
@ -4564,8 +4580,7 @@ Status VersionSet::ProcessManifestWrites(
      }

      if (last_min_log_number_to_keep != 0) {
-        // Should only be set in 2PC mode.
-        MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
+        MarkMinLogNumberToKeep(last_min_log_number_to_keep);
      }

      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
@ -4919,7 +4934,7 @@ Status VersionSet::Recover(
        ",min_log_number_to_keep is %" PRIu64 "\n",
        manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
        last_sequence_.load(), log_number, prev_log_number_,
-        column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc());
+        column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());

    for (auto cfd : *column_family_set_) {
      if (cfd->IsDropped()) {
@ -5324,9 +5339,9 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
 }
 // Called only either from ::LogAndApply which is protected by mutex or during
 // recovery which is single-threaded.
-void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) {
-  if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) {
-    min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed);
+void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
+  if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
+    min_log_number_to_keep_.store(number, std::memory_order_relaxed);
  }
 }

@ -5448,7 +5463,7 @@ Status VersionSet::WriteCurrentStateToManifest(
        // min_log_number_to_keep is for the whole db, not for specific column family.
        // So it does not need to be set for every column family, just need to be set once.
        // Since default CF can never be dropped, we set the min_log to the default CF here.
-        uint64_t min_log = min_log_number_to_keep_2pc();
+        uint64_t min_log = min_log_number_to_keep();
        if (min_log != 0) {
          edit.SetMinLogNumberToKeep(min_log);
        }
--- a/db/version_set.h
+++ b/db/version_set.h
@ -708,9 +708,11 @@ class Version {
  //    If the key has any merge operands then store them in
  //    merge_context.operands_list and don't merge the operands
  // REQUIRES: lock is not held
+  // REQUIRES: pinned_iters_mgr != nullptr
  void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
           std::string* timestamp, Status* status, MergeContext* merge_context,
           SequenceNumber* max_covering_tombstone_seq,
+           PinnedIteratorsManager* pinned_iters_mgr,
           bool* value_found = nullptr, bool* key_exists = nullptr,
           SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
           bool* is_blob = nullptr, bool do_merge = true);
@ -1099,8 +1101,8 @@ class VersionSet {

  uint64_t current_next_file_number() const { return next_file_number_.load(); }

-  uint64_t min_log_number_to_keep_2pc() const {
-    return min_log_number_to_keep_2pc_.load();
+  uint64_t min_log_number_to_keep() const {
+    return min_log_number_to_keep_.load();
  }

  // Allocate and return a new file number
@ -1158,7 +1160,7 @@ class VersionSet {
  // Mark the specified log number as deleted
  // REQUIRED: this is only called during single-threaded recovery or repair, or
  // from ::LogAndApply where the global mutex is held.
-  void MarkMinLogNumberToKeep2PC(uint64_t number);
+  void MarkMinLogNumberToKeep(uint64_t number);

  // Return the log file number for the log file that is currently
  // being compacted, or zero if there is no such log file.
@ -1167,10 +1169,12 @@ class VersionSet {
  // Returns the minimum log number which still has data not flushed to any SST
  // file.
  // In non-2PC mode, all the log numbers smaller than this number can be safely
-  // deleted.
+  // deleted, although we still use `min_log_number_to_keep_` to determine when
+  // to delete a WAL file.
  uint64_t MinLogNumberWithUnflushedData() const {
    return PreComputeMinLogNumberWithUnflushedData(nullptr);
  }
+
  // Returns the minimum log number which still has data not flushed to any SST
  // file.
  // Empty column families' log number is considered to be
@ -1268,6 +1272,10 @@ class VersionSet {
                        uint64_t min_pending_output);

  ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+  // Wraps the column family set owned by this VersionSet in a
+  // RefedColumnFamilySet.
+  RefedColumnFamilySet GetRefedColumnFamilySet() {
+    ColumnFamilySet* const cf_set = column_family_set_.get();
+    return RefedColumnFamilySet(cf_set);
+  }
+
  const FileOptions& file_options() { return file_options_; }
  void ChangeFileOptions(const MutableDBOptions& new_options) {
    file_options_.writable_file_max_buffer_size =
@ -1370,9 +1378,8 @@ class VersionSet {
  const ImmutableDBOptions* const db_options_;
  std::atomic<uint64_t> next_file_number_;
  // Any WAL number smaller than this should be ignored during recovery,
-  // and is qualified for being deleted in 2PC mode. In non-2PC mode, this
-  // number is ignored.
-  std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0};
+  // and is qualified for being deleted.
+  std::atomic<uint64_t> min_log_number_to_keep_ = {0};
  uint64_t manifest_file_number_;
  uint64_t options_file_number_;
  uint64_t options_file_size_;
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@ -3204,6 +3204,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
 }

 TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
+  db_options_.allow_2pc = true;
  NewDB();

  SstInfo sst(100, kDefaultColumnFamilyName, "a");
@ -3215,12 +3216,12 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
  edit.AddFile(0, file_metas[0]);
  edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
  ASSERT_OK(LogAndApplyToDefaultCF(edit));
-  ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC);
+  ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);

  for (int i = 0; i < 3; i++) {
    CreateNewManifest();
    ReopenDB();
-    ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC);
+    ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
  }
 }

--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@ -166,7 +166,7 @@ class PosixClock : public SystemClock {
    defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
    struct timespec ts;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
-    return static_cast<uint64_t>(ts.tv_sec) * 1000000000;
+    return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
 #endif
    return 0;
  }
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@ -325,14 +325,7 @@ class PosixFileSystem : public FileSystem {
    SetFD_CLOEXEC(fd, &options);

    if (options.use_mmap_writes) {
-      if (!checkedDiskForMmap_) {
-        // this will be executed once in the program's lifetime.
-        // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
-        if (!SupportsFastAllocate(fname)) {
-          forceMmapOff_ = true;
-        }
-        checkedDiskForMmap_ = true;
-      }
+      MaybeForceDisableMmap(fd);
    }
    if (options.use_mmap_writes && !forceMmapOff_) {
      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
@ -431,14 +424,7 @@ class PosixFileSystem : public FileSystem {
    }

    if (options.use_mmap_writes) {
-      if (!checkedDiskForMmap_) {
-        // this will be executed once in the program's lifetime.
-        // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
-        if (!SupportsFastAllocate(fname)) {
-          forceMmapOff_ = true;
-        }
-        checkedDiskForMmap_ = true;
-      }
+      MaybeForceDisableMmap(fd);
    }
    if (options.use_mmap_writes && !forceMmapOff_) {
      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
@ -753,8 +739,10 @@ class PosixFileSystem : public FileSystem {
                    const IOOptions& /*opts*/,
                    IODebugContext* /*dbg*/) override {
    if (link(src.c_str(), target.c_str()) != 0) {
-      if (errno == EXDEV) {
-        return IOStatus::NotSupported("No cross FS links allowed");
+      if (errno == EXDEV || errno == ENOTSUP) {
+        return IOStatus::NotSupported(errno == EXDEV
+                                          ? "No cross FS links allowed"
+                                          : "Links not supported by FS");
      }
      return IOError("while link file to " + target, src, errno);
    }
@ -997,8 +985,7 @@ class PosixFileSystem : public FileSystem {
  }
 #endif
 private:
-  bool checkedDiskForMmap_;
-  bool forceMmapOff_;  // do we override Env options?
+  bool forceMmapOff_ = false;  // do we override Env options?

  // Returns true iff the named directory exists and is a directory.
  virtual bool DirExists(const std::string& dname) {
@ -1009,10 +996,10 @@ class PosixFileSystem : public FileSystem {
    return false;  // stat() failed return false
  }

-  bool SupportsFastAllocate(const std::string& path) {
+  bool SupportsFastAllocate(int fd) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
    struct statfs s;
-    if (statfs(path.c_str(), &s)) {
+    if (fstatfs(fd, &s)) {
      return false;
    }
    switch (s.f_type) {
@ -1026,11 +1013,26 @@ class PosixFileSystem : public FileSystem {
        return false;
    }
 #else
-    (void)path;
+    (void)fd;
    return false;
 #endif
  }

+  // Probes `fd` for fast-allocate support at most once per process and, when
+  // the probe fails, sets forceMmapOff_ so that mmap writes are not used.
+  void MaybeForceDisableMmap(int fd) {
+    assert(this == FileSystem::Default().get());
+    static std::once_flag s_check_disk_for_mmap_once;
+    std::call_once(s_check_disk_for_mmap_once, [this, fd]() {
+      // this will be executed once in the program's lifetime.
+      // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
+      if (SupportsFastAllocate(fd)) {
+        return;
+      }
+      forceMmapOff_ = true;
+    });
+  }
+
 #ifdef ROCKSDB_IOURING_PRESENT
  bool IsIOUringEnabled() {
    if (RocksDbIOUringEnable && RocksDbIOUringEnable()) {
@ -1094,8 +1096,7 @@ size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded(
 }

 PosixFileSystem::PosixFileSystem()
-    : checkedDiskForMmap_(false),
-      forceMmapOff_(false),
+    : forceMmapOff_(false),
      page_size_(getpagesize()),
      allow_non_owner_access_(true) {
 #if defined(ROCKSDB_IOURING_PRESENT)
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@ -366,10 +366,16 @@ struct AdvancedColumnFamilyOptions {
                                   Slice delta_value,
                                   std::string* merged_value) = nullptr;

-  // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
-  // create prefix bloom for memtable with the size of
+  // Should really be called `memtable_bloom_size_ratio`. Enables a dynamic
+  // Bloom filter in memtable to optimize many queries that must go beyond
+  // the memtable. The size in bytes of the filter is
  // write_buffer_size * memtable_prefix_bloom_size_ratio.
-  // If it is larger than 0.25, it is sanitized to 0.25.
+  // * If prefix_extractor is set, the filter includes prefixes.
+  // * If memtable_whole_key_filtering, the filter includes whole keys.
+  // * If both, the filter includes both.
+  // * If neither, the feature is disabled.
+  //
+  // If this value is larger than 0.25, it is sanitized to 0.25.
  //
  // Default: 0 (disable)
  //
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@ -744,7 +744,7 @@ class DB {
  // snapshot is no longer needed.
  //
  // nullptr will be returned if the DB fails to take a snapshot or does
-  // not support snapshot.
+  // not support snapshot (e.g. when inplace_update_support is enabled).
  virtual const Snapshot* GetSnapshot() = 0;

  // Release a previously acquired snapshot.  The caller must not
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@ -171,7 +171,7 @@ class Env : public Customizable {
  Env(const Env&) = delete;
  void operator=(const Env&) = delete;

-  virtual ~Env();
+  ~Env() override;

  static const char* Type() { return "Environment"; }

--- a/include/rocksdb/utilities/object_registry.h
+++ b/include/rocksdb/utilities/object_registry.h
@ -570,10 +570,10 @@ class ObjectRegistry {
        }
      }
    }
-    if (parent_ != nullptr) {
-      return parent_->FindFactory<T>(name);
-    } else {
+    if (parent_ == nullptr) {
      return nullptr;
+    } else {
+      return parent_->FindFactory<T>(name);
    }
  }

--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@ -11,7 +11,7 @@

 #define ROCKSDB_MAJOR 6
 #define ROCKSDB_MINOR 29
-#define ROCKSDB_PATCH 0
+#define ROCKSDB_PATCH 5

 // Do not use these. We made the mistake of declaring macros starting with
 // double underscore. Now we have to live with our choice. We'll deprecate these
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@ -4282,9 +4282,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory(
    JNIEnv*, jobject, jlong jhandle, jlong factory_handle) {
  auto* options =
      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
-  auto* factory = reinterpret_cast<ROCKSDB_NAMESPACE::SstPartitionerFactory*>(
+  auto factory = reinterpret_cast<
+      std::shared_ptr<ROCKSDB_NAMESPACE::SstPartitionerFactory>*>(
      factory_handle);
-  options->sst_partitioner_factory.reset(factory);
+  options->sst_partitioner_factory = *factory;
 }

 /*
--- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
+++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
@ -18,7 +18,11 @@ public class NativeLibraryLoader {

  private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb");
  private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb");
+  private static final /* @Nullable */ String fallbackJniLibraryName =
+      Environment.getFallbackJniLibraryName("rocksdb");
  private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb");
+  private static final /* @Nullable */ String fallbackJniLibraryFileName =
+      Environment.getFallbackJniLibraryFileName("rocksdb");
  private static final String tempFilePrefix = "librocksdbjni";
  private static final String tempFileSuffix = Environment.getJniLibraryExtension();

@ -49,15 +53,34 @@ public class NativeLibraryLoader {
   */
  public synchronized void loadLibrary(final String tmpDir) throws IOException {
    try {
+      // try dynamic library
      System.loadLibrary(sharedLibraryName);
-    } catch(final UnsatisfiedLinkError ule1) {
+      return;
+    } catch (final UnsatisfiedLinkError ule) {
+      // ignore - try from static library
+    }
+
    try {
+      // try static library
      System.loadLibrary(jniLibraryName);
-      } catch(final UnsatisfiedLinkError ule2) {
+      return;
+    } catch (final UnsatisfiedLinkError ule) {
+      // ignore - then try static library fallback or from jar
+    }
+
+    if (fallbackJniLibraryName != null) {
+      try {
+        // try static library fallback
+        System.loadLibrary(fallbackJniLibraryName);
+        return;
+      } catch (final UnsatisfiedLinkError ule) {
+        // ignore - then try from jar
+      }
+    }
+
+    // try jar
    loadLibraryFromJar(tmpDir);
  }
-    }
-  }

  /**
   * Attempts to extract the native RocksDB library
@ -83,6 +106,27 @@ public class NativeLibraryLoader {

  File loadLibraryFromJarToTemp(final String tmpDir)
          throws IOException {
+    InputStream is = null;
+    try {
+      // attempt to look up the static library in the jar file
+      String libraryFileName = jniLibraryFileName;
+      is = getClass().getClassLoader().getResourceAsStream(libraryFileName);
+
+      if (is == null) {
+        // is there a fallback we can try
+        if (fallbackJniLibraryFileName == null) {
+          throw new RuntimeException(libraryFileName + " was not found inside JAR.");
+        }
+
+        // attempt to look up the fallback static library in the jar file
+        libraryFileName = fallbackJniLibraryFileName;
+        is = getClass().getClassLoader().getResourceAsStream(libraryFileName);
+        if (is == null) {
+          throw new RuntimeException(libraryFileName + " was not found inside JAR.");
+        }
+      }
+
+      // create a temporary file to copy the library to
      final File temp;
      if (tmpDir == null || tmpDir.isEmpty()) {
        temp = File.createTempFile(tempFilePrefix, tempFileSuffix);
@ -92,34 +136,32 @@ public class NativeLibraryLoader {
          throw new RuntimeException(
              "Directory: " + parentDir.getAbsolutePath() + " does not exist!");
        }
-      temp = new File(parentDir, jniLibraryFileName);
+        temp = new File(parentDir, libraryFileName);
        if (temp.exists() && !temp.delete()) {
-        throw new RuntimeException("File: " + temp.getAbsolutePath()
-            + " already exists and cannot be removed.");
+          throw new RuntimeException(
+              "File: " + temp.getAbsolutePath() + " already exists and cannot be removed.");
        }
        if (!temp.createNewFile()) {
-        throw new RuntimeException("File: " + temp.getAbsolutePath()
-            + " could not be created.");
+          throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created.");
        }
      }
-
      if (!temp.exists()) {
        throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist.");
      } else {
        temp.deleteOnExit();
      }

-    // attempt to copy the library from the Jar file to the temp destination
-    try (final InputStream is = getClass().getClassLoader().
-      getResourceAsStream(jniLibraryFileName)) {
-      if (is == null) {
-        throw new RuntimeException(jniLibraryFileName + " was not found inside JAR.");
-      } else {
+      // copy the library from the Jar file to the temp destination
      Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
-      }
-    }

+      // return the temporary library file
      return temp;
+
+    } finally {
+      if (is != null) {
+        is.close();
+      }
+    }
  }

  /**
--- a/java/src/main/java/org/rocksdb/util/Environment.java
+++ b/java/src/main/java/org/rocksdb/util/Environment.java
@ -110,8 +110,14 @@ public class Environment {
        return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix());
      }
    } else if (isMac()) {
+      if (is64Bit()) {
+        final String arch;
        if (isAarch64()) {
-        return String.format("%sjni-osx-%s", name, ARCH);
+          arch = "arm64";
+        } else {
+          arch = "x86_64";
+        }
+        return String.format("%sjni-osx-%s", name, arch);
      } else {
        return String.format("%sjni-osx", name);
      }
@ -131,10 +137,25 @@ public class Environment {
    throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name));
  }

+  /**
+   * Returns the architecture-neutral JNI library name ("&lt;name&gt;jni-osx") to
+   * fall back to on 64-bit macOS, or null when no fallback applies.
+   */
+  public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) {
+    final boolean hasFallback = isMac() && is64Bit();
+    return hasFallback ? String.format("%sjni-osx", name) : null;
+  }
+
  public static String getJniLibraryFileName(final String name) {
    return appendLibOsSuffix("lib" + getJniLibraryName(name), false);
  }

+  /**
+   * Returns the file name of the fallback JNI library ("lib" prefix plus the
+   * OS-specific suffix), or null when there is no fallback library name.
+   */
+  public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) {
+    final String fallbackName = getFallbackJniLibraryName(name);
+    return fallbackName != null ? appendLibOsSuffix("lib" + fallbackName, false) : null;
+  }
+
  private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) {
    if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) {
      return libraryFileName + ".so";
--- a/java/src/test/java/org/rocksdb/SstPartitionerTest.java
+++ b/java/src/test/java/org/rocksdb/SstPartitionerTest.java
@ -5,6 +5,7 @@

 package org.rocksdb;

+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.assertj.core.api.Assertions.assertThat;

 import java.util.List;
@ -21,7 +22,7 @@ public class SstPartitionerTest {
  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();

  @Test
-  public void sstFixedPrefix() throws InterruptedException, RocksDBException {
+  public void sstFixedPrefix() throws RocksDBException {
    try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4);
         final Options opt =
             new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory);
@ -31,7 +32,8 @@ public class SstPartitionerTest {
      db.put("bbbb1".getBytes(), "B".getBytes());
      db.flush(new FlushOptions());

-      db.put("aaaa1".getBytes(), "A2".getBytes());
+      db.put("aaaa0".getBytes(), "A2".getBytes());
+      db.put("aaaa2".getBytes(), "A2".getBytes());
      db.flush(new FlushOptions());

      db.compactRange();
@ -40,4 +42,31 @@ public class SstPartitionerTest {
      assertThat(metadata.size()).isEqualTo(2);
    }
  }
+
+  /**
+   * Same scenario as sstFixedPrefix, but the partitioner factory is set on the
+   * options of a newly created column family instead of on the DB options.
+   */
+  @Test
+  public void sstFixedPrefixFamily() throws RocksDBException {
+    final byte[] cfName = "new_cf".getBytes(UTF_8);
+
+    // Open every closeable RocksDB object in one try-with-resources so they are
+    // released even if an assertion fails. Resources are closed in reverse
+    // order, so the column family handle is closed before the DB.
+    try (final SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4);
+         final ColumnFamilyOptions cfOptions =
+             new ColumnFamilyOptions().setSstPartitionerFactory(factory);
+         final Options opt = new Options().setCreateIfMissing(true);
+         final FlushOptions flushOptions = new FlushOptions();
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             db.createColumnFamily(new ColumnFamilyDescriptor(cfName, cfOptions))) {
+      // first flush: one key under each of the 4-byte prefixes "aaaa" and "bbbb"
+      db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes());
+      db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes());
+      db.flush(flushOptions, columnFamilyHandle);
+
+      // second flush: two more keys under the "aaaa" prefix
+      db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes());
+      db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes());
+      db.flush(flushOptions, columnFamilyHandle);
+
+      db.compactRange(columnFamilyHandle);
+
+      // the test expects exactly two live files after compaction
+      final List<LiveFileMetaData> metadata = db.getLiveFilesMetaData();
+      assertThat(metadata.size()).isEqualTo(2);
+    }
+  }
 }
--- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
+++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
@ -9,7 +9,6 @@ import org.junit.BeforeClass;
 import org.junit.Test;

 import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;

 import static org.assertj.core.api.Assertions.assertThat;

@ -37,18 +36,21 @@ public class EnvironmentTest {
        isEqualTo(".jnilib");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.dylib");
  }

  @Test
-  public void mac64() {
-    setEnvironmentClassFields("mac", "x86-64");
+  public void mac64_x86_64() {
+    setEnvironmentClassFields("mac", "x86_64");
    assertThat(Environment.isWindows()).isFalse();
    assertThat(Environment.getJniLibraryExtension()).
        isEqualTo(".jnilib");
-    assertThat(Environment.getJniLibraryFileName("rocksdb")).
-        isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx-x86_64.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx.jnilib");
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.dylib");
  }
@ -59,7 +61,9 @@ public class EnvironmentTest {
    assertThat(Environment.isWindows()).isFalse();
    assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib");
    assertThat(Environment.getJniLibraryFileName("rocksdb"))
-        .isEqualTo("librocksdbjni-osx-aarch64.jnilib");
+        .isEqualTo("librocksdbjni-osx-arm64.jnilib");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb"))
+        .isEqualTo("librocksdbjni-osx.jnilib");
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib");
  }

@ -73,6 +77,7 @@ public class EnvironmentTest {
        isEqualTo(".so");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.so");
    // Linux musl-libc (Alpine)
@ -103,7 +108,8 @@ public class EnvironmentTest {
    assertThat(Environment.isWindows()).isFalse();
    assertThat(Environment.getJniLibraryExtension()).
        isEqualTo(".so");
-    Environment.getJniLibraryFileName("rocksdb");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
  }

  @Test
@ -115,6 +121,7 @@ public class EnvironmentTest {
        isEqualTo(".so");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.so");
    // Linux musl-libc (Alpine)
@ -124,6 +131,7 @@ public class EnvironmentTest {
        isEqualTo(".so");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-linux64-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.so");
    // UNIX
@ -134,6 +142,7 @@ public class EnvironmentTest {
        isEqualTo(".so");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.so");
    // AIX
@ -143,6 +152,7 @@ public class EnvironmentTest {
        isEqualTo(".so");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni-aix64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
        isEqualTo("librocksdbjni.so");
  }
@ -161,6 +171,7 @@ public class EnvironmentTest {
      isEqualTo(".dll");
    assertThat(Environment.getJniLibraryFileName("rocksdb")).
      isEqualTo("librocksdbjni-win64.dll");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
      isEqualTo("librocksdbjni.dll");
  }
@ -177,6 +188,7 @@ public class EnvironmentTest {
    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le");
    assertThat(Environment.getJniLibraryFileName("rocksdb"))
        .isEqualTo("librocksdbjni-linux-ppc64le.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
    // Linux musl-libc (Alpine)
    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
@ -189,6 +201,7 @@ public class EnvironmentTest {
    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl");
    assertThat(Environment.getJniLibraryFileName("rocksdb"))
        .isEqualTo("librocksdbjni-linux-ppc64le-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
  }
@ -205,6 +218,7 @@ public class EnvironmentTest {
    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64");
    assertThat(Environment.getJniLibraryFileName("rocksdb"))
        .isEqualTo("librocksdbjni-linux-aarch64.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
    // Linux musl-libc (Alpine)
    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true);
@ -217,6 +231,7 @@ public class EnvironmentTest {
    assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl");
    assertThat(Environment.getJniLibraryFileName("rocksdb"))
        .isEqualTo("librocksdbjni-linux-aarch64-musl.so");
+    assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull();
    assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so");
    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
  }
--- a/port/win/env_win.cc
+++ b/port/win/env_win.cc
@ -9,6 +9,8 @@

 #if defined(OS_WIN)

+#include "port/win/env_win.h"
+
 #include <direct.h>  // _rmdir, _mkdir, _getcwd
 #include <errno.h>
 #include <io.h>   // _access
@ -17,6 +19,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <windows.h>
+#include <winioctl.h>

 #include <algorithm>
 #include <ctime>
@ -27,7 +30,6 @@
 #include "monitoring/thread_status_util.h"
 #include "port/port.h"
 #include "port/port_dirent.h"
-#include "port/win/env_win.h"
 #include "port/win/io_win.h"
 #include "port/win/win_logger.h"
 #include "rocksdb/env.h"
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@ -1436,9 +1436,10 @@ template <typename TBlocklike>
 Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
    FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
-    const bool wait, CachableEntry<TBlocklike>* block_entry,
-    BlockType block_type, GetContext* get_context,
-    BlockCacheLookupContext* lookup_context, BlockContents* contents) const {
+    const bool wait, const bool for_compaction,
+    CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    BlockContents* contents) const {
  assert(block_entry != nullptr);
  const bool no_io = (ro.read_tier == kBlockCacheTier);
  Cache* block_cache = rep_->table_options.block_cache.get();
@ -1491,7 +1492,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
      CompressionType raw_block_comp_type;
      BlockContents raw_block_contents;
      if (!contents) {
-        StopWatch sw(rep_->ioptions.clock, statistics, READ_BLOCK_GET_MICROS);
+        Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS
+                                              : READ_BLOCK_GET_MICROS;
+        StopWatch sw(rep_->ioptions.clock, statistics, histogram);
        BlockFetcher block_fetcher(
            rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
            &raw_block_contents, rep_->ioptions, do_uncompress,
@ -1849,8 +1852,9 @@ void BlockBasedTable::RetrieveMultipleBlocks(
        // avoid looking up the block cache
        s = MaybeReadBlockAndLoadToCache(
            nullptr, options, handle, uncompression_dict, /*wait=*/true,
-            block_entry, BlockType::kData, mget_iter->get_context,
-            &lookup_data_block_context, &raw_block_contents);
+            /*for_compaction=*/false, block_entry, BlockType::kData,
+            mget_iter->get_context, &lookup_data_block_context,
+            &raw_block_contents);

        // block_entry value could be null if no block cache is present, i.e
        // BlockBasedTableOptions::no_block_cache is true and no compressed
@ -1904,7 +1908,7 @@ Status BlockBasedTable::RetrieveBlock(
  if (use_cache) {
    s = MaybeReadBlockAndLoadToCache(
        prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache,
-        block_entry, block_type, get_context, lookup_context,
+        for_compaction, block_entry, block_type, get_context, lookup_context,
        /*contents=*/nullptr);

    if (!s.ok()) {
@ -1933,8 +1937,9 @@ Status BlockBasedTable::RetrieveBlock(
  std::unique_ptr<TBlocklike> block;

  {
-    StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats,
-                 READ_BLOCK_GET_MICROS);
+    Histograms histogram =
+        for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
+    StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
    s = ReadBlockFromFile(
        rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
        rep_->ioptions, do_uncompress, maybe_compressed, block_type,
@ -3161,6 +3166,7 @@ Status BlockBasedTable::CreateIndexReader(
 uint64_t BlockBasedTable::ApproximateDataOffsetOf(
    const InternalIteratorBase<IndexValue>& index_iter,
    uint64_t data_size) const {
+  assert(index_iter.status().ok());
  if (index_iter.Valid()) {
    BlockHandle handle = index_iter.value().handle;
    return handle.offset();
@ -3203,8 +3209,16 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
  }

  index_iter->Seek(key);
+  uint64_t offset;
+  if (index_iter->status().ok()) {
+    offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Split in half to avoid skewing one way or another,
+    // since we don't know whether we're operating on lower bound or
+    // upper bound.
+    return rep_->file_size / 2;
+  }

-  uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size);
  // Pro-rate file metadata (incl filters) size-proportionally across data
  // blocks.
  double size_ratio =
@ -3220,7 +3234,9 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
  uint64_t data_size = GetApproximateDataSize();
  if (UNLIKELY(data_size == 0)) {
    // Hmm. Assume whole file is involved, since we have lower and upper
-    // bound.
+    // bound. This likely skews the estimate if we consider that this function
+    // is typically called with `[start, end]` fully contained in the file's
+    // key-range.
    return rep_->file_size;
  }

@ -3238,9 +3254,24 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
  }

  index_iter->Seek(start);
-  uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  uint64_t start_offset;
+  if (index_iter->status().ok()) {
+    start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved from the start. This likely skews the estimate
+    // but is consistent with the above error handling.
+    start_offset = 0;
+  }
+
  index_iter->Seek(end);
-  uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  uint64_t end_offset;
+  if (index_iter->status().ok()) {
+    end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved until the end. This likely skews the estimate
+    // but is consistent with the above error handling.
+    end_offset = data_size;
+  }

  assert(end_offset >= start_offset);
  // Pro-rate file metadata (incl filters) size-proportionally across data
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@ -343,9 +343,10 @@ class BlockBasedTable : public TableReader {
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
-      const bool wait, CachableEntry<TBlocklike>* block_entry,
-      BlockType block_type, GetContext* get_context,
-      BlockCacheLookupContext* lookup_context, BlockContents* contents) const;
+      const bool wait, const bool for_compaction,
+      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
+      GetContext* get_context, BlockCacheLookupContext* lookup_context,
+      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@ -501,8 +501,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
    // filter blocks
    s = table()->MaybeReadBlockAndLoadToCache(
        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
-        /* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */,
-        &lookup_context, nullptr /* contents */);
+        /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter,
+        nullptr /* get_context */, &lookup_context, nullptr /* contents */);
    if (!s.ok()) {
      return s;
    }
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@ -179,8 +179,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
    // filter blocks
    Status s = table()->MaybeReadBlockAndLoadToCache(
        prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
-        /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
-        &lookup_context, /*contents=*/nullptr);
+        /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex,
+        /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr);

    if (!s.ok()) {
      return s;
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@ -235,9 +235,9 @@ class MultiGetContext {

    bool empty() const { return RemainingMask() == 0; }

-    void SkipKey(const Iterator& iter) {
-      skip_mask_ |= uint64_t{1} << iter.index_;
-    }
+    void SkipIndex(size_t index) { skip_mask_ |= uint64_t{1} << index; }
+
+    void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }

    bool IsKeySkipped(const Iterator& iter) const {
      return skip_mask_ & (uint64_t{1} << iter.index_);
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -87,6 +87,7 @@ default_params = {
    "partition_filters": lambda: random.randint(0, 1),
    "partition_pinning": lambda: random.randint(0, 3),
    "pause_background_one_in": 1000000,
+    "prefix_size" : lambda: random.choice([-1, 1, 5, 7, 8]),
    "prefixpercent": 5,
    "progress_reports": 0,
    "readpercent": 45,
@ -155,6 +156,8 @@ default_params = {
    "user_timestamp_size": 0,
    "secondary_cache_fault_one_in" : lambda: random.choice([0, 0, 32]),
    "prepopulate_block_cache" : lambda: random.choice([0, 1]),
+    "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]),
+    "memtable_whole_key_filtering": lambda: random.randint(0, 1),
 }

 _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'
@ -240,9 +243,6 @@ simple_default_params = {
    "max_background_compactions": 1,
    "max_bytes_for_level_base": 67108864,
    "memtablerep": "skip_list",
-    "prefixpercent": 0,
-    "readpercent": 50,
-    "prefix_size" : -1,
    "target_file_size_base": 16777216,
    "target_file_size_multiplier": 1,
    "test_batches_snapshots": 0,
@ -399,8 +399,15 @@ def finalize_and_sanitize(src_params):
        dest_params["readpercent"] += dest_params.get("iterpercent", 10)
        dest_params["iterpercent"] = 0
        dest_params["test_batches_snapshots"] = 0
+    if dest_params.get("prefix_size") == -1:
+        dest_params["readpercent"] += dest_params.get("prefixpercent", 20)
+        dest_params["prefixpercent"] = 0
+        dest_params["test_batches_snapshots"] = 0
    if dest_params.get("test_batches_snapshots") == 0:
        dest_params["batch_protection_bytes_per_key"] = 0
+    if (dest_params.get("prefix_size") == -1 and
+        dest_params.get("memtable_whole_key_filtering") == 0):
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
    return dest_params

 def gen_cmd_params(args):
--- a/util/compression.h
+++ b/util/compression.h
@ -724,9 +724,6 @@ inline bool Zlib_Compress(const CompressionInfo& info,
    output_header_len = compression::PutDecompressedSizeInfo(
        output, static_cast<uint32_t>(length));
  }
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(output_header_len + length);

  // The memLevel parameter specifies how much memory should be allocated for
  // the internal compression state.
@ -760,12 +757,17 @@ inline bool Zlib_Compress(const CompressionInfo& info,
    }
  }

+  // Get an upper bound on the compressed size.
+  size_t upper_bound =
+      deflateBound(&_stream, static_cast<unsigned long>(length));
+  output->resize(output_header_len + upper_bound);
+
  // Compress the input, and put compressed data in output.
  _stream.next_in = (Bytef*)input;
  _stream.avail_in = static_cast<unsigned int>(length);

  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.avail_out = static_cast<unsigned int>(upper_bound);
  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);

  bool compressed = false;
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@ -65,7 +65,7 @@ class DynamicBloom {
  // Multithreaded access to this function is OK
  bool MayContain(const Slice& key) const;

-  void MayContain(int num_keys, Slice** keys, bool* may_match) const;
+  void MayContain(int num_keys, Slice* keys, bool* may_match) const;

  // Multithreaded access to this function is OK
  bool MayContainHash(uint32_t hash) const;
@ -120,12 +120,12 @@ inline bool DynamicBloom::MayContain(const Slice& key) const {
  return (MayContainHash(BloomHash(key)));
 }

-inline void DynamicBloom::MayContain(int num_keys, Slice** keys,
+inline void DynamicBloom::MayContain(int num_keys, Slice* keys,
                                     bool* may_match) const {
  std::array<uint32_t, MultiGetContext::MAX_BATCH_SIZE> hashes;
  std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> byte_offsets;
  for (int i = 0; i < num_keys; ++i) {
-    hashes[i] = BloomHash(*keys[i]);
+    hashes[i] = BloomHash(keys[i]);
    size_t a = FastRange32(kLen, hashes[i]);
    PREFETCH(data_ + a, 0, 3);
    byte_offsets[i] = a;
--- a/util/math.h
+++ b/util/math.h
@ -92,18 +92,25 @@ inline int CountTrailingZeroBits(T v) {
 #endif
 }

-#if defined(_MSC_VER) && !defined(_M_X64)
+// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
+// the following code at coarse granularity for simpler macros. It's important
+// to exclude it at least so our non-MSVC unit test coverage tool won't see it.
+#ifdef _MSC_VER
+
 namespace detail {
+
 template <typename T>
 int BitsSetToOneFallback(T v) {
  const int kBits = static_cast<int>(sizeof(T)) * 8;
  static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
  // we static_cast these bit patterns in order to truncate them to the correct
-  // size
+  // size. Warning C4309 dislikes this technique, so disable it here.
+#pragma warning(disable : 4309)
  v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
  v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
                     ((v >> 2) & static_cast<T>(0x3333333333333333ull)));
  v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
+#pragma warning(default : 4309)
  for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
    v += static_cast<T>(v >> shift_bits);
  }
@ -113,7 +120,8 @@ int BitsSetToOneFallback(T v) {
 }

 }  // namespace detail
-#endif
+
+#endif  // _MSC_VER

 // Number of bits set to 1. Also known as "population count".
 template <typename T>
@ -126,21 +134,21 @@ inline int BitsSetToOne(T v) {
    constexpr auto mm = 8 * sizeof(uint32_t) - 1;
    // The bit mask is to neutralize sign extension on small signed types
    constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
-#if defined(_M_X64) || defined(_M_IX86)
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
    return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
 #else
    return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
 #endif
  } else if (sizeof(T) == sizeof(uint32_t)) {
-#if defined(_M_X64) || defined(_M_IX86)
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
    return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
 #else
    return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
 #endif
  } else {
-#ifdef _M_X64
+#if defined(HAVE_SSE42) && defined(_M_X64)
    return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
-#elif defined(_M_IX86)
+#elif defined(HAVE_SSE42) && defined(_M_IX86)
    return static_cast<int>(
        __popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) +
                 __popcnt(static_cast<uint32_t>(v))));
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@ -148,6 +148,78 @@ TEST_P(TransactionTest, SuccessTest) {
  delete txn;
 }

+TEST_P(TransactionTest, SwitchMemtableDuringPrepareAndCommit_WC) {
+  const TxnDBWritePolicy write_policy = std::get<2>(GetParam());
+  // Commits a prepared txn from a sync point callback, then reopens the DB.
+  if (write_policy != TxnDBWritePolicy::WRITE_COMMITTED) {
+    ROCKSDB_GTEST_BYPASS("Test applies to write-committed only");
+    return;
+  }
+
+  ASSERT_OK(db->Put(WriteOptions(), "key0", "value"));
+
+  TransactionOptions txn_opts;
+  txn_opts.use_only_the_last_commit_time_batch_for_recovery = true;
+  Transaction* txn = db->BeginTransaction(WriteOptions(), txn_opts);
+  assert(txn);
+  // Callback: put "gtid" in the commit-time write batch, commit, free txn.
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table", [&](void* arg) {
+        // db mutex not held.
+        auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+        assert(mems);
+        ASSERT_EQ(1, mems->size());
+        auto* ctwb = txn->GetCommitTimeWriteBatch();
+        ASSERT_OK(ctwb->Put("gtid", "123"));
+        ASSERT_OK(txn->Commit());
+        delete txn;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  // Stage key1 and prepare; Commit() itself happens in the callback above.
+  ASSERT_OK(txn->Put("key1", "value"));
+  ASSERT_OK(txn->SetName("txn1"));
+
+  ASSERT_OK(txn->Prepare());
+  // Switch the memtable after Prepare, then flush without waiting inline.
+  auto dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
+  ASSERT_OK(dbimpl->TEST_SwitchMemtable(nullptr));
+  ASSERT_OK(dbimpl->TEST_FlushMemTable(
+      /*wait=*/false, /*allow_write_stall=*/true, /*cfh=*/nullptr));
+
+  ASSERT_OK(dbimpl->TEST_WaitForFlushMemTable());
+  // key1 is expected to be visible, i.e. the callback's commit has run.
+  {
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+    ASSERT_EQ("value", value);
+  }
+  // Close and reopen the DB, then check both keys again.
+  delete db;
+  db = nullptr;
+  Status s;
+  if (use_stackable_db_ == false) {
+    s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  } else {
+    s = OpenWithStackableDB();
+  }
+  ASSERT_OK(s);
+  assert(db);
+
+  {
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), "gtid", &value));
+    ASSERT_EQ("123", value);
+
+    ASSERT_OK(db->Get(ReadOptions(), "key1", &value));
+    ASSERT_EQ("value", value);
+  }
+  // Turn sync point processing off and drop the callback registered above.
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 // The test clarifies the contract of do_validate and assume_tracked
 // in GetForUpdate and Put/Merge/Delete
 TEST_P(TransactionTest, AssumeExclusiveTracked) {