Compare commits


37 Commits
main ... 7.3.fb

Author SHA1 Message Date
Andrea Cavalli
7e2a6a4aa1 Update variables 2022-06-14 21:55:56 +02:00
Andrea Cavalli
80518b9d4e Netty resources 2022-06-14 19:07:55 +02:00
Yanqin Jin
8e0f495253 Update version 2022-06-08 14:08:16 -07:00
Akanksha Mahajan
41fe221a06 Update History.md for #9922 (#10092)
Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/10092

Reviewed By: riversand963

Differential Revision: D36832311

Pulled By: akankshamahajan15

fbshipit-source-id: 8fb1cf90b1d4dddebbfbeebeddb15f6905968e9b
2022-06-08 14:08:16 -07:00
Akanksha Mahajan
405a35f8c3 Persist the new MANIFEST after successfully syncing the new WAL during recovery (#9922)
Summary:
In case of a non-TransactionDB with avoid_flush_during_recovery = true, RocksDB won't
flush the data from the WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
For a TransactionDB (allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions, so min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted WAL, and the "column family inconsistency" error will be hit, causing recovery to fail.

As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL.
If a future recovery starts from the new MANIFEST, then it means the new WAL is successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point.
If a future recovery starts from the old MANIFEST, it means writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
Currently, RocksDB DB::Open() may create and write to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST only after recovery is successful (see the sketch below).
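
A minimal sketch of the ordering this establishes; the function names are hypothetical, not RocksDB's actual internals:
```
// Hedged sketch: buffer version edits during recovery, make the new WAL
// durable, and only then persist the MANIFEST that advances log_numbers.
Status RecoverAndPersistManifest() {
  VersionEdit buffered_edits;  // edits are collected instead of written
  Status s = ReplayWALsAndCollectEdits(&buffered_edits);  // hypothetical
  if (!s.ok()) return s;
  s = CreateAndSyncNewWAL();  // hypothetical; the WAL must be durable first
  if (!s.ok()) return s;
  // Any future recovery that sees this MANIFEST knows the new WAL was
  // successfully synced, so advanced log_numbers are safe to persist.
  return WriteBufferedEditsToNewManifest(buffered_edits);  // hypothetical
}
```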

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9922

Test Plan:
1. Update unit tests to fail without this change
2. make crash_test -j

Branch with unit test and no fix  https://github.com/facebook/rocksdb/pull/9942 to keep track of unit test (without fix)

Reviewed By: riversand963

Differential Revision: D36043701

Pulled By: akankshamahajan15

fbshipit-source-id: 5760970db0a0920fb73d3c054a4155733500acd9
2022-06-08 14:08:16 -07:00
Yanqin Jin
8244f13448 Fix a bug in WAL tracking (#10087)
Summary:
Closing https://github.com/facebook/rocksdb/issues/10080

When `SyncWAL()` calls `MarkLogsSynced()`, even if there is only one active WAL file,
this event should still be added to the MANIFEST.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10087

Test Plan: make check

Reviewed By: ajkr

Differential Revision: D36797580

Pulled By: riversand963

fbshipit-source-id: 24184c9dd606b3939a454ed41de6e868d1519999
2022-06-08 14:08:16 -07:00
Akanksha Mahajan
c8bae6e29c Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833

Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 15075     , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 14425     , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 13062     , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 13649     , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 19384     , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 15406     , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 13670     , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 13855     , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 13953     , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285     , File Name: 000026.sst          , File Operation: Prefetch          , Latency: 59        , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```

For implicit readahead:
```
Access Time : 193351865156587     , File Name: 000026.sst          , File Operation: Prefetch          , Latency: 48        , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354     , File Name: 000026.sst          , File Operation: Prefetch          , Latency: 51        , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253     , File Name: 000026.sst          , File Operation: Prefetch          , Latency: 49        , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461     , File Name: 000026.sst          , File Operation: ReadAsync         , Latency: 222871    , IO Status: OK, Length: 135168, Offset: 401408
```

Reviewed By: anand1976

Differential Revision: D35601634

Pulled By: akankshamahajan15

fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-25 20:31:45 -07:00
Levi Tamasi
07a00828af Fix potential ambiguities in/around port/sys_time.h (#10045)
Summary:
There are some time-related POSIX APIs that are not available on Windows
(e.g. `localtime_r`), which we have worked around by providing our own
implementations in `port/sys_time.h`. This workaround actually relies on
some ambiguity: on Windows, a call to `localtime_r` calls
`ROCKSDB_NAMESPACE::port::localtime_r` (which is pulled into
`ROCKSDB_NAMESPACE` by a using-declaration), while on other platforms
it calls the global `localtime_r`. This works fine as long as there is only one
candidate function; however, it breaks down when there is more than one
`localtime_r` visible in a scope.

The patch fixes this by introducing `ROCKSDB_NAMESPACE::port::{TimeVal, GetTimeOfDay, LocalTimeR}`
to eliminate any ambiguity.
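
A sketch of the before/after shape (illustrative; the exact signatures in the patch may differ, and ToLocalTime is a made-up caller):
```
#include <ctime>

// Before: on Windows, port::localtime_r was pulled into ROCKSDB_NAMESPACE
// by a using-declaration, so an unqualified call
//
//   localtime_r(&seconds, &tm_buf);
//
// could become ambiguous once more than one localtime_r was visible.

// After: a distinctly named, explicitly qualified wrapper leaves no room
// for ambiguity on any platform.
void ToLocalTime(time_t seconds, struct tm* tm_buf) {
  ROCKSDB_NAMESPACE::port::LocalTimeR(&seconds, tm_buf);
}
```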

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10045

Test Plan: `make check`

Reviewed By: riversand963

Differential Revision: D36639372

Pulled By: ltamasi

fbshipit-source-id: fc13dbfa421b7c8918111a6d9e24ce77e91a7c50
2022-05-24 18:25:20 -07:00
anand76
f80bac500e Fix fbcode internal build failure (#10041)
Summary:
The build failed due to different namespaces for coroutines (std::experimental vs std) based on compiler version.
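
A common shape for this kind of fix, shown as an assumption rather than the actual commit contents:
```
// Hedged sketch: alias whichever coroutine namespace the toolchain
// provides, so the rest of the code is written against a single name.
#if defined(__cpp_impl_coroutine)
#include <coroutine>
namespace coro = std;
#else
#include <experimental/coroutine>
namespace coro = std::experimental;
#endif

// Usage: coro::coroutine_handle<>, coro::suspend_always, etc.
```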

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10041

Reviewed By: ltamasi

Differential Revision: D36617212

Pulled By: anand1976

fbshipit-source-id: dfb25320788d32969317d5651173059e2cbd8bd5
2022-05-23 23:19:39 -07:00
Akanksha Mahajan
a479c2c2b2 Fix stress test failure "Corruption: checksum mismatch" or "Iterator Diverged" with async_io enabled (#10032)
Summary:
In case of non-sequential reads with `async_io`, `FilePrefetchBuffer::TryReadFromCacheAsync` can be called for previous blocks with `offset < bufs_[curr_].offset_`, which wasn't handled correctly, resulting in wrong data being returned from the buffer.

Since `FilePrefetchBuffer::PrefetchAsync` can be called for any data block, it sets `prev_len_` to 0, telling `FilePrefetchBuffer::TryReadFromCacheAsync` to go ahead with the prefetching even though `offset < bufs_[curr_].offset_`. This is because async prefetching is always done in the second buffer (to avoid a mutex) even when `curr_` is empty, leading to `offset < bufs_[curr_].offset_` in some cases.
If `prev_len_` is non-zero, `TryReadFromCacheAsync` returns false when `offset < bufs_[curr_].offset_`, indicating the reads are not sequential and the previous call wasn't `PrefetchAsync` (see the sketch below).

- This PR also simplifies `FilePrefetchBuffer::TryReadFromCacheAsync`, which was getting complicated covering different scenarios depending on whether `async_io` is enabled. If `for_compaction` is true, it now calls `FilePrefetchBuffer::TryReadFromCache`, following the synchronous flow as before. This is decided in BlockFetcher.cc.
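
A self-contained sketch of the sequentiality check described above; the member names follow the commit message, not necessarily the real FilePrefetchBuffer code:
```
#include <cstddef>
#include <cstdint>

struct BufferInfo { uint64_t offset_ = 0; };

struct PrefetchBufferSketch {
  BufferInfo bufs_[2];
  int curr_ = 0;
  size_t prev_len_ = 0;  // 0 => the previous call was PrefetchAsync

  bool SequentialEnoughToServe(uint64_t offset) const {
    // A read behind the current buffer is acceptable only when the previous
    // call was PrefetchAsync (prev_len_ == 0), because async prefetch stages
    // data in the second buffer and may leave curr_ empty; otherwise the
    // reads are non-sequential and the buffer must not be used.
    return !(offset < bufs_[curr_].offset_ && prev_len_ != 0);
  }
};
```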

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10032

Test Plan:
1.  export CRASH_TEST_EXT_ARGS=" --async_io=1"
     make crash_test -j completed successfully locally
2. make crash_test -j completed successfully locally
3. Reran CircleCi mini crashtest job 4 - 5 times.
4. Updated prefetch_test for more coverage.

Reviewed By: anand1976

Differential Revision: D36579858

Pulled By: akankshamahajan15

fbshipit-source-id: 0c428d62b45e12e082a83acf533a5e37a584bedf
2022-05-23 12:15:26 -07:00
sdong
bea5831bff Move three info logging within DB Mutex to use log buffer (#10029)
Summary:
Info logging while holding the DB mutex can invoke I/O and cause performance issues. Move three such cases to use a log buffer (see the sketch below).
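
The pattern in question, as a hedged fragment using RocksDB's internal logging helpers (the call sites here are simplified stand-ins, not the three changed by this commit):
```
// LogBuffer accumulates formatted messages in memory; ROCKS_LOG_BUFFER does
// no file I/O, so it is safe to use while holding the DB mutex.
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options.info_log.get());
{
  InstrumentedMutexLock l(&mutex_);
  ROCKS_LOG_BUFFER(&log_buffer, "[%s] Level summary: %s", cf_name.c_str(),
                   level_summary);
}
// The buffered messages are written out only after the mutex is released.
log_buffer.FlushBufferToLog();
```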

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10029

Test Plan: Run existing tests.

Reviewed By: jay-zhuang

Differential Revision: D36561694

fbshipit-source-id: cabb93fea299001a6b4c2802fcba3fde27fa062c
2022-05-23 10:09:37 -07:00
Peter Dillinger
1e4850f626 Java build: finish compiling before testing (etc) (#10034)
Summary:
Lack of ordering dependencies could lead to random
build-linux-java failures with "Truncated class file" because tests
started before compilation was finished. (Fix to java/Makefile)

Also:
* export SHA256_CMD to save copy-paste
* Actually fail if Java sample build fails--which it was in CircleCI
* Don't require Snappy for Java sample build (for more compatibility)
* Remove check_all_python from jtest because it's running in `make
check` builds in CircleCI

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10034

Test Plan: CI, some manual

Reviewed By: jay-zhuang

Differential Revision: D36596541

Pulled By: pdillinger

fbshipit-source-id: 230d79db4b7ae93a366871ff09d0a88e8e1c8af3
2022-05-23 09:56:40 -07:00
Tom Blamer
cb8586003d Add plugin header install in CMakeLists.txt (#10025)
Summary:
Fixes https://github.com/facebook/rocksdb/issues/9987.
- Plugin-specific headers can be specified by setting ${PLUGIN_NAME}_HEADERS in ${PLUGIN_NAME}.mk in the plugin directory.
- This is supported by the Makefile based build, but was missing from CMakeLists.txt.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10025

Test Plan:
- Add a plugin with ${PLUGIN_NAME}_HEADERS set in both ${PLUGIN_NAME}.mk and CMakeLists.txt
- Run Makefile based install and CMake based install and verify installed headers match

Reviewed By: riversand963

Differential Revision: D36584908

Pulled By: ajkr

fbshipit-source-id: 5ea0137205ccbf0d36faacf45d712c5604065bb5
2022-05-23 09:53:36 -07:00
Adam Retter
56ce3aef33 Minimum macOS version needed to build v7.2.2 and up is 10.13 (#9976)
Summary:
Some C++ code changes between version 7.1.2 and 7.2.2 now seem to require at least macOS 10.13 (2017) to build successfully, previously we needed 10.12 (2016). I haven't been able to identify the exact commit.

**NOTE**: This needs to be merged to both `main` and `7.2.fb` branches.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9976

Reviewed By: jay-zhuang

Differential Revision: D36303226

Pulled By: ajkr

fbshipit-source-id: 589ce3ecf821db3402b0876e76d37b407896c945
2022-05-22 15:06:46 -07:00
Levi Tamasi
bed40e7266 Update HISTORY for 7.3 release (#10031)
Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/10031

Reviewed By: riversand963

Differential Revision: D36567741

Pulled By: ltamasi

fbshipit-source-id: 058f8cc856d276db6e1aed07a89ac0b7118c4435
2022-05-21 04:54:36 -07:00
Yanqin Jin
899db56a76 Point libprotobuf-mutator to the latest verified commit hash (#10028)
Summary:
Recent updates to https://github.com/google/libprotobuf-mutator have caused link errors for the RocksDB
CircleCI job 'build-fuzzers'. This PR points the CI to a specific, most recent verified commit hash.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10028

Test Plan: watch for CI to finish.

Reviewed By: pdillinger, jay-zhuang

Differential Revision: D36562517

Pulled By: riversand963

fbshipit-source-id: ba5ef0f9ed6ea6a75aa5dd2768bd5f389ac14f46
2022-05-20 17:16:07 -07:00
Yanqin Jin
f648915b0d Fix a bug of not setting enforce_single_del_contracts (#10027)
Summary:
Before this PR, BuildDBOptions() did not set a newly added option, i.e.
enforce_single_del_contracts, causing OPTIONS files to contain incorrect
information (see the sketch of the fix below).
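
The shape of the fix, as a hedged sketch: BuildDBOptions() assembles a DBOptions from its immutable and mutable parts, and the new field simply needed copying.
```
DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
                         const MutableDBOptions& mutable_db_options) {
  DBOptions options;
  // ... many existing field copies ...
  options.enforce_single_del_contracts =
      immutable_db_options.enforce_single_del_contracts;  // the missing copy
  return options;
}
```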

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10027

Test Plan:
make check
Manually check OPTIONS file.

Reviewed By: ltamasi

Differential Revision: D36556125

Pulled By: riversand963

fbshipit-source-id: e1074715b22c328b68c19e9ad89aa5d67d864bb5
2022-05-20 16:48:50 -07:00
Akanksha Mahajan
2db6a4a1d6 Seek parallelization (#9994)
Summary:
The RocksDB iterator is a hierarchy of iterators. MergingIterator maintains a heap of LevelIterators, one for each L0 file and for each non-zero level. The Seek() operation naturally lends itself to parallelization, as it involves positioning every LevelIterator on the correct data block in the correct SST file. Seek() looks up each level for a target key to find the first key that's >= the target key. This typically involves reading one data block that is likely to contain the target key and scanning forward to find the first valid key. The forward scan may read more data blocks. In order to find the right data block, the iterator may read some metadata blocks (required for opening a file and searching the index).
This flow can be parallelized.

Design: Seek is called twice under the async_io option. The first seek sends asynchronous requests to prefetch the data blocks at each level; the second seek follows the normal flow, and in FilePrefetchBuffer::TryReadFromCacheAsync it waits on Poll() for the results and adds the iterator to the min heap (see the sketch below).
- Status::TryAgain is passed down from FilePrefetchBuffer::PrefetchAsync to block_iter_.Status, indicating that an asynchronous request has been submitted.
- If for some reason the asynchronous request fails to submit, it falls back to sequential reading of blocks in one pass.
- If the data already exists in the prefetch buffer, it returns the data without prefetching further, and it is treated as a single pass of seek.
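
A hedged sketch of the two-pass seek; the names mirror the description above rather than the real MergingIterator code:
```
void SeekWithAsyncPrefetch(const Slice& target) {
  // Pass 1: each child seek submits an async prefetch for the data block
  // that should contain `target`; Status::TryAgain bubbles up from
  // FilePrefetchBuffer::PrefetchAsync to mean "request submitted".
  for (auto* child : children_) {
    child->Seek(target);
    if (!child->status().IsTryAgain()) {
      AddToMinHeapOrCheckStatus(child);  // done in a single pass
    }
  }
  // Pass 2: re-seek the children whose requests were submitted; Poll()
  // waits for completion, then the iterator joins the min heap.
  for (auto* child : children_) {
    if (child->status().IsTryAgain()) {
      child->Seek(target);
      AddToMinHeapOrCheckStatus(child);
    }
  }
}
```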

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9994

Test Plan:
- **Run Regressions.**
```
./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 -use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216
```
i) Previous release 7.0 run for normal prefetching with async_io disabled:
```
./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB:    version 7.0
Date:       Thu Mar 17 13:11:34 2022
CPU:        24 * Intel Core Processor (Broadwell)
CPUCache:   16384 KB
Keys:       32 bytes each (+ 0 bytes user-defined timestamp)
Values:     512 bytes each (256 bytes after compression)
Entries:    5000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    2594.0 MB (estimated)
FileSize:   1373.3 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: Snappy
Compression sampling rate: 0
Memtablerep: SkipListFactory
Perf Level: 1
------------------------------------------------
DB path: [/tmp/prefix_scan_prefetch_main]
seekrandom   :  483618.390 micros/op 2 ops/sec;  338.9 MB/s (249 of 249 found)
```

ii) normal prefetching after changes with async_io disable:
```
./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1
Set seed to 1652922591315307 because --seed was 0
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB:    version 7.3
Date:       Wed May 18 18:09:51 2022
CPU:        32 * Intel Xeon Processor (Skylake)
CPUCache:   16384 KB
Keys:       32 bytes each (+ 0 bytes user-defined timestamp)
Values:     512 bytes each (256 bytes after compression)
Entries:    5000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    2594.0 MB (estimated)
FileSize:   1373.3 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: Snappy
Compression sampling rate: 0
Memtablerep: SkipListFactory
Perf Level: 1
------------------------------------------------
DB path: [/tmp/prefix_scan_prefetch_main]
seekrandom   :  483080.466 micros/op 2 ops/sec 120.287 seconds 249 operations;  340.8 MB/s (249 of 249 found)
```
iii) db_bench with async_io enabled completed successfully

```
./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 -async_io=1 -adaptive_readahead=1
Set seed to 1652924062021732 because --seed was 0
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
RocksDB:    version 7.3
Date:       Wed May 18 18:34:22 2022
CPU:        32 * Intel Xeon Processor (Skylake)
CPUCache:   16384 KB
Keys:       32 bytes each (+ 0 bytes user-defined timestamp)
Values:     512 bytes each (256 bytes after compression)
Entries:    5000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    2594.0 MB (estimated)
FileSize:   1373.3 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: Snappy
Compression sampling rate: 0
Memtablerep: SkipListFactory
Perf Level: 1
------------------------------------------------
DB path: [/tmp/prefix_scan_prefetch_main]
seekrandom   :  553913.576 micros/op 1 ops/sec 120.199 seconds 217 operations;  293.6 MB/s (217 of 217 found)
```

- db_stress with async_io disabled completed successfully
```
 export CRASH_TEST_EXT_ARGS=" --async_io=0"
 make crash_test -j
```

**In Progress**: db_stress with async_io is failing; debugging/fixing it is in progress.

Reviewed By: anand1976

Differential Revision: D36459323

Pulled By: akankshamahajan15

fbshipit-source-id: abb1cd944abe712bae3986ae5b16704b3338917c
2022-05-20 16:09:33 -07:00
anand76
e015206dd6 Fix crash due to MultiGet async IO and direct IO (#10024)
Summary:
MultiGet with async IO is not officially supported with Posix yet. Avoid a crash by using synchronous MultiRead when direct IO is enabled.
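
A hedged sketch of the guard (everything around the check is simplified; the function name is illustrative):
```
// Async MultiRead is not officially supported with the Posix FileSystem
// yet, so direct I/O must take the existing synchronous MultiRead path.
bool UseCoroutineMultiGet(const ReadOptions& read_options,
                          const FSRandomAccessFile* file) {
  return read_options.async_io && !file->use_direct_io();
}
```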

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10024

Test Plan: Run db_crashtest.py manually

Reviewed By: akankshamahajan15

Differential Revision: D36551053

Pulled By: anand1976

fbshipit-source-id: 72190418fa92dd0397e87825df618b12c9bdecda
2022-05-20 12:38:21 -07:00
Changyu Bi
cc23b46da1 Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857)
Summary:
An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API ZDICT_finalizeDictionary() can improve such a dictionary's effectiveness at low cost. This PR changes how the dictionary is created: it calls the ZSTD ZDICT_finalizeDictionary() API instead of creating a raw content dictionary (when max_dict_buffer_bytes > 0), passing in all buffered uncompressed data blocks as samples (see the sketch below).
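
A minimal sketch of the ZSTD call involved (ZDICT_finalizeDictionary is the real zdict.h API; the FinalizeDict wrapper, buffer sizing, and fallback policy shown here are assumptions):
```
#define ZDICT_STATIC_LINKING_ONLY  // may be needed on older zstd versions
#include <zdict.h>

#include <string>
#include <vector>

std::string FinalizeDict(const std::string& raw_content_dict,
                         const std::string& samples,
                         const std::vector<size_t>& sample_sizes,
                         int level, size_t max_dict_bytes) {
  std::string dict(max_dict_bytes, '\0');
  ZDICT_params_t params{};  // zero-initialized => reasonable defaults
  params.compressionLevel = level;
  size_t size = ZDICT_finalizeDictionary(
      &dict[0], dict.size(), raw_content_dict.data(), raw_content_dict.size(),
      samples.data(), sample_sizes.data(),
      static_cast<unsigned>(sample_sizes.size()), params);
  if (ZDICT_isError(size)) return raw_content_dict;  // assumed fallback
  dict.resize(size);
  return dict;
}
```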

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857

Test Plan:
#### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data:
Set up: change the parameter [here](fb9a167a55/tools/db_bench_tool.cc (L1766)) to 16384 to make synthetic data more compressible.
```
# linked local ZSTD with version 1.5.2
# DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1  EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench

dict_bytes=16384
train_bytes=1048576
echo "========== No Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total

echo "========== Raw Content Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total

echo "========== FinalizeDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total

echo "========== TrainDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total

# Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory.
# before compression data size: 1.2GB
dict_bytes=16384
max_dict_buffer_bytes =  1048576
                    space   cpu/memory
No Dictionary       468M    14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k
Raw Dictionary      251M    15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k
FinalizeDictionary  236M    11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k
TrainDictionary     84M     7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k
```

#### Benchmark on 10 sample SST files for spacing saving and CPU time on compression:
FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression.
```
dict_bytes=16384
train_bytes=1048576

for sst_file in `ls ../temp/myrock-sst/`
do
  echo "********** $sst_file **********"
  echo "========== No Dictionary =========="
  ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD

  echo "========== Raw Content Dictionary =========="
  ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes

  echo "========== FinalizeDictionary =========="
  ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict

  echo "========== TrainDictionary =========="
  ./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes
done

                         010240.sst (Size/Time) 011029.sst              013184.sst              021552.sst              185054.sst              185137.sst              191666.sst              7560381.sst             7604174.sst             7635312.sst
No Dictionary           28165569 / 2614419      32899411 / 2976832      32977848 / 3055542      31966329 / 2004590      33614351 / 1755877      33429029 / 1717042      33611933 / 1776936      33634045 / 2771417      33789721 / 2205414      33592194 / 388254
Raw Content Dictionary  28019950 / 2697961      33748665 / 3572422      33896373 / 3534701      26418431 / 2259658      28560825 / 1839168      28455030 / 1846039      28494319 / 1861349      32391599 / 3095649      33772142 / 2407843      33592230 / 474523
FinalizeDictionary      27896012 / 2650029      33763886 / 3719427      33904283 / 3552793      26008225 / 2198033      28111872 / 1869530      28014374 / 1789771      28047706 / 1848300      32296254 / 3204027      33698698 / 2381468      33592344 / 517433
TrainDictionary         28046089 / 2740037      33706480 / 3679019      33885741 / 3629351      25087123 / 2204558      27194353 / 1970207      27234229 / 1896811      27166710 / 1903119      32011041 / 3322315      32730692 / 2406146      33608631 / 570593
```

#### Decompression/Read test:
With FinalizeDictionary/TrainDictionary, some data structures used for decompression are stored in the dictionary, so decompression/reads are expected to be faster.
```
dict_bytes=16384
train_bytes=1048576
echo "No Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s

echo "Raw Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd  -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s

echo "FinalizeDict"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false  > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s

echo "Train Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s

No Dictionary
readrandom   :      12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations;    9.1 MB/s (1000000 of 1000000 found)
Raw Dictionary
readrandom   :      12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations;    9.0 MB/s (1000000 of 1000000 found)
FinalizeDict
readrandom   :       9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations;   11.3 MB/s (1000000 of 1000000 found)
Train Dictionary
readrandom   :       9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations;   11.4 MB/s (1000000 of 1000000 found)
```

Reviewed By: ajkr

Differential Revision: D35720026

Pulled By: cbi42

fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
2022-05-20 12:09:09 -07:00
dependabot[bot]
6255ac7223 Bump nokogiri from 1.13.4 to 1.13.6 in /docs (#10019)
Summary:
Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.13.4 to 1.13.6.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/sparklemotion/nokogiri/releases">nokogiri's releases</a>.</em></p>
<blockquote>
<h2>1.13.6 / 2022-05-08</h2>
<h3>Security</h3>
<ul>
<li>[CRuby] Address <a href="https://nvd.nist.gov/vuln/detail/CVE-2022-29181">CVE-2022-29181</a>, improper handling of unexpected data types, related to untrusted inputs to the SAX parsers. See <a href="https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-xh29-r2w5-wx8m">GHSA-xh29-r2w5-wx8m</a> for more information.</li>
</ul>
<h3>Improvements</h3>
<ul>
<li><code>{HTML4,XML}::SAX::{Parser,ParserContext}</code> constructor methods now raise <code>TypeError</code> instead of segfaulting when an incorrect type is passed.</li>
</ul>
<hr />
<p>sha256:</p>
<pre><code>58417c7c10f78cd1c0e1984f81538300d4ea98962cfd3f46f725efee48f9757a  nokogiri-1.13.6-aarch64-linux.gem
a2b04ec3b1b73ecc6fac619b41e9fdc70808b7a653b96ec97d04b7a23f158dbc  nokogiri-1.13.6-arm64-darwin.gem
4437f2d03bc7da8854f4aaae89e24a98cf5c8b0212ae2bc003af7e65c7ee8e27  nokogiri-1.13.6-java.gem
99d3e212bbd5e80aa602a1f52d583e4f6e917ec594e6aa580f6aacc253eff984  nokogiri-1.13.6-x64-mingw-ucrt.gem
a04f6154a75b6ed4fe2d0d0ff3ac02f094b54e150b50330448f834fa5726fbba  nokogiri-1.13.6-x64-mingw32.gem
a13f30c2863ef9e5e11240dd6d69ef114229d471018b44f2ff60bab28327de4d  nokogiri-1.13.6-x86-linux.gem
63a2ca2f7a4f6bd9126e1695037f66c8eb72ed1e1740ef162b4480c57cc17dc6  nokogiri-1.13.6-x86-mingw32.gem
2b266e0eb18030763277b30dc3d64337f440191e2bd157027441ac56a59d9dfe  nokogiri-1.13.6-x86_64-darwin.gem
3fa37b0c3b5744af45f9da3e4ae9cbd89480b35e12ae36b5e87a0452e0b38335  nokogiri-1.13.6-x86_64-linux.gem
b1512fdc0aba446e1ee30de3e0671518eb363e75fab53486e99e8891d44b8587  nokogiri-1.13.6.gem
</code></pre>
<h2>1.13.5 / 2022-05-04</h2>
<h3>Security</h3>
<ul>
<li>[CRuby] Vendored libxml2 is updated to address <a href="https://nvd.nist.gov/vuln/detail/CVE-2022-29824">CVE-2022-29824</a>. See <a href="https://github.com/sparklemotion/nokogiri/security/advisories/GHSA-cgx6-hpwq-fhv5">GHSA-cgx6-hpwq-fhv5</a> for more information.</li>
</ul>
<h3>Dependencies</h3>
<ul>
<li>[CRuby] Vendored libxml2 is updated from v2.9.13 to <a href="https://gitlab.gnome.org/GNOME/libxml2/-/releases/v2.9.14">v2.9.14</a>.</li>
</ul>
<h3>Improvements</h3>
<ul>
<li>[CRuby] The libxml2 HTML4 parser no longer exhibits quadratic behavior when recovering some broken markup related to start-of-tag and bare <code>&lt;</code> characters.</li>
</ul>
<h3>Changed</h3>
<ul>
<li>[CRuby] The libxml2 HTML4 parser in v2.9.14 recovers from some broken markup differently. Notably, the XML CDATA escape sequence <code>&lt;![CDATA[</code> and incorrectly-opened comments will result in HTML text nodes starting with <code>&amp;lt;!</code> instead of skipping the invalid tag. This behavior is a direct result of the <a href="https://gitlab.gnome.org/GNOME/libxml2/-/commit/798bdf1">quadratic-behavior fix</a> noted above. The behavior of downstream sanitizers relying on this behavior will also change. Some tests describing the changed behavior are in <a href="3ed5bf2b5a/test/html4/test_comments.rb (L187-L204)"><code>test/html4/test_comments.rb</code></a>.</li>
</ul>

</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="b7817b6a62"><code>b7817b6</code></a> version bump to v1.13.6</li>
<li><a href="61b1a395cd"><code>61b1a39</code></a> Merge pull request <a href="https://github-redirect.dependabot.com/sparklemotion/nokogiri/issues/2530">https://github.com/facebook/rocksdb/issues/2530</a> from sparklemotion/flavorjones-check-parse-memory-ty...</li>
<li><a href="83cc451c3f"><code>83cc451</code></a> fix: {HTML4,XML}::SAX::{Parser,ParserContext} check arg types</li>
<li><a href="22c9e5b300"><code>22c9e5b</code></a> version bump to v1.13.5</li>
<li><a href="6155881925"><code>6155881</code></a> doc: update CHANGELOG for v1.13.5</li>
<li><a href="c519a47ab1"><code>c519a47</code></a> Merge pull request <a href="https://github-redirect.dependabot.com/sparklemotion/nokogiri/issues/2527">https://github.com/facebook/rocksdb/issues/2527</a> from sparklemotion/2525-update-libxml-2_9_14-v1_13_x</li>
<li><a href="66c2886e78"><code>66c2886</code></a> dep: update libxml2 to v2.9.14</li>
<li><a href="b7c4cc35de"><code>b7c4cc3</code></a> test: unpend the LIBXML_LOADED_VERSION test on freebsd</li>
<li><a href="eac7934871"><code>eac7934</code></a> dev: require yaml</li>
<li><a href="f3521ba3d3"><code>f3521ba</code></a> style(rubocop): pend Style/FetchEnvVar for now</li>
<li>Additional commits viewable in <a href="https://github.com/sparklemotion/nokogiri/compare/v1.13.4...v1.13.6">compare view</a></li>
</ul>
</details>

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10019

Reviewed By: riversand963

Differential Revision: D36536897

Pulled By: ajkr

fbshipit-source-id: 368c24e86d5d39f0a3adc08a397ae074b1b18b1a
2022-05-20 11:00:15 -07:00
Yu Zhang
16bdb1f999 Add timestamp support to DBImplReadOnly (#10004)
Summary:
This PR adds timestamp support to a read only DB instance opened as `DBImplReadOnly`. A follow up PR will add the same support to `CompactedDBImpl`.

With this, a read-only database has these timestamp-related APIs:

`ReadOptions.timestamp` : a read should return the latest data visible at this specified timestamp
`Iterator::timestamp()` : returns the timestamp associated with the key/value
`DB::Get(..., std::string* timestamp)` : returns the timestamp associated with the key/value in `timestamp` (see the usage sketch below)
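
A usage sketch of these APIs (hedged: it assumes the DB was created with a timestamp-aware comparator; error handling is omitted, and the path and key are placeholders):
```
#include <string>
#include "rocksdb/db.h"

void ReadAtTimestamp(const std::string& path, const std::string& ts) {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;  // must match the comparator used at write time
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, path, &db);

  rocksdb::Slice ts_slice(ts);
  rocksdb::ReadOptions read_opts;
  read_opts.timestamp = &ts_slice;  // return the latest data visible at ts

  rocksdb::PinnableSlice value;
  std::string key_ts;  // receives the timestamp associated with the value
  s = db->Get(read_opts, db->DefaultColumnFamily(), "key", &value, &key_ts);
  delete db;
}
```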

Test plan (on devserver):

```
$ COMPILE_WITH_ASAN=1 make -j24 all
$ ./db_with_timestamp_basic_test --gtest_filter=DBBasicTestWithTimestamp.ReadOnlyDB*
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10004

Reviewed By: riversand963

Differential Revision: D36434422

Pulled By: jowlyzhang

fbshipit-source-id: 5d949e65b1ffb845758000e2b310fdd4aae71cfb
2022-05-19 18:39:41 -07:00
anand76
57997ddaaf Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.

A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.

TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled

No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom :       4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```

Main - ```multireadrandom :       3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```

More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.

1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom :     831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations;    0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom :     318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations;    1.6 MB/s (188248 of 188248 found)```

2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom :       4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations;  125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom :       4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations;  109.4 MB/s (12656176 of 12656176 found)```

3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom :       3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations;  139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom :       4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations;  125.1 MB/s (14472296 of 14472296 found)```

4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom :       4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom :       4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968

Reviewed By: akankshamahajan15

Differential Revision: D36348563

Pulled By: anand1976

fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 15:36:27 -07:00
Bo Wang
5be1579ead Address comments for PR #9988 and #9996 (#10020)
Summary:
1. The latest change of DecideRateLimiterPriority in https://github.com/facebook/rocksdb/pull/9988 is reverted.
2. For https://github.com/facebook/rocksdb/blob/main/db/builder.cc#L345-L349
  2.1. Remove `we will regrad this verification as user reads` from the comments.
  2.2. `Do not set` the read_options.rate_limiter_priority to Env::IO_USER. Flush should be a background job.
  2.3. Update db_rate_limiter_test.cc.
3. In IOOptions, mark `prio` as deprecated for future removal.
4. In `file_system.h`, mark `IOPriority` as deprecated for future removal.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10020

Test Plan: Unit tests.

Reviewed By: ajkr

Differential Revision: D36525317

Pulled By: gitbw95

fbshipit-source-id: 011ba421822f8a124e6d25a2661c4e242df6ad36
2022-05-19 15:23:53 -07:00
Peter Dillinger
280b9f371a Fix auto_prefix_mode performance with partitioned filters (#10012)
Summary:
Essentially refactored the RangeMayExist implementation in
FullFilterBlockReader to FilterBlockReaderCommon so that it applies to
partitioned filters as well. (The function is not called for the
block-based filter case.) RangeMayExist is essentially a series of checks
around a possible PrefixMayExist, and I'm confident those checks should
be the same for partitioned as for full filters. (I think it's likely
that bugs remain in those checks, but this change is overall a simplifying
one.)

Added auto_prefix_mode support to db_bench

Other small fixes as well

Fixes https://github.com/facebook/rocksdb/issues/10003

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10012

Test Plan:
Expanded unit test that uses statistics to check for filter
optimization, fails without the production code changes here

Performance: populate two DBs with
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters
```

Observe no measurable change in non-partitioned performance
```
TEST_TMPDIR=/dev/shm/rocksdb_nonpartitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 15 runs] : 11798 (± 331) ops/sec
After: seekrandom [AVG 15 runs] : 11724 (± 315) ops/sec

Observe big improvement with partitioned (also supported by bloom use statistics)
```
TEST_TMPDIR=/dev/shm/rocksdb_partitioned ./db_bench -benchmarks=seekrandom[-X1000] -num=10000000 -readonly -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -partition_index_and_filters -auto_prefix_mode -cache_index_and_filter_blocks=1 -cache_size=1000000000 -duration 20
```
Before: seekrandom [AVG 12 runs] : 2942 (± 57) ops/sec
After: seekrandom [AVG 12 runs] : 7489 (± 184) ops/sec

Reviewed By: siying

Differential Revision: D36469796

Pulled By: pdillinger

fbshipit-source-id: bcf1e2a68d347b32adb2b27384f945434e7a266d
2022-05-19 13:09:03 -07:00
Jay Zhuang
c6d326d3d7 Track SST unique id in MANIFEST and verify (#9990)
Summary:
Start tracking the SST unique id in the MANIFEST, which is used to verify against
the SST properties to make sure the SST file is not overwritten or
misplaced. A DB option `try_verify_sst_unique_id` is introduced to
enable/disable the verification; if enabled, it opens all SST files
during DB open to read the unique_id from the table properties (default is
false), so it's recommended to use it with `max_open_files = -1` to
pre-open the files (see the usage sketch below).
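
A usage sketch, taking the option name from this commit message (it may differ in later releases):
```
rocksdb::Options options;
options.try_verify_sst_unique_id = true;  // name as given in this message
options.max_open_files = -1;              // pre-open files, as recommended
rocksdb::DB* db = nullptr;
rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);
```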

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9990

Test Plan: unittests, format-compatible test, mini-crash

Reviewed By: anand1976

Differential Revision: D36381863

Pulled By: jay-zhuang

fbshipit-source-id: 89ea2eb6b35ed3e80ead9c724eb096083eaba63f
2022-05-19 11:04:21 -07:00
Hui Xiao
dde774db64 Mark old reserve* option deprecated (#10016)
Summary:
**Context/Summary:**
https://github.com/facebook/rocksdb/pull/9926 removed the inefficient `reserve*` option API but forgot to mark the options deprecated in `block_based_table_type_info` for a compatible table format.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10016

Test Plan: build-format-compatible

Reviewed By: pdillinger

Differential Revision: D36484247

Pulled By: hx235

fbshipit-source-id: c41b90cc99fb7ab7098934052f0af7290b221f98
2022-05-18 22:25:54 -07:00
gitbw95
4da34b97ee Set Read rate limiter priority dynamically and pass it to FS (#9996)
Summary:
### Context:
Background compactions and flushes generate large reads and writes and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.

### Solution
User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile.

**This PR is for the Read path.** The dynamic **Read** priorities for the different states are listed as follows:

| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
|  Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER |
|  Compaction | IO_LOW  | IO_USER | IO_USER |
|  User | User provided | User provided | User provided |

We will respect the read_options that the user provided and will not override them.
The only SST read for Flush is the verification read in BuildTable(), which the code regards as a user read (see the sketch below).

**Details**
1. Set read_options.rate_limiter_priority dynamically:
- User: Do not update the read_options. Use the read_options that the user provided.
- Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction().
- Flush: Update read_options in BuildTable().

2. Pass the rate limiter priority to FSRandomAccessFile functions:
- After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions,  including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock().
- In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch().
- Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator().
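
A hedged sketch encoding the table above (illustrative only; the real code sets read_options.rate_limiter_priority at the flush/compaction call sites rather than through a helper like this):
```
rocksdb::Env::IOPriority ReadPriorityFor(bool is_flush_verification,
                                         rocksdb::WriteStallCondition stall) {
  if (is_flush_verification) {
    return rocksdb::Env::IO_USER;  // verification read in BuildTable()
  }
  // Compaction reads: low priority normally, elevated when writes are
  // delayed or stalled.
  return stall == rocksdb::WriteStallCondition::kNormal
             ? rocksdb::Env::IO_LOW
             : rocksdb::Env::IO_USER;
}
// User reads are left alone: RocksDB uses whatever the caller put in
// read_options.rate_limiter_priority.
```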

### Test Plans
Add unit tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996

Reviewed By: anand1976

Differential Revision: D36452483

Pulled By: gitbw95

fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-18 19:41:44 -07:00
sdong
f1303bf8d8 Remove two tests from platform dependent tests (#10017)
Summary:
Platform-dependent tests sometimes run too long and cause timeouts in Travis. Remove two tests that are less likely to be platform-dependent.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10017

Test Plan: Watch Travis tests.

Reviewed By: pdillinger

Differential Revision: D36486734

fbshipit-source-id: 2a3ad1746791c893a790c2a69a3b70f81e7de260
2022-05-18 16:18:12 -07:00
Yaroslav Stepanchuk
0a43061f8d Remove ROCKSDB_SUPPORT_THREAD_LOCAL define because it's a part of C++11 (#10015)
Summary:
The ROCKSDB_SUPPORT_THREAD_LOCAL definition has been removed.
`__thread` (a #define-selected compiler extension) has been replaced with `thread_local` (the C++11 keyword) across the code base, as in the miniature example below.
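
The mechanical change, in miniature (PerfContext stands in for the various thread-local objects touched by the patch):
```
// Before: compiler extension, usable only when ROCKSDB_SUPPORT_THREAD_LOCAL
// was defined.
//   static __thread PerfContext perf_context;

// After: the standard C++11 keyword; no feature-detection macro needed.
static thread_local PerfContext perf_context;
```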

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10015

Reviewed By: siying

Differential Revision: D36485491

Pulled By: pdillinger

fbshipit-source-id: 6522d212514ee190b90b4e2750c80c7e34013c78
2022-05-18 15:25:19 -07:00
Yanqin Jin
e3a3dbf2be Avoid overwriting options loaded from OPTIONS (#9943)
Summary:
This is similar to https://github.com/facebook/rocksdb/issues/9862, including the following fixes/refactoring:

1. If an OPTIONS file is specified via `-options_file`, the majority of options will be loaded from the file. We should not
overwrite options that have been loaded from the file. Instead, we configure only the fields of options which are
shared objects and not set by the OPTIONS file. We also configure a few fields, e.g. `create_if_missing`, necessary
for the stress test to run.

2. Refactor options initialization into three functions, `InitializeOptionsFromFile()`, `InitializeOptionsFromFlags()`
and `InitializeOptionsGeneral()` similar to db_bench. I hope they can be shared in the future. The high-level logic is
as follows:
```cpp
if (!InitializeOptionsFromFile(...)) {
  InitializeOptionsFromFlags(...);
}
InitializeOptionsGeneral(...);
```

3. Currently, the setting for `block_cache_compressed` does not seem correct because it by default specifies a
size of `numeric_limits<size_t>::max()` ((size_t)-1). According to code comments, `-1` indicates the default value,
which should refer to the `num_shard_bits` argument.

4. Clarify `fail_if_options_file_error`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9943

Test Plan:
1. make check
2. Run stress tests, and manually check generated OPTIONS file and compare them with input OPTIONS files

Reviewed By: jay-zhuang

Differential Revision: D36133769

Pulled By: riversand963

fbshipit-source-id: 35dacdc090a0a72c922907170cd132b9ecaa073e
2022-05-18 12:43:50 -07:00
sdong
a74f14b550 Log error message when LinkFile() is not supported when ingesting files (#10010)
Summary:
Right now, it is opaque to users whether moving a file was skipped because LinkFile() is not supported. Add a log message to help users debug.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10010

Test Plan: Run existing tests. Manually verify the log message is printed out.

Reviewed By: riversand963

Differential Revision: D36463237

fbshipit-source-id: b00bd5041bd5c11afa4e326819c8461ee2c98a91
2022-05-18 11:23:12 -07:00
gitbw95
05c678e135 Set Write rate limiter priority dynamically and pass it to FS (#9988)
Summary:
### Context:
Background compactions and flushes generate large reads and writes and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.

From the RocksDB perspective, there can be two kinds of rate limiters, the internal (native) one and the external one.
- The internal (native) rate limiter is introduced in [the wiki](https://github.com/facebook/rocksdb/wiki/Rate-Limiter). Currently, only IO_LOW and IO_HIGH are used and they are set statically.
- For the external rate limiter, in FSWritableFile functions,  IOOptions is open for end users to set and get rate_limiter_priority for their own rate limiter. Currently, RocksDB doesn’t pass the rate_limiter_priority through IOOptions to the file system.

### Solution
During the User Read, Flush write, Compaction read/write, the WriteController is used to determine whether DB writes are stalled or slowed down. The rate limiter priority (Env::IOPriority) can be determined accordingly. We decided to always pass the priority in IOOptions. What the file system does with it should be a contract between the user and the file system. We would like to set the rate limiter priority at file level, since the Flush/Compaction job level may be too coarse with multiple files and block IO level is too granular.

**This PR is for the Write path.** The dynamic **Write** priorities for the different states are listed as follows:

| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
|  Flush | IO_HIGH | IO_USER | IO_USER |
|  Compaction | IO_LOW | IO_USER | IO_USER |

Flush and Compaction writes share the same call path through BlockBaseTableWriter, WritableFileWriter, and FSWritableFile. When a new FSWritableFile object is created, its io_priority_ can be set dynamically based on the state of the WriteController. In WritableFileWriter, before the call sites of FSWritableFile functions, WritableFileWriter::DecideRateLimiterPriority() determines the rate_limiter_priority. The options (IOOptions) argument of FSWritableFile functions will be updated with the rate_limiter_priority.
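
A hedged sketch mapping the table above onto Env::IOPriority (illustrative; the actual decision lives in WritableFileWriter::DecideRateLimiterPriority() and consults the WriteController state at file creation):
```
rocksdb::Env::IOPriority WritePriorityFor(bool is_flush,
                                          rocksdb::WriteStallCondition stall) {
  if (stall != rocksdb::WriteStallCondition::kNormal) {
    return rocksdb::Env::IO_USER;  // delayed or stalled: do not throttle
  }
  return is_flush ? rocksdb::Env::IO_HIGH : rocksdb::Env::IO_LOW;
}
```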

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9988

Test Plan: Add unit tests.

Reviewed By: anand1976

Differential Revision: D36395159

Pulled By: gitbw95

fbshipit-source-id: a7c82fc29759139a1a07ec46c37dbf7e753474cf
2022-05-18 00:41:41 -07:00
Jay Zhuang
b84e3363f5 Add table_properties_collector_factories override (#9995)
Summary:
Add table_properties_collector_factories override on the remote
side.
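
As a minimal sketch of how a compaction worker would use the override before calling `DB::OpenAndCompact()` (mirroring the test below; `my_factory` stands in for any user factory):

```
#include <rocksdb/options.h>
#include <rocksdb/table_properties.h>

using namespace ROCKSDB_NAMESPACE;

void AddCollectorOverride(
    CompactionServiceOptionsOverride& options_override,
    std::shared_ptr<TablePropertiesCollectorFactory> my_factory) {
  // New in this PR: the remote worker supplies its own collector factories
  // instead of inheriting host-side pointers it cannot use.
  options_override.table_properties_collector_factories.emplace_back(
      std::move(my_factory));
}
```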

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9995

Test Plan: unittest added

Reviewed By: ajkr

Differential Revision: D36392623

Pulled By: jay-zhuang

fbshipit-source-id: 3ba031294d90247ca063d7de7b43178d38e3f66a
2022-05-17 20:57:51 -07:00
Peter Dillinger
0070680cfd Adjust public APIs to prefer 128-bit SST unique ID (#10009)
Summary:
128 bits should suffice almost always, including for tracking in the MANIFEST.

Note that this changes the output of sst_dump --show_properties to only show 128 bits.

Also introduces InternalUniqueIdToHumanString for presenting internal IDs for debugging purposes.
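
As a minimal sketch of the adjusted public API, assuming the declarations in `rocksdb/unique_id.h` as of this PR (error handling elided):

```
#include <cstdio>

#include <rocksdb/table_properties.h>
#include <rocksdb/unique_id.h>

using namespace ROCKSDB_NAMESPACE;

void PrintUniqueIds(const TableProperties& props) {
  std::string id128, id192;
  // The 128-bit form is now the standard size.
  Status s = GetUniqueIdFromTableProperties(props, &id128);
  // The previous 192-bit form remains available under a new name.
  Status s2 = GetExtendedUniqueIdFromTableProperties(props, &id192);
  if (s.ok() && s2.ok()) {
    // The IDs are raw bytes; hex-encode them for display.
    printf("128-bit: %s\n", Slice(id128).ToString(/*hex=*/true).c_str());
    printf("192-bit: %s\n", Slice(id192).ToString(/*hex=*/true).c_str());
  }
}
```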

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10009

Test Plan: unit tests updated

Reviewed By: jay-zhuang

Differential Revision: D36458189

Pulled By: pdillinger

fbshipit-source-id: 93ebc4a3b6f9c73ee154383a1f8b291a5d6bbef5
2022-05-17 18:43:48 -07:00
XieJiSS
8b1df101da fix: build on risc-v (#9215)
Summary:
Patch is modified from ~~https://reviews.llvm.org/file/data/du5ol5zctyqw53ma7dwz/PHID-FILE-knherxziu4tl4erti5ab/file~~

Tested on Arch Linux riscv64gc (qemu)

UPDATE: The above link seems to be broken, so I searched for a link to the original merge request. It turns out that the LLVM folks are cherry-picking from `google/benchmark`, and the upstream should be this:

808571a52f/src/cycleclock.h (L190)
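
For reference, a minimal sketch of the technique the upstream `cycleclock.h` uses on RISC-V (assuming GCC/Clang inline asm on riscv64):

```
#include <cstdint>

// Read the RISC-V cycle CSR, as google/benchmark's cycleclock.h does on
// riscv64. On rv32 the 64-bit counter must be read as two 32-bit halves.
static inline uint64_t CycleClockNow() {
#if defined(__riscv) && __riscv_xlen == 64
  uint64_t cycles;
  asm volatile("rdcycle %0" : "=r"(cycles));
  return cycles;
#else
  return 0;  // placeholder on non-RISC-V targets
#endif
}
```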

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9215

Reviewed By: siying, jay-zhuang

Differential Revision: D34170586

Pulled By: riversand963

fbshipit-source-id: 41b16b9f7f3bb0f3e7b26bb078eb575499c0f0f4
2022-05-17 17:33:01 -07:00
Hui Xiao
3573558ec5 Rewrite memory-charging feature's option API (#9926)
Summary:
**Context:**
Previous PRs https://github.com/facebook/rocksdb/pull/9748, https://github.com/facebook/rocksdb/pull/9073, and https://github.com/facebook/rocksdb/pull/8428 added a separate flag for each charged memory area. Such an API design is not scalable as we charge more and more memory areas. We also foresee an opportunity to consolidate this feature with other cache-usage-related features such as `cache_index_and_filter_blocks` using `CacheEntryRole`.

Therefore we decided to consolidate all these flags with `CacheUsageOptions cache_usage_options` and this PR serves as the first step by consolidating memory-charging related flags.
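
As a minimal sketch of the consolidated API, assuming `BlockBasedTableOptions::cache_usage_options` as introduced here (the cache capacity is illustrative):

```
#include <rocksdb/cache.h>
#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

BlockBasedTableOptions MakeChargedTableOptions() {
  BlockBasedTableOptions table_options;
  table_options.block_cache = NewLRUCache(1 << 30);  // 1 GiB, illustrative
  // One per-role override instead of one boolean flag per memory area.
  table_options.cache_usage_options.options_overrides.insert(
      {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
       {/*charged=*/CacheEntryRoleOptions::Decision::kEnabled}});
  table_options.cache_usage_options.options_overrides.insert(
      {CacheEntryRole::kFilterConstruction,
       {/*charged=*/CacheEntryRoleOptions::Decision::kDisabled}});
  return table_options;
}
```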

**Summary:**
- Replaced old API references with new ones, including making `kCompressionDictionaryBuildingBuffer` opt-out and adding a unit test for that
- Added missing db bench/stress test for some memory charging features
- Renamed related test suite to indicate they are under the same theme of memory charging
- Refactored a commonly used mocked cache component in memory charging related tests to reduce code duplication
- Replaced the phrases "memory tracking" / "cache reservation" (other than CacheReservationManager-related ones) with "memory charging" for standard description of this feature.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9926

Test Plan:
- New unit test for opt-out `kCompressionDictionaryBuildingBuffer` `TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic)`
- New unit test for option validation/sanitization `TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions)`
- CI
- db bench (in case querying new options introduces regression) **+0.5% micros/op**: `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR  -charge_compression_dictionary_building_buffer=1(remove this for comparison)  -compression_max_dict_bytes=10000 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`

#-run | (pre-PR) avg micros/op | std micros/op | (post-PR)  micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | **-0.3633711465**
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | **0.5289363078**

- db_stress: `python3 tools/db_crashtest.py blackbox  -charge_compression_dictionary_building_buffer=1 -charge_filter_construction=1 -charge_table_reader=1 -cache_size=1` killed as normal

Reviewed By: ajkr

Differential Revision: D36054712

Pulled By: hx235

fbshipit-source-id: d406e90f5e0c5ea4dbcb585a484ad9302d4302af
2022-05-17 15:01:51 -07:00
213 changed files with 7624 additions and 3159 deletions

View File

@ -152,8 +152,8 @@ commands:
- run:
name: Install libprotobuf-mutator libs
command: |
git clone --single-branch --branch master --depth 1 git@github.com:google/libprotobuf-mutator.git ~/libprotobuf-mutator
cd ~/libprotobuf-mutator && mkdir build && cd build
git clone -b v1.0 git@github.com:google/libprotobuf-mutator.git ~/libprotobuf-mutator
cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build
cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON
ninja && sudo ninja install
- run:

.github/workflows/build.yml vendored Normal file
View File

@ -0,0 +1,73 @@
name: Check buck targets and code format
on: [push]
jobs:
build:
name: Build
runs-on: ubuntu-20.04
steps:
- name: Checkout feature branch
uses: actions/checkout@v2
- name: Script
shell: bash
run: |
set -xe
REPODIR="$(pwd)"
export GTEST_THROW_ON_FAILURE=0
export GTEST_OUTPUT=\"xml:/tmp/test-results/\"
export SKIP_FORMAT_BUCK_CHECKS=1
export GTEST_COLOR=1
export CTEST_OUTPUT_ON_FAILURE=1
export CTEST_TEST_TIMEOUT=300
export ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib
export BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2
export SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy
export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4
export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd
export DEBIAN_FRONTEND="noninteractive"
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export PATH=$JAVA_HOME/bin:$PATH
# install dev deps
sudo apt-get update -y && sudo apt-get install -y libgflags-dev ninja-build valgrind cmake libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev gnupg wget git make build-essential liblzma-dev openjdk-8-jdk curl unzip tar
# install clang 10
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-10 main" | sudo tee -a /etc/apt/sources.list
echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-10 main" | sudo tee -a /etc/apt/sources.list
echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable
sudo apt-get update -y && sudo apt-get install -y clang-10
# install clang 13
echo "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a /etc/apt/sources.list
echo "deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-13 main" | sudo tee -a /etc/apt/sources.list
echo "APT::Acquire::Retries \"10\";" | sudo tee -a /etc/apt/apt.conf.d/80-retries # llvm.org unreliable
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-get update -y && sudo apt-get install -y clang-13
# install google/benchmark
git clone --depth 1 --branch v1.6.1 https://github.com/google/benchmark.git ~/benchmark
cd ~/benchmark && mkdir build && cd build
cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0
ninja && sudo ninja install
# install google/gtest-parallel
git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
export PATH=$HOME/gtest-parallel:$PATH
# install google/libprotobuf-mutator
git clone --single-branch --branch master --depth 1 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator
cd ~/libprotobuf-mutator && mkdir build && cd build
cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON
ninja && sudo ninja install
export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/usr/local/OFF/:~/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
export PROTOC_BIN=~/libprotobuf-mutator/build/external.protobuf/bin/protoc
# build rocksdb java
cd "$REPODIR"; make V=0 J=8 -j8 rocksdbjavastaticpublish

View File

@ -1,44 +0,0 @@
name: Check buck targets and code format
on: [push, pull_request]
jobs:
check:
name: Check TARGETS file and code format
runs-on: ubuntu-latest
steps:
- name: Checkout feature branch
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Fetch from upstream
run: |
git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream
- name: Where am I
run: |
echo git status && git status
echo "git remote -v" && git remote -v
echo git branch && git branch
- name: Setup Python
uses: actions/setup-python@v1
- name: Install Dependencies
run: python -m pip install --upgrade pip
- name: Install argparse
run: pip install argparse
- name: Download clang-format-diff.py
uses: wei/wget@v1
with:
args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
- name: Check format
run: VERBOSE_CHECK=1 make check-format
- name: Compare buckify output
run: make check-buck-targets
- name: Simple source code checks
run: make check-sources

.gitignore vendored
View File

@ -36,6 +36,7 @@ manifest_dump
sst_dump
blob_dump
block_cache_trace_analyzer
db_readonly_with_timestamp_test
db_with_timestamp_basic_test
tools/block_cache_analyzer/*.pyc
column_aware_encoding_exp
@ -96,3 +97,15 @@ fuzz/crash-*
cmake-build-*
third-party/folly/
*.gz
bzip2-1.0.8/
lz4-1.9.3/
snappy-1.1.8/
zlib-1.2.12/
zstd-1.4.9/

View File

@ -337,9 +337,6 @@ endif()
# Reset the required flags
set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
# thread_local is part of C++11 and later (TODO: clean up this define)
add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL)
option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON)
if (NOT WITH_IOSTATS_CONTEXT)
add_definitions(-DNIOSTATS_CONTEXT)
@ -799,6 +796,7 @@ set(SOURCES
trace_replay/trace_record_result.cc
trace_replay/trace_record.cc
trace_replay/trace_replay.cc
util/async_file_reader.cc
util/cleanable.cc
util/coding.cc
util/compaction_job_stats_impl.cc
@ -1131,6 +1129,12 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS)
install(DIRECTORY include/rocksdb COMPONENT devel DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
foreach (plugin ${PLUGINS})
foreach (header ${${plugin}_HEADERS})
install(FILES plugin/${plugin}/${header} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocksdb/plugin/${plugin})
endforeach()
endforeach()
install(DIRECTORY "${PROJECT_SOURCE_DIR}/cmake/modules" COMPONENT devel DESTINATION ${package_config_destination})
install(
@ -1219,6 +1223,7 @@ if(WITH_TESTS)
db/comparator_db_test.cc
db/corruption_test.cc
db/cuckoo_table_db_test.cc
db/db_readonly_with_timestamp_test.cc
db/db_with_timestamp_basic_test.cc
db/db_block_cache_test.cc
db/db_bloom_filter_test.cc
@ -1387,6 +1392,7 @@ if(WITH_TESTS)
set(TESTUTIL_SOURCE
db/db_test_util.cc
db/db_with_timestamp_test_util.cc
monitoring/thread_status_updater_debug.cc
table/mock_table.cc
utilities/agg_merge/test_agg_merge.cc

View File

@ -1,15 +1,26 @@
# Rocksdb Change Log
## Unreleased
## 7.3.1 (06/08/2022)
### Bug Fixes
* Fix a bug in WAL tracking. Before this PR (#10087), calling `SyncWAL()` on the only WAL file of the DB would not log the event in MANIFEST, thus allowing a subsequent `DB::Open` even if the WAL file is missing or corrupted.
* Fixed a bug for non-TransactionDB with avoid_flush_during_recovery = true and for TransactionDB, where in case of crash, min_log_number_to_keep may not change on recovery, and persisting a new MANIFEST with advanced log_numbers for some column families results in a "column family inconsistency" error on second recovery. As a solution, RocksDB will persist the new MANIFEST after successfully syncing the new WAL. If a future recovery starts from the new MANIFEST, then it means the new WAL was successfully synced. Due to the sentinel empty write batch at the beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. If a future recovery starts from the old MANIFEST, it means writing the new MANIFEST failed. We won't have the "SST ahead of WAL" error.
* Fixed a bug where RocksDB DB::Open() may create and write to two new MANIFEST files even before recovery succeeds. Now writes to MANIFEST are persisted only after recovery is successful.
## 7.3.0 (05/20/2022)
### Bug Fixes
* Fixed a bug where manual flush would block forever even though flush options had wait=false.
* Fixed a bug where RocksDB could corrupt DBs with `avoid_flush_during_recovery == true` by removing valid WALs, leading to `Status::Corruption` with message like "SST file is ahead of WALs" when attempting to reopen.
* Fixed a bug in the async_io path where an incorrect length of data is read by FilePrefetchBuffer if data is consumed from two populated buffers and a request for more data is sent.
* Fixed a CompactionFilter bug. Compaction filter used to use `Delete` to remove keys, even if the keys should be removed with `SingleDelete`. Mixing `Delete` and `SingleDelete` may cause undefined behavior.
* Fixed a bug in `WritableFileWriter::WriteDirect` and `WritableFileWriter::WriteDirectWithChecksum`. The rate_limiter_priority specified in ReadOptions was not passed to the RateLimiter when requesting a token.
* Fixed a bug which might cause process crash when I/O error happens when reading an index block in MultiGet().
### New Features
* DB::GetLiveFilesStorageInfo is ready for production use.
* Add new stats PREFETCHED_BYTES_DISCARDED which records number of prefetched bytes discarded by RocksDB FilePrefetchBuffer on destruction and POLL_WAIT_MICROS records wait time for FS::Poll API completion.
* RemoteCompaction supports table_properties_collector_factories override on compaction worker.
* Start tracking SST unique id in MANIFEST, which will be used to verify with SST properties during DB open to make sure the SST file is not overwritten or misplaced. A db option `verify_sst_unique_id_in_manifest` is introduced to enable/disable the verification; if enabled, all SST files will be opened during DB open to verify the unique id (default is false), so it's recommended to use it with `max_open_files = -1` to pre-open the files.
* Added the ability to concurrently read data blocks from multiple files in a level in batched MultiGet. This can be enabled by setting the async_io option in ReadOptions. Using this feature requires a FileSystem that supports ReadAsync (PosixFileSystem is not supported yet for this), and for RocksDB to be compiled with folly and C++20.
* Added IO tracing support for the FileSystem::ReadAsync API.
### Public API changes
* Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions.
@ -18,6 +29,13 @@
* Renamed CompactionFilter::Decision::kRemoveWithSingleDelete to kPurge since the latter sounds more general and hides the implementation details of how compaction iterator handles keys.
* Added ability to specify functions for Prepare and Validate to OptionsTypeInfo. Added methods to OptionTypeInfo to set the functions via an API. These methods are intended for RocksDB plugin developers for configuration management.
* Added a new immutable db option, enforce_single_del_contracts. If set to false (default is true), compaction will NOT fail due to a single delete followed by a delete for the same key. The purpose of this temporary option is to help existing use cases migrate.
* Introduce `BlockBasedTableOptions::cache_usage_options` and use that to replace `BlockBasedTableOptions::reserve_table_builder_memory` and `BlockBasedTableOptions::reserve_table_reader_memory`.
* Changed `GetUniqueIdFromTableProperties` to return a 128-bit unique identifier, which will be the standard size now. The old functionality (192-bit) is available from `GetExtendedUniqueIdFromTableProperties`. Both functions are no longer "experimental" and are ready for production use.
* In IOOptions, mark `prio` as deprecated for future removal.
* In `file_system.h`, mark `IOPriority` as deprecated for future removal.
* Add an option, `CompressionOptions::use_zstd_dict_trainer`, to indicate whether zstd dictionary trainer should be used for generating zstd compression dictionaries. The default value of this option is true for backward compatibility. When this option is set to false, zstd API `ZDICT_finalizeDictionary` is used to generate compression dictionaries.
* Seek API: every LevelIterator positions itself on the correct data block in the correct SST file; this work can be parallelized if the ReadOptions.async_io option is enabled.
* Add new stat number_async_seek in PerfContext that indicates number of async calls made by seek to prefetch data.
### Bug Fixes
* RocksDB calls the FileSystem::Poll API during FilePrefetchBuffer destruction, which impacts performance as it waits for completion of read requests that are no longer needed. Calling FileSystem::AbortIO to abort those requests instead fixes that performance issue.
@ -26,6 +44,8 @@
### Behavior changes
* Enforce the existing contract of SingleDelete so that SingleDelete cannot be mixed with Delete because it leads to undefined behavior. Fix a number of unit tests that violate the contract but happen to pass.
* ldb `--try_load_options` default to true if `--db` is specified and not creating a new DB, the user can still explicitly disable that by `--try_load_options=false` (or explicitly enable that by `--try_load_options`).
* During Flush write or Compaction write/read, the WriteController is used to determine whether DB writes are stalled or slowed down. The priority (Env::IOPriority) can then be determined accordingly and be passed in IOOptions to the file system.
## 7.2.0 (04/15/2022)
### Bug Fixes

View File

@ -136,6 +136,13 @@ CXXFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL
CFLAGS += $(PLATFORM_SHARED_CFLAGS) -DROCKSDB_DLL
endif
ifeq ($(USE_COROUTINES), 1)
USE_FOLLY = 1
OPT += -DUSE_COROUTINES
ROCKSDB_CXX_STANDARD = c++2a
USE_RTTI = 1
endif
# if we're compiling for release, compile without debug code (-DNDEBUG)
ifeq ($(DEBUG_LEVEL),0)
OPT += -DNDEBUG
@ -226,6 +233,7 @@ dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include make_config.mk
@ -632,7 +640,6 @@ TESTS_PLATFORM_DEPENDENT := \
db_basic_test \
db_blob_basic_test \
db_encryption_test \
db_test2 \
external_sst_file_basic_test \
auto_roll_logger_test \
bloom_test \
@ -654,7 +661,6 @@ TESTS_PLATFORM_DEPENDENT := \
rate_limiter_test \
perf_context_test \
iostats_context_test \
db_wal_test \
# Sort ROCKSDBTESTS_SUBSET for filtering, except db_test is special (expensive)
# so is placed first (out-of-order)
@ -1372,6 +1378,9 @@ db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LI
db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_readonly_with_timestamp_test: $(OBJ_DIR)/db/db_readonly_with_timestamp_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
@ -1867,7 +1876,7 @@ testutil_test: $(OBJ_DIR)/test_util/testutil_test.o $(TEST_LIBRARY) $(LIBRARY)
io_tracer_test: $(OBJ_DIR)/trace_replay/io_tracer_test.o $(OBJ_DIR)/trace_replay/io_tracer.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
prefetch_test: $(OBJ_DIR)/file/prefetch_test.o $(TEST_LIBRARY) $(LIBRARY)
prefetch_test: $(OBJ_DIR)/file/prefetch_test.o $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY)
@ -1990,7 +1999,7 @@ else
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so
endif
endif
ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)
ROCKSDB_JAVA_VERSION ?= $(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-netty
ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar
ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar
@ -2055,6 +2064,7 @@ ifeq ($(PLATFORM), OS_OPENBSD)
ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so
ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar
endif
export SHA256_CMD
zlib-$(ZLIB_VER).tar.gz:
curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz
@ -2187,7 +2197,7 @@ JAR_CMD := jar
endif
endif
rocksdbjavastatic_javalib:
cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib
cd java; $(MAKE) javalib
rm -f java/target/$(ROCKSDBJNILIB)
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \
-o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) \
@ -2275,8 +2285,8 @@ rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastatic
ROCKSDB_JAVA_RELEASE_CLASSIFIERS = javadoc sources linux64 linux32 linux64-musl linux32-musl osx win64
rocksdbjavastaticpublishcentral: rocksdbjavageneratepom
mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);)
mvn gpg:sign-and-deploy-file -Durl=https://mvn.mchv.eu/repository/mchv -DrepositoryId=mchv -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION).jar
$(foreach classifier, $(ROCKSDB_JAVA_RELEASE_CLASSIFIERS), mvn gpg:sign-and-deploy-file -Durl=https://mvn.mchv.eu/repository/mchv -DrepositoryId=mchvg -DpomFile=java/pom.xml -Dfile=java/target/rocksdbjni-$(ROCKSDB_JAVA_VERSION)-$(classifier).jar -Dclassifier=$(classifier);)
rocksdbjavageneratepom:
cd java;cat pom.xml.template | sed 's/\$${ROCKSDB_JAVA_VERSION}/$(ROCKSDB_JAVA_VERSION)/' > pom.xml
@ -2301,7 +2311,7 @@ rocksdbjava: $(LIB_OBJECTS)
ifeq ($(JAVA_HOME),)
$(error JAVA_HOME is not set)
endif
$(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib;
$(AM_V_GEN)cd java; $(MAKE) javalib;
$(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB)
$(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
$(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
@ -2313,14 +2323,13 @@ jclean:
cd java;$(MAKE) clean;
jtest_compile: rocksdbjava
cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) java_test
cd java;$(MAKE) java_test
jtest_run:
cd java;$(MAKE) run_test
jtest: rocksdbjava
cd java;$(MAKE) sample; SHA256_CMD='$(SHA256_CMD)' $(MAKE) test;
$(PYTHON) tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets
cd java;$(MAKE) sample test
jdb_bench:
cd java;$(MAKE) db_bench;

TARGETS
View File

@ -224,6 +224,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"trace_replay/trace_record_handler.cc",
"trace_replay/trace_record_result.cc",
"trace_replay/trace_replay.cc",
"util/async_file_reader.cc",
"util/build_version.cc",
"util/cleanable.cc",
"util/coding.cc",
@ -327,7 +328,13 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"utilities/wal_filter.cc",
"utilities/write_batch_with_index/write_batch_with_index.cc",
"utilities/write_batch_with_index/write_batch_with_index_internal.cc",
], deps=["//folly/container:f14_hash"], headers=None, link_whole=False, extra_test_libs=False)
], deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
], headers=None, link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"cache/cache.cc",
@ -545,6 +552,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"trace_replay/trace_record_handler.cc",
"trace_replay/trace_record_result.cc",
"trace_replay/trace_replay.cc",
"util/async_file_reader.cc",
"util/build_version.cc",
"util/cleanable.cc",
"util/coding.cc",
@ -648,10 +656,17 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"utilities/wal_filter.cc",
"utilities/write_batch_with_index/write_batch_with_index.cc",
"utilities/write_batch_with_index/write_batch_with_index_internal.cc",
], deps=["//folly/container:f14_hash"], headers=None, link_whole=True, extra_test_libs=False)
], deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc",
"db/db_with_timestamp_test_util.cc",
"table/mock_table.cc",
"test_util/mock_time_env.cc",
"test_util/testharness.cc",
@ -5161,6 +5176,12 @@ cpp_unittest_wrapper(name="db_rate_limiter_test",
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_readonly_with_timestamp_test",
srcs=["db/db_readonly_with_timestamp_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_secondary_test",
srcs=["db/db_secondary_test.cc"],
deps=[":rocksdb_test_lib"],

View File

@ -145,7 +145,12 @@ def generate_targets(repo_path, deps_map):
# always add range_tree, it's only excluded on ppc64, which we don't use internally
src_mk["RANGE_TREE_SOURCES"] +
src_mk["TOOL_LIB_SOURCES"],
deps=["//folly/container:f14_hash"])
deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task"])
# rocksdb_whole_archive_lib
TARGETS.add_library(
"rocksdb_whole_archive_lib",
@ -153,7 +158,12 @@ def generate_targets(repo_path, deps_map):
# always add range_tree, it's only excluded on ppc64, which we don't use internally
src_mk["RANGE_TREE_SOURCES"] +
src_mk["TOOL_LIB_SOURCES"],
deps=["//folly/container:f14_hash"],
deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task"],
headers=None,
extra_external_deps="",
link_whole=True)

View File

@ -269,7 +269,7 @@ esac
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS"
JAVAC_ARGS="-source 8"
JAVAC_ARGS="-source 11"
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
@ -662,13 +662,13 @@ else
fi
if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
# For portability compile for macOS 10.12 (2016) or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.12"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.12"
# For portability compile for macOS 10.13 (2017) or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13"
# -mmacosx-version-min must come first here.
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.12 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.12"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.12"
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13"
JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
@ -800,9 +800,6 @@ if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION"
fi
# thread_local is part of C++11 and later (TODO: clean up this define)
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_SUPPORT_THREAD_LOCAL"
if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then
$CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null <<EOF
void dummy_func() {}

View File

@ -20,3 +20,7 @@ VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828d
LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4
BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187
BOOST_BASE=/mnt/gvfs/third-party2/boost/201b7d74941e54b436dfa364a063aa6d2cd7de4c/1.69.0/platform009/8a7ffdf
GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d/
FMT_BASE=/mnt/gvfs/third-party2/fmt/ce0c25f67165f4d2c22a29b8ef50f5600d7873ca/6.1.1/platform009/7f3b187/
DBL_CONV_BASE=/mnt/gvfs/third-party2/double_conversion/109b3d9696d71f1048678cd7da1e22505470543d/20141126/platform009/7f3b187/
LIBEVENT_BASE=/mnt/gvfs/third-party2/libevent/4a4d3a79a76c2439b6bd471bf3586b3481dde75e/1.4.14b_hphp/platform009/7f3b187/

View File

@ -147,7 +147,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"

View File

@ -14,7 +14,7 @@ source "$BASEDIR/dependencies_platform009.sh"
CFLAGS=""
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0"
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
# glibc
@ -70,6 +70,18 @@ BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
BOOST_INCLUDE=" -I $BOOST_BASE/include/"
GLOG_INCLUDE=" -I $GLOG_BASE/include/"
GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a"
FMT_INCLUDE=" -I $FMT_BASE/include/"
FMT_LIBS=" $FMT_BASE/lib/libfmt${MAYBE_PIC}.a"
DBL_CONV_INCLUDE=" -I $DBL_CONV_BASE/include/"
DBL_CONV_LIBS=" $DBL_CONV_BASE/lib/libdouble-conversion${MAYBE_PIC}.a"
LIBEVENT_INCLUDE=" -I $LIBEVENT_BASE/include/"
LIBEVENT_LIBS=" $LIBEVENT_BASE/lib/libevent${MAYBE_PIC}.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
@ -101,7 +113,7 @@ BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
AS="$BINUTILS/as"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $BOOST_INCLUDE"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $BOOST_INCLUDE $GLOG_INCLUDE $FMT_INCLUDE $DBL_CONV_INCLUDE $LIBEVENT_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
@ -144,7 +156,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"

View File

@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"

View File

@ -37,6 +37,7 @@
#include "table/block_based/block_based_table_builder.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/stop_watch.h"
@ -310,6 +311,15 @@ Status BuildTable(
meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
file_checksum = meta->file_checksum;
file_checksum_func_name = meta->file_checksum_func_name;
// Set unique_id only if db_id and db_session_id exist
if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) {
if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id,
meta->fd.GetNumber(), &(meta->unique_id))
.ok()) {
// if failed to get unique id, just set it Null
meta->unique_id = kNullUniqueId64x2;
}
}
}
if (s.ok()) {
@ -331,10 +341,9 @@ Status BuildTable(
if (s.ok() && !empty) {
// Verify that the table is usable
// We set for_compaction to false and don't OptimizeForCompactionTableRead
// here because this is a special case after we finish the table building
// here because this is a special case after we finish the table building.
// No matter whether use_direct_io_for_flush_and_compaction is true,
// we will regrad this verification as user reads since the goal is
// to cache it here for further user reads
// the goal is to cache it here for further user reads.
ReadOptions read_options;
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
read_options, file_options, tboptions.internal_comparator, *meta,

db/c.cc
View File

@ -2855,6 +2855,20 @@ void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
opt->rep.bottommost_compression_opts.enabled = enabled;
}
void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
unsigned char enabled) {
opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
use_zstd_dict_trainer;
opt->rep.bottommost_compression_opts.enabled = enabled;
}
unsigned char
rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt) {
return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
}
void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
unsigned char enabled) {
@ -2882,6 +2896,16 @@ int rocksdb_options_get_compression_options_zstd_max_train_bytes(
return opt->rep.compression_opts.zstd_max_train_bytes;
}
void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
}
unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt) {
return opt->rep.compression_opts.use_zstd_dict_trainer;
}
void rocksdb_options_set_compression_options_parallel_threads(
rocksdb_options_t* opt, int value) {
opt->rep.compression_opts.parallel_threads = value;
@ -3673,6 +3697,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
return rep->env_unlock_file_nanos;
case rocksdb_env_new_logger_nanos:
return rep->env_new_logger_nanos;
case rocksdb_number_async_seek:
return rep->number_async_seek;
default:
break;
}

View File

@ -2539,6 +2539,9 @@ int main(int argc, char** argv) {
200 ==
rocksdb_options_get_compression_options_max_dict_buffer_bytes(co));
rocksdb_options_set_compression_options_use_zstd_dict_trainer(co, 0);
CheckCondition(
0 == rocksdb_options_get_compression_options_use_zstd_dict_trainer(co));
rocksdb_options_destroy(co);
}

View File

@ -136,9 +136,15 @@ Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
}
}
if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
if (!ZSTD_TrainDictionarySupported()) {
if (cf_options.compression_opts.use_zstd_dict_trainer) {
if (!ZSTD_TrainDictionarySupported()) {
return Status::InvalidArgument(
"zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
"is not linked with the binary.");
}
} else if (!ZSTD_FinalizeDictionarySupported()) {
return Status::InvalidArgument(
"zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
"zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
"is not linked with the binary.");
}
if (cf_options.compression_opts.max_dict_bytes == 0) {

View File

@ -64,6 +64,7 @@
#include "table/block_based/block_based_table_factory.h"
#include "table/merging_iterator.h"
#include "table/table_builder.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/hash.h"
@ -1047,6 +1048,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
const Compaction* compaction = sub_compact->compaction;
CompactionServiceInput compaction_input;
compaction_input.output_level = compaction->output_level();
compaction_input.db_id = db_id_;
const std::vector<CompactionInputFiles>& inputs =
*(compact_->compaction->inputs());
@ -1208,6 +1210,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
meta.oldest_ancester_time = file.oldest_ancester_time;
meta.file_creation_time = file.file_creation_time;
meta.marked_for_compaction = file.marked_for_compaction;
meta.unique_id = file.unique_id;
auto cfd = compaction->column_family_data();
sub_compact->outputs.emplace_back(std::move(meta),
@ -1350,7 +1353,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
ReadOptions read_options;
read_options.verify_checksums = true;
read_options.fill_cache = false;
read_options.rate_limiter_priority = Env::IO_LOW;
read_options.rate_limiter_priority = GetRateLimiterPriority();
// Compaction iterators shouldn't be confined to a single prefix.
// Compactions use Seek() for
// (a) concurrent compactions,
@ -2112,11 +2115,11 @@ Status CompactionJob::InstallCompactionResults(
{
Compaction::InputLevelSummaryBuffer inputs_summary;
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
compaction->column_family_data()->GetName().c_str(), job_id_,
compaction->InputLevelSummary(&inputs_summary),
compact_->total_bytes + compact_->total_blob_bytes);
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
compaction->column_family_data()->GetName().c_str(),
job_id_, compaction->InputLevelSummary(&inputs_summary),
compact_->total_bytes + compact_->total_blob_bytes);
}
VersionEdit* const edit = compaction->edit();
@ -2277,6 +2280,18 @@ Status CompactionJob::OpenCompactionOutputFile(
meta.oldest_ancester_time = oldest_ancester_time;
meta.file_creation_time = current_time;
meta.temperature = temperature;
assert(!db_id_.empty());
assert(!db_session_id_.empty());
s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
&meta.unique_id);
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
"[%s] [JOB %d] file #%" PRIu64
" failed to generate unique id: %s.",
cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
s.ToString().c_str());
return s;
}
sub_compact->outputs.emplace_back(
std::move(meta), cfd->internal_comparator(),
/*enable_order_check=*/
@ -2285,7 +2300,7 @@ Status CompactionJob::OpenCompactionOutputFile(
/*enable_hash=*/paranoid_file_checks_);
}
writable_file->SetIOPriority(Env::IOPriority::IO_LOW);
writable_file->SetIOPriority(GetRateLimiterPriority());
writable_file->SetWriteLifeTimeHint(write_hint_);
FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
writable_file->SetPreallocationBlockSize(static_cast<size_t>(
@ -2476,6 +2491,19 @@ std::string CompactionJob::GetTableFileName(uint64_t file_number) {
file_number, compact_->compaction->output_path_id());
}
Env::IOPriority CompactionJob::GetRateLimiterPriority() {
if (versions_ && versions_->GetColumnFamilySet() &&
versions_->GetColumnFamilySet()->write_controller()) {
WriteController* write_controller =
versions_->GetColumnFamilySet()->write_controller();
if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
return Env::IO_USER;
}
}
return Env::IO_LOW;
}
#ifndef ROCKSDB_LITE
std::string CompactionServiceCompactionJob::GetTableFileName(
uint64_t file_number) {
@ -2596,7 +2624,7 @@ Status CompactionServiceCompactionJob::Run() {
meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
meta.largest.Encode().ToString(), meta.oldest_ancester_time,
meta.file_creation_time, output_file.validator.GetHash(),
meta.marked_for_compaction);
meta.marked_for_compaction, meta.unique_id);
}
compaction_result_->num_output_records = sub_compact->num_output_records;
compaction_result_->total_bytes = sub_compact->total_bytes;
@ -2700,6 +2728,9 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
{"output_level",
{offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
{"db_id",
{offsetof(struct CompactionServiceInput, db_id),
OptionType::kEncodedString}},
{"has_begin",
{offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
@ -2757,6 +2788,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"unique_id",
OptionTypeInfo::Array<uint64_t, 2>(
offsetof(struct CompactionServiceOutputFile, unique_id),
OptionVerificationType::kNormal, OptionTypeFlags::kNone,
{0, OptionType::kUInt64T})},
};
static std::unordered_map<std::string, OptionTypeInfo>

View File

@ -137,6 +137,8 @@ class CompactionJob {
IOStatus io_status_;
private:
friend class CompactionJobTestBase;
// Generates a histogram representing potential divisions of key ranges from
// the input. It adds the starting and/or ending keys of certain input files
// to the working set and then finds the approximate size of data in between
@ -234,6 +236,10 @@ class CompactionJob {
// Get table file name in where it's outputting to, which should also be in
// `output_directory_`.
virtual std::string GetTableFileName(uint64_t file_number);
// The rate limiter priority (io_priority) is determined dynamically here.
// The Compaction Read and Write priorities are the same for different
// scenarios, such as write stalled.
Env::IOPriority GetRateLimiterPriority();
};
// CompactionServiceInput is used the pass compaction information between two
@ -253,6 +259,9 @@ struct CompactionServiceInput {
std::vector<std::string> input_files;
int output_level;
// db_id is used to generate unique id of sst on the remote compactor
std::string db_id;
// information for subcompaction
bool has_begin = false;
std::string begin;
@ -284,13 +293,15 @@ struct CompactionServiceOutputFile {
uint64_t file_creation_time;
uint64_t paranoid_hash;
bool marked_for_compaction;
UniqueId64x2 unique_id;
CompactionServiceOutputFile() = default;
CompactionServiceOutputFile(
const std::string& name, SequenceNumber smallest, SequenceNumber largest,
std::string _smallest_internal_key, std::string _largest_internal_key,
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
uint64_t _paranoid_hash, bool _marked_for_compaction)
uint64_t _paranoid_hash, bool _marked_for_compaction,
UniqueId64x2 _unique_id)
: file_name(name),
smallest_seqno(smallest),
largest_seqno(largest),
@ -299,7 +310,8 @@ struct CompactionServiceOutputFile {
oldest_ancester_time(_oldest_ancester_time),
file_creation_time(_file_creation_time),
paranoid_hash(_paranoid_hash),
marked_for_compaction(_marked_for_compaction) {}
marked_for_compaction(_marked_for_compaction),
unique_id(std::move(_unique_id)) {}
};
// CompactionServiceResult contains the compaction result from a different db

View File

@ -27,6 +27,7 @@
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/mock_table.h"
#include "table/unique_id_impl.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"
@ -206,7 +207,7 @@ class CompactionJobTestBase : public testing::Test {
oldest_blob_file_number, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
mutex_.Lock();
EXPECT_OK(
@ -321,7 +322,8 @@ class CompactionJobTestBase : public testing::Test {
const std::vector<SequenceNumber>& snapshots = {},
SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
int output_level = 1, bool verify = true,
uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber,
bool check_get_priority = false) {
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
size_t num_input_files = 0;
@ -359,8 +361,8 @@ class CompactionJobTestBase : public testing::Test {
table_cache_, &event_logger, false, false, dbname_,
&compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
/*manual_compaction_paused=*/nullptr,
/*manual_compaction_canceled=*/nullptr, /*db_id=*/"",
/*db_session_id=*/"", full_history_ts_low_);
/*manual_compaction_canceled=*/nullptr, env_->GenerateUniqueId(),
DBImpl::GenerateDbSessionId(nullptr), full_history_ts_low_);
VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
compaction_job.Prepare();
@ -390,6 +392,32 @@ class CompactionJobTestBase : public testing::Test {
expected_oldest_blob_file_number);
}
}
if (check_get_priority) {
CheckGetRateLimiterPriority(compaction_job);
}
}
void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
// When the state from WriteController is normal.
ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW);
WriteController* write_controller =
compaction_job.versions_->GetColumnFamilySet()->write_controller();
{
// When the state from WriteController is Delayed.
std::unique_ptr<WriteControllerToken> delay_token =
write_controller->GetDelayToken(1000000);
ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
}
{
// When the state from WriteController is Stopped.
std::unique_ptr<WriteControllerToken> stop_token =
write_controller->GetStopToken();
ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
}
}
std::shared_ptr<Env> env_guard_;
@ -1227,13 +1255,14 @@ TEST_F(CompactionJobTest, ResultSerialization) {
result.status =
status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
while (!rnd.OneIn(10)) {
UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
result.output_files.emplace_back(
rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
rnd64.Uniform(UINT64_MAX),
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
rnd64.Uniform(UINT64_MAX), rnd.OneIn(2));
rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
}
result.output_level = rnd.Uniform(10);
result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
@ -1261,6 +1290,16 @@ TEST_F(CompactionJobTest, ResultSerialization) {
ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch));
ASSERT_EQ(mismatch, "stats.num_input_files");
// Test unique id mismatch
if (!result.output_files.empty()) {
CompactionServiceResult deserialized_tmp;
ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp));
deserialized_tmp.output_files[0].unique_id[0] += 1;
ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
ASSERT_EQ(mismatch, "output_files.unique_id");
deserialized_tmp.status.PermitUncheckedError();
}
// Test unknown field
CompactionServiceResult deserialized2;
output.clear();
@ -1303,6 +1342,17 @@ TEST_F(CompactionJobTest, ResultSerialization) {
}
}
TEST_F(CompactionJobTest, GetRateLimiterPriority) {
NewDB();
auto expected_results = CreateTwoFiles(false);
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
auto files = cfd->current()->storage_info()->LevelFiles(0);
ASSERT_EQ(2U, files.size());
RunCompaction({files}, expected_results, {}, kMaxSequenceNumber, 1, true,
kInvalidBlobFileNumber, true);
}
class CompactionJobTimestampTest : public CompactionJobTestBase {
public:
CompactionJobTimestampTest()

View File

@ -12,6 +12,7 @@
#include "db/compaction/compaction_picker_level.h"
#include "db/compaction/compaction_picker_universal.h"
#include "db/compaction/file_pri.h"
#include "table/unique_id_impl.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"
@ -115,7 +116,7 @@ class CompactionPickerTest : public testing::Test {
largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
f->oldest_ancester_time = oldest_ancestor_time;

View File

@ -15,13 +15,17 @@ class MyTestCompactionService : public CompactionService {
MyTestCompactionService(
std::string db_path, Options& options,
std::shared_ptr<Statistics>& statistics,
std::vector<std::shared_ptr<EventListener>>& listeners)
std::vector<std::shared_ptr<EventListener>>& listeners,
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories)
: db_path_(std::move(db_path)),
options_(options),
statistics_(statistics),
start_info_("na", "na", "na", 0, Env::TOTAL),
wait_info_("na", "na", "na", 0, Env::TOTAL),
listeners_(listeners) {}
listeners_(listeners),
table_properties_collector_factories_(
std::move(table_properties_collector_factories)) {}
static const char* kClassName() { return "MyTestCompactionService"; }
@ -78,6 +82,11 @@ class MyTestCompactionService : public CompactionService {
options_override.listeners = listeners_;
}
if (!table_properties_collector_factories_.empty()) {
options_override.table_properties_collector_factories =
table_properties_collector_factories_;
}
OpenAndCompactOptions options;
options.canceled = &canceled_;
@ -141,6 +150,8 @@ class MyTestCompactionService : public CompactionService {
bool is_override_wait_result_ = false;
std::string override_wait_result_;
std::vector<std::shared_ptr<EventListener>> listeners_;
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories_;
std::atomic_bool canceled_{false};
};
@ -157,7 +168,8 @@ class CompactionServiceTest : public DBTestBase {
compactor_statistics_ = CreateDBStatistics();
compaction_service_ = std::make_shared<MyTestCompactionService>(
dbname_, *options, compactor_statistics_, remote_listeners);
dbname_, *options, compactor_statistics_, remote_listeners,
remote_table_properties_collector_factories);
options->compaction_service = compaction_service_;
DestroyAndReopen(*options);
}
@ -206,6 +218,8 @@ class CompactionServiceTest : public DBTestBase {
}
std::vector<std::shared_ptr<EventListener>> remote_listeners;
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
remote_table_properties_collector_factories;
private:
std::shared_ptr<Statistics> compactor_statistics_;
@ -274,6 +288,16 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
auto s = static_cast<Status*>(status);
*s = Status::Aborted("MyTestCompactionService failed to compact!");
});
// tracking success unique id verification
std::atomic_int verify_passed{0};
SyncPoint::GetInstance()->SetCallBack(
"Version::VerifySstUniqueIds::Passed", [&](void* arg) {
// override job status
auto id = static_cast<std::string*>(arg);
assert(!id->empty());
verify_passed++;
});
SyncPoint::GetInstance()->EnableProcessing();
Status s;
@ -298,6 +322,12 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
}
}
ASSERT_TRUE(s.IsAborted());
// Test verification
ASSERT_EQ(verify_passed, 0);
options.verify_sst_unique_id_in_manifest = true;
Reopen(options);
ASSERT_GT(verify_passed, 0);
}
TEST_F(CompactionServiceTest, ManualCompaction) {
@ -827,6 +857,96 @@ TEST_F(CompactionServiceTest, RemoteEventListener) {
}
}
TEST_F(CompactionServiceTest, TablePropertiesCollector) {
const static std::string kUserPropertyName = "TestCount";
class TablePropertiesCollectorTest : public TablePropertiesCollector {
public:
Status Finish(UserCollectedProperties* properties) override {
*properties = UserCollectedProperties{
{kUserPropertyName, std::to_string(count_)},
};
return Status::OK();
}
UserCollectedProperties GetReadableProperties() const override {
return UserCollectedProperties();
}
const char* Name() const override { return "TablePropertiesCollectorTest"; }
Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
EntryType /*type*/, SequenceNumber /*seq*/,
uint64_t /*file_size*/) override {
count_++;
return Status::OK();
}
private:
uint32_t count_ = 0;
};
class TablePropertiesCollectorFactoryTest
: public TablePropertiesCollectorFactory {
public:
TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context /*context*/) override {
return new TablePropertiesCollectorTest();
}
const char* Name() const override {
return "TablePropertiesCollectorFactoryTest";
}
};
auto factory = new TablePropertiesCollectorFactoryTest();
remote_table_properties_collector_factories.emplace_back(factory);
const int kNumSst = 3;
const int kLevel0Trigger = 4;
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = kLevel0Trigger;
ReopenWithCompactionService(&options);
// generate a few SSTs locally which should not have user property
for (int i = 0; i < kNumSst; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
TablePropertiesCollection fname_to_props;
ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
for (const auto& file_props : fname_to_props) {
auto properties = file_props.second->user_collected_properties;
auto it = properties.find(kUserPropertyName);
ASSERT_EQ(it, properties.end());
}
// trigger compaction
for (int i = kNumSst; i < kLevel0Trigger; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
bool has_user_property = false;
for (const auto& file_props : fname_to_props) {
auto properties = file_props.second->user_collected_properties;
auto it = properties.find(kUserPropertyName);
if (it != properties.end()) {
has_user_property = true;
ASSERT_GT(std::stoi(it->second), 0);
}
}
ASSERT_TRUE(has_user_property);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

View File

@ -1065,7 +1065,7 @@ INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
CloseDb();
Options options;
options.track_and_verify_wals_in_manifest =
@ -1107,7 +1107,8 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
// number. TEST_SwitchMemtable makes sure WALs are not synced, so the test
// can corrupt an un-synced WAL.
for (int i = 0; i < 2; ++i) {
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
"value" + std::to_string(i)));
ASSERT_OK(dbimpl->TEST_SwitchMemtable());
}
@ -1188,6 +1189,23 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
// Verify that data is not lost.
{
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
ASSERT_EQ("dontcare", v);
v.clear();
ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v));
ASSERT_EQ("value" + std::to_string(0), v);
// Since the second-to-last WAL is corrupted, the key below is not found.
v.clear();
ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v),
Status::NotFound());
}
for (auto* h : handles) {
delete h;
}
@ -1219,8 +1237,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) {
// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest,
DISABLED_TxnDbCrashDuringRecovery) {
TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
CloseDb();
Options options;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
@ -1271,13 +1288,14 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
// Put and flush cf0
for (int i = 0; i < 2; ++i) {
ASSERT_OK(txn_db->Put(WriteOptions(), "dontcare", "value"));
ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i),
"value" + std::to_string(i)));
ASSERT_OK(dbimpl->TEST_SwitchMemtable());
}
// Put cf1
txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
ASSERT_OK(txn->Put(handles[1], "foo1", "value"));
ASSERT_OK(txn->Put(handles[1], "foo1", "value1"));
ASSERT_OK(txn->Commit());
delete txn;
@ -1337,7 +1355,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
std::vector<uint64_t> file_nums;
GetSortedWalFiles(file_nums);
size_t size = file_nums.size();
assert(size >= 2);
uint64_t log_num = file_nums[size - 1];
CorruptFileWithTruncation(FileType::kWalFile, log_num);
}
@ -1354,6 +1371,27 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
{
ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
&handles, &txn_db));
// Verify that data is not lost.
{
std::string v;
// Key not visible since it's not committed.
ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v),
Status::NotFound());
v.clear();
ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v));
ASSERT_EQ("value" + std::to_string(0), v);
// The last WAL, which contains the two keys below, is corrupted.
v.clear();
ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v),
Status::NotFound());
v.clear();
ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v),
Status::NotFound());
}
for (auto* h : handles) {
delete h;
}
@ -1396,8 +1434,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
// The combination of corrupting a WAL and injecting an error during subsequent
// re-open exposes the bug of prematurely persisting a new MANIFEST with
// advanced ColumnFamilyData::log_number.
TEST_P(CrashDuringRecoveryWithCorruptionTest,
DISABLED_CrashDuringRecoveryWithFlush) {
TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) {
CloseDb();
Options options;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
@ -1430,7 +1467,8 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
// Write to default_cf and flush this cf several times to advance wal
// number.
for (int i = 0; i < 2; ++i) {
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value"));
ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
"value" + std::to_string(i)));
ASSERT_OK(db_->Flush(FlushOptions()));
}
@ -1483,6 +1521,25 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest,
{
options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
// Verify that data is not lost.
{
std::string v;
ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
ASSERT_EQ("dontcare", v);
for (int i = 0; i < 2; ++i) {
v.clear();
ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
ASSERT_EQ("value" + std::to_string(i), v);
}
// Since the last WAL (written after Flush) is corrupted, the key below is not found.
v.clear();
ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
Status::NotFound());
}
for (auto* h : handles) {
delete h;
}

View File

@ -12,6 +12,7 @@
#include "db/db_test_util.h"
#include "options/options_helper.h"
#include "port/stack_trace.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/flush_block_policy.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/perf_context.h"
@ -1180,10 +1181,17 @@ TEST_F(DBBasicTest, DBCloseFlushError) {
Destroy(options);
}
class DBMultiGetTestWithParam : public DBBasicTest,
public testing::WithParamInterface<bool> {};
class DBMultiGetTestWithParam
: public DBBasicTest,
public testing::WithParamInterface<std::tuple<bool, bool>> {};
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
Options options = CurrentOptions();
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
"alyosha", "popovich"},
@ -1240,7 +1248,8 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
keys.push_back(std::get<1>(cf_kv_vec[i]));
}
values = MultiGet(cfs, keys, nullptr, GetParam());
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
std::get<1>(GetParam()));
ASSERT_EQ(values.size(), num_keys);
for (unsigned int j = 0; j < values.size(); ++j) {
ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
@ -1254,7 +1263,8 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
keys.push_back(std::get<1>(cf_kv_vec[3]));
cfs.push_back(std::get<0>(cf_kv_vec[4]));
keys.push_back(std::get<1>(cf_kv_vec[4]));
values = MultiGet(cfs, keys, nullptr, GetParam());
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
std::get<1>(GetParam()));
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
@ -1267,7 +1277,8 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
keys.push_back(std::get<1>(cf_kv_vec[6]));
cfs.push_back(std::get<0>(cf_kv_vec[1]));
keys.push_back(std::get<1>(cf_kv_vec[1]));
values = MultiGet(cfs, keys, nullptr, GetParam());
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
std::get<1>(GetParam()));
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
@ -1283,6 +1294,12 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
}
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
Options options = CurrentOptions();
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
"alyosha", "popovich"},
@ -1328,7 +1345,8 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
keys.push_back("cf" + std::to_string(i) + "_key");
}
values = MultiGet(cfs, keys, nullptr, GetParam());
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
std::get<1>(GetParam()));
ASSERT_TRUE(last_try);
ASSERT_EQ(values.size(), 8);
for (unsigned int j = 0; j < values.size(); ++j) {
@ -1345,6 +1363,12 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
}
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
Options options = CurrentOptions();
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
"alyosha", "popovich"},
@ -1389,7 +1413,8 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
}
const Snapshot* snapshot = db_->GetSnapshot();
values = MultiGet(cfs, keys, snapshot, GetParam());
values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()),
std::get<1>(GetParam()));
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(values.size(), 8);
for (unsigned int j = 0; j < values.size(); ++j) {
@ -1405,6 +1430,12 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
}
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
Options options = CurrentOptions();
CreateAndReopenWithCF({"one", "two"}, options);
@ -1417,8 +1448,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
std::vector<std::string> keys{"foo", "baz", "abc"};
std::vector<std::string> values;
values =
MultiGet(cfs, keys, /* snapshot */ nullptr, /* batched */ GetParam());
values = MultiGet(cfs, keys, /* snapshot */ nullptr,
/* batched */ std::get<0>(GetParam()),
/* async */ std::get<1>(GetParam()));
ASSERT_EQ(values.size(), 3);
ASSERT_EQ(values[0], "bar");
@ -1426,10 +1458,18 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
ASSERT_EQ(values[2], "def");
}
INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
testing::Bool());
TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
do {
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
SetPerfLevel(kEnableCount);
@ -1448,8 +1488,10 @@ TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
std::vector<Status> s(keys.size());
db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
values.data(), s.data(), false);
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
s.data(), false);
ASSERT_EQ(values.size(), keys.size());
ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
@ -1470,7 +1512,18 @@ TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) {
} while (ChangeCompactOptions());
}
TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
do {
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
SetPerfLevel(kEnableCount);
@ -1493,8 +1546,10 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) {
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
std::vector<Status> s(keys.size());
db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
values.data(), s.data(), true);
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
s.data(), true);
ASSERT_EQ(values.size(), keys.size());
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
@ -1515,7 +1570,18 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) {
} while (ChangeOptions());
}
TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
Options opts = CurrentOptions();
opts.merge_operator = MergeOperators::CreateStringAppendOperator();
CreateAndReopenWithCF({"pikachu"}, opts);
@ -1546,8 +1612,10 @@ TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) {
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
std::vector<Status> s(keys.size());
db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
values.data(), s.data(), false);
ReadOptions ro;
ro.async_io = std::get<1>(GetParam());
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
s.data(), false);
ASSERT_EQ(values.size(), keys.size());
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
@ -1566,7 +1634,18 @@ TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) {
SetPerfLevel(kDisable);
}
TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
Options options = CurrentOptions();
options.disable_auto_compactions = true;
Reopen(options);
@ -1625,7 +1704,7 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
keys.push_back("key_" + std::to_string(i));
}
values = MultiGet(keys, nullptr);
values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
ASSERT_EQ(values.size(), 16);
for (unsigned int j = 0; j < values.size(); ++j) {
int key = j + 64;
@ -1641,7 +1720,18 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
}
}
TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.merge_operator = MergeOperators::CreateStringAppendOperator();
@ -1705,7 +1795,7 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) {
keys.push_back("key_" + std::to_string(i));
}
values = MultiGet(keys, nullptr);
values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
ASSERT_EQ(values.size(), keys.size());
for (unsigned int j = 0; j < 48; ++j) {
int key = j + 32;
@ -1727,7 +1817,18 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) {
}
}
TEST_F(DBBasicTest, MultiGetBatchedValueSizeInMemory) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
SetPerfLevel(kEnableCount);
ASSERT_OK(Put(1, "k1", "v_1"));
@ -1744,6 +1845,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeInMemory) {
get_perf_context()->Reset();
ReadOptions ro;
ro.value_size_soft_limit = 11;
ro.async_io = std::get<1>(GetParam());
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
s.data(), false);
@ -1761,7 +1863,17 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeInMemory) {
SetPerfLevel(kDisable);
}
TEST_F(DBBasicTest, MultiGetBatchedValueSize) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
do {
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
SetPerfLevel(kEnableCount);
@ -1801,6 +1913,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) {
ReadOptions ro;
ro.value_size_soft_limit = 20;
ro.async_io = std::get<1>(GetParam());
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
s.data(), false);
@ -1836,7 +1949,18 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) {
} while (ChangeCompactOptions());
}
TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) {
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
#ifndef USE_COROUTINES
if (std::get<1>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
// Skip for unbatched MultiGet
if (!std::get<0>(GetParam())) {
ROCKSDB_GTEST_SKIP("This test is only for batched MultiGet");
return;
}
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.merge_operator = MergeOperators::CreateStringAppendOperator();
@ -1908,6 +2032,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) {
ReadOptions read_options;
read_options.verify_checksums = true;
read_options.value_size_soft_limit = 380;
read_options.async_io = std::get<1>(GetParam());
db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
@ -1939,6 +2064,217 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) {
}
}
INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
testing::Combine(testing::Bool(), testing::Bool()));
#if USE_COROUTINES
class DBMultiGetAsyncIOTest : public DBBasicTest {
public:
DBMultiGetAsyncIOTest()
: DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(NewBloomFilterPolicy(10));
Options options = CurrentOptions();
options.disable_auto_compactions = true;
options.statistics = statistics_;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
Reopen(options);
int num_keys = 0;
// Put all keys in the bottommost level, and overwrite some keys
// in L0 and L1
for (int i = 0; i < 128; ++i) {
EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
num_keys++;
if (num_keys == 8) {
EXPECT_OK(Flush());
num_keys = 0;
}
}
if (num_keys > 0) {
EXPECT_OK(Flush());
num_keys = 0;
}
MoveFilesToLevel(2);
for (int i = 0; i < 128; i += 3) {
EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i)));
num_keys++;
if (num_keys == 8) {
EXPECT_OK(Flush());
num_keys = 0;
}
}
if (num_keys > 0) {
EXPECT_OK(Flush());
num_keys = 0;
}
MoveFilesToLevel(1);
for (int i = 0; i < 128; i += 5) {
EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i)));
num_keys++;
if (num_keys == 8) {
EXPECT_OK(Flush());
num_keys = 0;
}
}
if (num_keys > 0) {
EXPECT_OK(Flush());
num_keys = 0;
}
EXPECT_EQ(0, num_keys);
}
const std::shared_ptr<Statistics>& statistics() { return statistics_; }
private:
std::shared_ptr<Statistics> statistics_;
};
TEST_F(DBMultiGetAsyncIOTest, GetFromL0) {
// All 3 keys in L0. The L0 files should be read serially.
std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
std::vector<PinnableSlice> values(key_strs.size());
std::vector<Status> statuses(key_strs.size());
ReadOptions ro;
ro.async_io = true;
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
ASSERT_EQ(values.size(), 3);
ASSERT_OK(statuses[0]);
ASSERT_OK(statuses[1]);
ASSERT_OK(statuses[2]);
ASSERT_EQ(values[0], "val_l0_" + std::to_string(0));
ASSERT_EQ(values[1], "val_l0_" + std::to_string(40));
ASSERT_EQ(values[2], "val_l0_" + std::to_string(80));
HistogramData multiget_io_batch_size;
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
// No async IO in this case since we don't do parallel lookup in L0
ASSERT_EQ(multiget_io_batch_size.count, 0);
ASSERT_EQ(multiget_io_batch_size.max, 0);
}
TEST_F(DBMultiGetAsyncIOTest, GetFromL1) {
std::vector<std::string> key_strs;
std::vector<Slice> keys;
std::vector<PinnableSlice> values;
std::vector<Status> statuses;
key_strs.push_back(Key(33));
key_strs.push_back(Key(54));
key_strs.push_back(Key(102));
keys.push_back(key_strs[0]);
keys.push_back(key_strs[1]);
keys.push_back(key_strs[2]);
values.resize(keys.size());
statuses.resize(keys.size());
ReadOptions ro;
ro.async_io = true;
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
ASSERT_EQ(values.size(), 3);
ASSERT_EQ(statuses[0], Status::OK());
ASSERT_EQ(statuses[1], Status::OK());
ASSERT_EQ(statuses[2], Status::OK());
ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
HistogramData multiget_io_batch_size;
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
// A batch of 3 async IOs is expected, one for each overlapping file in L1
ASSERT_EQ(multiget_io_batch_size.count, 1);
ASSERT_EQ(multiget_io_batch_size.max, 3);
}
TEST_F(DBMultiGetAsyncIOTest, LastKeyInFile) {
std::vector<std::string> key_strs;
std::vector<Slice> keys;
std::vector<PinnableSlice> values;
std::vector<Status> statuses;
// 21 is the last key in the first L1 file
key_strs.push_back(Key(21));
key_strs.push_back(Key(54));
key_strs.push_back(Key(102));
keys.push_back(key_strs[0]);
keys.push_back(key_strs[1]);
keys.push_back(key_strs[2]);
values.resize(keys.size());
statuses.resize(keys.size());
ReadOptions ro;
ro.async_io = true;
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
ASSERT_EQ(values.size(), 3);
ASSERT_EQ(statuses[0], Status::OK());
ASSERT_EQ(statuses[1], Status::OK());
ASSERT_EQ(statuses[2], Status::OK());
ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
HistogramData multiget_io_batch_size;
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
// Since the first MultiGet key is the last key in a file, the MultiGet is
// expected to look up that file first, before moving on to other files.
// So the first file lookup will issue one async read, and the next lookup
// will read 2 files in parallel and issue 2 async reads
ASSERT_EQ(multiget_io_batch_size.count, 2);
ASSERT_EQ(multiget_io_batch_size.max, 2);
}
TEST_F(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
std::vector<std::string> key_strs;
std::vector<Slice> keys;
std::vector<PinnableSlice> values;
std::vector<Status> statuses;
// 33 and 102 are in L1, and 56 is in L2
key_strs.push_back(Key(33));
key_strs.push_back(Key(56));
key_strs.push_back(Key(102));
keys.push_back(key_strs[0]);
keys.push_back(key_strs[1]);
keys.push_back(key_strs[2]);
values.resize(keys.size());
statuses.resize(keys.size());
ReadOptions ro;
ro.async_io = true;
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
keys.data(), values.data(), statuses.data());
ASSERT_EQ(values.size(), 3);
ASSERT_EQ(statuses[0], Status::OK());
ASSERT_EQ(statuses[1], Status::OK());
ASSERT_EQ(statuses[2], Status::OK());
ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
HistogramData multiget_io_batch_size;
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
// There is only one MultiGet key in the bottommost level - 56. Thus
// the bottommost level will not use async IO.
ASSERT_EQ(multiget_io_batch_size.count, 1);
ASSERT_EQ(multiget_io_batch_size.max, 2);
}
#endif // USE_COROUTINES
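The fixture above drives the async path purely through `ReadOptions`; condensed, the calling pattern the tests repeat looks like this (keys are illustrative):

```
#include <vector>

#include "rocksdb/db.h"

// Sketch: batched MultiGet with asynchronous IO requested. With coroutine
// support (USE_COROUTINES) compiled in, overlapping files in a level can be
// read in parallel; without it, async_io is effectively a no-op.
void AsyncMultiGet(rocksdb::DB* db) {
  std::vector<rocksdb::Slice> keys{"k1", "k2", "k3"};
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> statuses(keys.size());
  rocksdb::ReadOptions ro;
  ro.async_io = true;
  db->MultiGet(ro, db->DefaultColumnFamily(), keys.size(), keys.data(),
               values.data(), statuses.data());
}
```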
TEST_F(DBBasicTest, MultiGetStats) {
Options options;
options.create_if_missing = true;
@ -3308,6 +3644,11 @@ class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
const IOOptions& options, IODebugContext* dbg) override;
IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
std::function<void(const FSReadRequest&, void*)> cb,
void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
IODebugContext* dbg) override;
private:
DeadlineFS& fs_;
std::unique_ptr<FSRandomAccessFile> file_;
@ -3448,6 +3789,26 @@ IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
return s;
}
IOStatus DeadlineRandomAccessFile::ReadAsync(
FSReadRequest& req, const IOOptions& opts,
std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
const std::chrono::microseconds deadline = fs_.GetDeadline();
const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
IOStatus s;
if (deadline.count() || io_timeout.count()) {
fs_.AssertDeadline(deadline, io_timeout, opts);
}
if (s.ok()) {
s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
del_fn, dbg);
}
if (s.ok()) {
s = fs_.ShouldDelay(opts);
}
return s;
}
IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
size_t num_reqs,
const IOOptions& options,
@ -3469,7 +3830,8 @@ IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
// A test class for intercepting random reads and injecting artificial
// delays. Used for testing the MultiGet deadline feature
class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
public testing::WithParamInterface<bool> {
public:
DBBasicTestMultiGetDeadline()
: DBBasicTestMultiGet(
@ -3492,7 +3854,13 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
}
};
TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
#ifndef USE_COROUTINES
if (GetParam()) {
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
return;
}
#endif // USE_COROUTINES
std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
Options options = CurrentOptions();
@ -3523,6 +3891,7 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
ReadOptions ro;
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
ro.async_io = GetParam();
// Delay the first IO
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
@ -3625,6 +3994,9 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
Close();
}
INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
::testing::Bool());
TEST_F(DBBasicTest, ManifestWriteFailure) {
Options options = GetDefaultOptions();
options.create_if_missing = true;
@ -3724,6 +4096,27 @@ TEST_F(DBBasicTest, VerifyFileChecksums) {
Reopen(options);
ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
}
TEST_F(DBBasicTest, ManualWalSync) {
Options options = CurrentOptions();
options.track_and_verify_wals_in_manifest = true;
options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
DestroyAndReopen(options);
ASSERT_OK(Put("x", "y"));
// This does not create a new WAL.
ASSERT_OK(db_->SyncWAL());
EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
std::unique_ptr<LogFile> wal;
Status s = db_->GetCurrentWalFile(&wal);
ASSERT_OK(s);
Close();
EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
ASSERT_TRUE(TryReopen(options).IsCorruption());
}
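ManualWalSync depends on WAL tracking in the MANIFEST; as a small sketch, these are the two options that set up the failure mode the test asserts on (the helper is illustrative):

```
#include "rocksdb/options.h"

// Sketch: with tracking enabled, WAL lifecycle events (creation, sync,
// obsoletion) are recorded in the MANIFEST, so a tracked WAL that is missing
// or truncated on disk surfaces as Status::Corruption on reopen.
rocksdb::Options MakeWalTrackingOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.track_and_verify_wals_in_manifest = true;
  // Treat any missing WAL data as corruption instead of tolerating it.
  options.wal_recovery_mode = rocksdb::WALRecoveryMode::kAbsoluteConsistency;
  return options;
}
```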
#endif // !ROCKSDB_LITE
// A test class for intercepting random reads and injecting artificial

View File

@ -886,103 +886,27 @@ TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
}
}
/*
 * A cache wrapper that tracks peaks and increments of filter
 * construction cache reservation.
 *        p0
 *       /  \    p1
 *      /    \   /\
 *     /      \ /  \
 *  a /        b    \
 * peaks = {p0, p1}
 * increments = {p0-a, p1-b}
 */
class FilterConstructResPeakTrackingCache : public CacheWrapper {
public:
explicit FilterConstructResPeakTrackingCache(std::shared_ptr<Cache> target)
: CacheWrapper(std::move(target)),
cur_cache_res_(0),
cache_res_peak_(0),
cache_res_increment_(0),
last_peak_tracked_(false),
cache_res_increments_sum_(0) {}
using Cache::Insert;
Status Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value),
Handle** handle = nullptr,
Priority priority = Priority::LOW) override {
Status s = target_->Insert(key, value, charge, deleter, handle, priority);
if (deleter == kNoopDeleterForFilterConstruction) {
if (last_peak_tracked_) {
cache_res_peak_ = 0;
cache_res_increment_ = 0;
last_peak_tracked_ = false;
}
cur_cache_res_ += charge;
cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_);
cache_res_increment_ += charge;
}
return s;
}
using Cache::Release;
bool Release(Handle* handle, bool erase_if_last_ref = false) override {
auto deleter = GetDeleter(handle);
if (deleter == kNoopDeleterForFilterConstruction) {
if (!last_peak_tracked_) {
cache_res_peaks_.push_back(cache_res_peak_);
cache_res_increments_sum_ += cache_res_increment_;
last_peak_tracked_ = true;
}
cur_cache_res_ -= GetCharge(handle);
}
bool is_successful = target_->Release(handle, erase_if_last_ref);
return is_successful;
}
std::deque<std::size_t> GetReservedCachePeaks() { return cache_res_peaks_; }
std::size_t GetReservedCacheIncrementSum() {
return cache_res_increments_sum_;
}
private:
static const Cache::DeleterFn kNoopDeleterForFilterConstruction;
std::size_t cur_cache_res_;
std::size_t cache_res_peak_;
std::size_t cache_res_increment_;
bool last_peak_tracked_;
std::deque<std::size_t> cache_res_peaks_;
std::size_t cache_res_increments_sum_;
};
const Cache::DeleterFn
FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction =
CacheReservationManagerImpl<
CacheEntryRole::kFilterConstruction>::TEST_GetNoopDeleterForRole();
// To align with the type of hash entry being reserved in implementation.
using FilterConstructionReserveMemoryHash = uint64_t;
class DBFilterConstructionReserveMemoryTestWithParam
class ChargeFilterConstructionTestWithParam
: public DBTestBase,
public testing::WithParamInterface<
std::tuple<bool, std::string, bool, bool>> {
public testing::WithParamInterface<std::tuple<
CacheEntryRoleOptions::Decision, std::string, bool, bool>> {
public:
DBFilterConstructionReserveMemoryTestWithParam()
ChargeFilterConstructionTestWithParam()
: DBTestBase("db_bloom_filter_tests",
/*env_do_fsync=*/true),
num_key_(0),
reserve_table_builder_memory_(std::get<0>(GetParam())),
charge_filter_construction_(std::get<0>(GetParam())),
policy_(std::get<1>(GetParam())),
partition_filters_(std::get<2>(GetParam())),
detect_filter_construct_corruption_(std::get<3>(GetParam())) {
if (!reserve_table_builder_memory_ || policy_ == kDeprecatedBlock ||
policy_ == kLegacyBloom) {
if (charge_filter_construction_ ==
CacheEntryRoleOptions::Decision::kDisabled ||
policy_ == kDeprecatedBlock || policy_ == kLegacyBloom) {
// For these cases, we are only interested in whether filter construction
// cache resevation happens instead of its accuracy. Therefore we don't
// cache charging happens instead of its accuracy. Therefore we don't
// need many keys.
num_key_ = 5;
} else if (partition_filters_) {
@ -997,11 +921,11 @@ class DBFilterConstructionReserveMemoryTestWithParam
sizeof(FilterConstructionReserveMemoryHash);
} else if (policy_ == kFastLocalBloom) {
// For Bloom Filter + FullFilter case, since we design the num_key_ to
// make hash entry cache reservation be a multiple of dummy entries, the
// make hash entry cache charging be a multiple of dummy entries, the
// correct behavior of charging final filter on top of it will trigger at
// least another dummy entry insertion. Therefore we can assert that
// behavior and we don't need a large number of keys to verify we
// indeed charge the final filter for cache reservation, even though final
// indeed charge the final filter in the cache, even though the final
// filter is a lot smaller than hash entries.
num_key_ = 1 *
CacheReservationManagerImpl<
@ -1011,7 +935,7 @@ class DBFilterConstructionReserveMemoryTestWithParam
// For Ribbon Filter + FullFilter case, we need a large enough number of
// keys so that charging final filter after releasing the hash entries
// reservation will trigger at least another dummy entry (or equivalently
// to saying, causing another peak in cache reservation) as banding
// to saying, causing another peak in cache charging) as banding
// reservation might not be a multiple of dummy entry.
num_key_ = 12 *
CacheReservationManagerImpl<
@ -1027,7 +951,9 @@ class DBFilterConstructionReserveMemoryTestWithParam
// calculation.
constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
table_options.reserve_table_builder_memory = reserve_table_builder_memory_;
table_options.cache_usage_options.options_overrides.insert(
{CacheEntryRole::kFilterConstruction,
{/*.charged = */ charge_filter_construction_}});
table_options.filter_policy = Create(10, policy_);
table_options.partition_filters = partition_filters_;
if (table_options.partition_filters) {
@ -1045,7 +971,8 @@ class DBFilterConstructionReserveMemoryTestWithParam
lo.capacity = kCacheCapacity;
lo.num_shard_bits = 0; // 2^0 shard
lo.strict_capacity_limit = true;
cache_ = std::make_shared<FilterConstructResPeakTrackingCache>(
cache_ = std::make_shared<
TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
(NewLRUCache(lo)));
table_options.block_cache = cache_;
@ -1054,56 +981,73 @@ class DBFilterConstructionReserveMemoryTestWithParam
std::size_t GetNumKey() { return num_key_; }
bool ReserveTableBuilderMemory() { return reserve_table_builder_memory_; }
CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
return charge_filter_construction_;
}
std::string GetFilterPolicy() { return policy_; }
bool PartitionFilters() { return partition_filters_; }
std::shared_ptr<FilterConstructResPeakTrackingCache>
GetFilterConstructResPeakTrackingCache() {
std::shared_ptr<
TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
GetCache() {
return cache_;
}
private:
std::size_t num_key_;
bool reserve_table_builder_memory_;
CacheEntryRoleOptions::Decision charge_filter_construction_;
std::string policy_;
bool partition_filters_;
std::shared_ptr<FilterConstructResPeakTrackingCache> cache_;
std::shared_ptr<
TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
cache_;
bool detect_filter_construct_corruption_;
};
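The boolean `reserve_table_builder_memory` gives way here to the per-role `CacheEntryRoleOptions` interface; a minimal sketch of enabling charging outside the test harness, mirroring the insert the fixture performs (header locations assumed):

```
#include "rocksdb/cache.h"
#include "rocksdb/table.h"

// Sketch: charge filter-construction memory against the block cache via the
// new per-role override, as the updated fixture does above.
void EnableFilterConstructionCharging(
    rocksdb::BlockBasedTableOptions& table_options) {
  table_options.cache_usage_options.options_overrides.insert(
      {rocksdb::CacheEntryRole::kFilterConstruction,
       {/*.charged = */ rocksdb::CacheEntryRoleOptions::Decision::kEnabled}});
}
```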
INSTANTIATE_TEST_CASE_P(
DBFilterConstructionReserveMemoryTestWithParam,
DBFilterConstructionReserveMemoryTestWithParam,
::testing::Values(std::make_tuple(false, kFastLocalBloom, false, false),
ChargeFilterConstructionTestWithParam,
ChargeFilterConstructionTestWithParam,
::testing::Values(
std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled,
kFastLocalBloom, false, false),
std::make_tuple(true, kFastLocalBloom, false, false),
std::make_tuple(true, kFastLocalBloom, false, true),
std::make_tuple(true, kFastLocalBloom, true, false),
std::make_tuple(true, kFastLocalBloom, true, true),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kFastLocalBloom, false, false),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kFastLocalBloom, false, true),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kFastLocalBloom, true, false),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kFastLocalBloom, true, true),
std::make_tuple(true, kStandard128Ribbon, false, false),
std::make_tuple(true, kStandard128Ribbon, false, true),
std::make_tuple(true, kStandard128Ribbon, true, false),
std::make_tuple(true, kStandard128Ribbon, true, true),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kStandard128Ribbon, false, false),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kStandard128Ribbon, false, true),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kStandard128Ribbon, true, false),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kStandard128Ribbon, true, true),
std::make_tuple(true, kDeprecatedBlock, false, false),
std::make_tuple(true, kLegacyBloom, false, false)));
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
kDeprecatedBlock, false, false),
std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom,
false, false)));
// TODO: Speed up this test, and reduce disk space usage (~700MB)
// The current test inserts many keys (on the scale of dummy entry size)
// in order to make the small memory user (e.g., final filter, partitioned
// hash entries/filter/banding), which is proportional to the number of
// keys, big enough so that its cache reservation triggers dummy entry insertion
// keys, big enough so that its cache charging triggers dummy entry insertion
// and becomes observable in the test.
//
// However, inserting that many keys slows down this test and leaves future
// developers an opportunity to speed it up.
//
// Possible approaches & challenges:
// 1. Use sync point during cache reservation of filter construction
// 1. Use sync point during cache charging of filter construction
//
// Benefit: It does not rely on triggering dummy entry insertion
// but on the sync point to verify that the small memory user is charged correctly.
@ -1112,7 +1056,7 @@ INSTANTIATE_TEST_CASE_P(
//
// 2. Make dummy entry size configurable and set it small in the test
//
// Benefit: It increases the precision of cache reservation and therefore
// Benefit: It increases the precision of cache charging and therefore
// small memory usage can still trigger insertion of dummy entry.
//
// Challenge: change CacheReservationManager related APIs and a hack
@ -1120,16 +1064,17 @@ INSTANTIATE_TEST_CASE_P(
// CacheReservationManager used in filter construction for testing
// since CacheReservationManager is not exposed at the high level.
//
TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
Options options = CurrentOptions();
// We set write_buffer_size big enough so that in the case where there is
// filter construction cache reservation, flush won't be triggered before we
// filter construction cache charging, flush won't be triggered before we
// manually trigger it for clean testing
options.write_buffer_size = 640 << 20;
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
std::shared_ptr<FilterConstructResPeakTrackingCache> cache =
GetFilterConstructResPeakTrackingCache();
std::shared_ptr<
TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
cache = GetCache();
options.create_if_missing = true;
// Disable auto compaction to prevent its unexpected side effect
// to the number of keys per partition designed by us in the test
@ -1140,32 +1085,33 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
ASSERT_OK(Put(Key(i), Key(i)));
}
ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0)
ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
<< "Flush was triggered too early in the test case with filter "
"construction cache reservation - please make sure no flush triggered "
"construction cache charging - please make sure no flush triggered "
"during the key insertions above";
ASSERT_OK(Flush());
bool reserve_table_builder_memory = ReserveTableBuilderMemory();
bool charge_filter_construction = (ChargeFilterConstructMemory() ==
CacheEntryRoleOptions::Decision::kEnabled);
std::string policy = GetFilterPolicy();
bool partition_filters = PartitionFilters();
bool detect_filter_construct_corruption =
table_options.detect_filter_construct_corruption;
std::deque<std::size_t> filter_construction_cache_res_peaks =
cache->GetReservedCachePeaks();
cache->GetChargedCachePeaks();
std::size_t filter_construction_cache_res_increments_sum =
cache->GetReservedCacheIncrementSum();
cache->GetChargedCacheIncrementSum();
if (!reserve_table_builder_memory) {
if (!charge_filter_construction) {
EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
return;
}
if (policy == kDeprecatedBlock || policy == kLegacyBloom) {
EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
<< "There shouldn't be filter construction cache reservation as this "
<< "There shouldn't be filter construction cache charging as this "
"feature does not support kDeprecatedBlock "
"nor kLegacyBloom";
return;
@ -1239,14 +1185,14 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
*/
if (!partition_filters) {
EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
<< "Filter construction cache reservation should have only 1 peak in "
<< "Filter construction cache charging should have only 1 peak in "
"case: kFastLocalBloom + FullFilter";
std::size_t filter_construction_cache_res_peak =
filter_construction_cache_res_peaks[0];
EXPECT_GT(filter_construction_cache_res_peak,
predicted_hash_entries_cache_res)
<< "The testing number of hash entries is designed to make hash "
"entries cache reservation be multiples of dummy entries"
"entries cache charging be multiples of dummy entries"
" so the correct behavior of charging final filter on top of it"
" should've triggered at least another dummy entry insertion";
@ -1259,7 +1205,7 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
return;
} else {
EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
<< "Filter construction cache reservation should have multiple peaks "
<< "Filter construction cache charging should have multiple peaks "
"in case: kFastLocalBloom + "
"PartitionedFilter";
std::size_t predicted_filter_construction_cache_res_increments_sum =
@ -1366,11 +1312,11 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
CacheReservationManagerImpl<
CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
1)
<< "Final filter cache reservation too small for this test - please "
<< "Final filter cache charging too small for this test - please "
"increase the number of keys";
if (!detect_filter_construct_corruption) {
EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
<< "Filter construction cache reservation should have 2 peaks in "
<< "Filter construction cache charging should have 2 peaks in "
"case: kStandard128Ribbon + "
"FullFilter. "
"The second peak is resulted from charging the final filter "
@ -1389,7 +1335,7 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
predicted_filter_construction_cache_res_peak * 1.1);
} else {
EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
<< "Filter construction cache reservation should have 1 peaks in "
<< "Filter construction cache charging should have 1 peaks in "
"case: kStandard128Ribbon + FullFilter "
"+ detect_filter_construct_corruption. "
"The previous second peak now disappears since we don't "
@ -1410,13 +1356,13 @@ TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
} else {
if (!detect_filter_construct_corruption) {
EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
<< "Filter construction cache reservation should have more than 3 "
<< "Filter construction cache charging should have more than 3 "
"peaks "
"in case: kStandard128Ribbon + "
"PartitionedFilter";
} else {
EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
<< "Filter construction cache reservation should have more than 2 "
<< "Filter construction cache charging should have more than 2 "
"peaks "
"in case: kStandard128Ribbon + "
"PartitionedFilter + detect_filter_construct_corruption";

View File

@ -1441,12 +1441,13 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
auto& wal = *it;
assert(wal.getting_synced);
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
wal.writer->file()->GetFileSize() > 0) {
synced_wals.AddWal(wal.number,
WalMetadata(wal.writer->file()->GetFileSize()));
}
if (logs_.size() > 1) {
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
wal.writer->file()->GetFileSize() > 0) {
synced_wals.AddWal(wal.number,
WalMetadata(wal.writer->file()->GetFileSize()));
}
logs_to_free_.push_back(wal.ReleaseWriter());
// To modify logs_ both mutex_ and log_write_mutex_ must be held
InstrumentedMutexLock l(&log_write_mutex_);
@ -1723,17 +1724,6 @@ Status DBImpl::Get(const ReadOptions& read_options,
return s;
}
namespace {
class GetWithTimestampReadCallback : public ReadCallback {
public:
explicit GetWithTimestampReadCallback(SequenceNumber seq)
: ReadCallback(seq) {}
bool IsVisibleFullCheck(SequenceNumber seq) override {
return seq <= max_visible_seq_;
}
};
} // namespace
Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
GetImplOptions& get_impl_options) {
assert(get_impl_options.value != nullptr ||
@ -2590,7 +2580,8 @@ Status DBImpl::MultiGetImpl(
? MultiGetContext::MAX_BATCH_SIZE
: keys_left;
MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
batch_size, snapshot, read_options);
batch_size, snapshot, read_options, GetFileSystem(),
stats_);
MultiGetRange range = ctx.GetMultiGetRange();
range.AddValueSize(curr_value_size);
bool lookup_current = false;

View File

@ -1240,6 +1240,39 @@ class DBImpl : public DB {
std::atomic<bool> shutting_down_;
// RecoveryContext stores the context about version edits along with the
// corresponding column family data and column family options.
class RecoveryContext {
public:
~RecoveryContext() {
for (auto& edit_list : edit_lists_) {
for (auto* edit : edit_list) {
delete edit;
}
}
}
void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
assert(cfd != nullptr);
if (map_.find(cfd->GetID()) == map_.end()) {
uint32_t size = static_cast<uint32_t>(map_.size());
map_.emplace(cfd->GetID(), size);
cfds_.emplace_back(cfd);
mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
edit_lists_.emplace_back(autovector<VersionEdit*>());
}
uint32_t i = map_[cfd->GetID()];
edit_lists_[i].emplace_back(new VersionEdit(edit));
}
std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
autovector<ColumnFamilyData*> cfds_;
autovector<const MutableCFOptions*> mutable_cf_opts_;
autovector<autovector<VersionEdit*>> edit_lists_;
// files_to_delete_ contains sst files
std::unordered_set<std::string> files_to_delete_;
};
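Per the comments on `Recover()` and `LogAndApplyForRecovery()` below, the intended sequence during DB::Open is roughly the following (a simplified sketch; error handling and the surrounding Open logic are omitted):

```
// RecoveryContext recovery_ctx;
// s = impl->Recover(column_families, /*read_only=*/false,
//                   /*error_if_wal_file_exists=*/false,
//                   /*error_if_data_exists_in_wals=*/false,
//                   &recovered_seq, &recovery_ctx);  // buffers edits only
// ... create and sync the new WAL ...
// if (s.ok()) {
//   // Single MANIFEST write; files_to_delete_ are removed only on success.
//   s = impl->LogAndApplyForRecovery(recovery_ctx);
// }
```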
// Except in DB::Open(), WriteOptionsFile can only be called when:
// Persist options to options file.
// If need_mutex_lock = false, the method will lock DB mutex.
@ -1356,16 +1389,19 @@ class DBImpl : public DB {
// be made to the descriptor are added to *edit.
// recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
// skipped.
// recovery_ctx stores the context about version edits; all those edits are
// persisted to the new MANIFEST after successfully syncing the new WAL.
virtual Status Recover(
const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only = false, bool error_if_wal_file_exists = false,
bool error_if_data_exists_in_wals = false,
uint64_t* recovered_seq = nullptr);
uint64_t* recovered_seq = nullptr,
RecoveryContext* recovery_ctx = nullptr);
virtual bool OwnTablesAndLogs() const { return true; }
// Set DB identity file, and write DB ID to manifest if necessary.
Status SetDBId(bool read_only);
Status SetDBId(bool read_only, RecoveryContext* recovery_ctx);
// REQUIRES: db mutex held when calling this function, but the db mutex can
// be released and re-acquired. Db mutex will be held when the function
@ -1374,12 +1410,15 @@ class DBImpl : public DB {
// not referenced in the MANIFEST (e.g.
// 1. It's best effort recovery;
// 2. The VersionEdits referencing the SST files are appended to
// MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are
// RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
// still not synced to MANIFEST during recovery.)
// We delete these SST files. In the
// It stores the SST files to be deleted in RecoveryContext. In the
// meantime, we find out the largest file number present in the paths, and
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
Status DeleteUnreferencedSstFiles();
// recovery_ctx stores the context about version edits and files to be
// deleted. All those edits are persisted to new Manifest after successfully
// syncing the new WAL.
Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
// SetDbSessionId() should be called in the DBImpl() constructor
// to ensure that db_session_id_ gets updated every time the DB is opened
@ -1389,6 +1428,14 @@ class DBImpl : public DB {
Status FailIfTsSizesMismatch(const ColumnFamilyHandle* column_family,
const Slice& ts) const;
// recovery_ctx stores the context about version edits;
// LogAndApplyForRecovery persists all those edits to the new MANIFEST after
// successfully syncing the new WAL.
// LogAndApplyForRecovery should be called only once during recovery, when
// RocksDB writes its first new MANIFEST since this recovery started.
Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
private:
friend class DB;
friend class ErrorHandler;
@ -1645,7 +1692,8 @@ class DBImpl : public DB {
// corrupted_log_found is set to true if we recover from a corrupted log file.
Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* next_sequence, bool read_only,
bool* corrupted_log_found);
bool* corrupted_log_found,
RecoveryContext* recovery_ctx);
// The following two methods are used to flush a memtable to
// storage. The first one is used at database RecoveryTime (when the
@ -1974,6 +2022,11 @@ class DBImpl : public DB {
IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
size_t preallocate_block_size, log::Writer** new_log);
// Verify that each SST file's unique id recorded in the MANIFEST matches
// the id in its table properties. Currently only used during DB open when
// `verify_sst_unique_id_in_manifest = true`.
Status VerifySstUniqueIdInManifest();
// Validate self-consistency of DB options
static Status ValidateOptions(const DBOptions& db_options);
// Validate self-consistency of DB options and its consistency with cf options
@ -2395,6 +2448,15 @@ class DBImpl : public DB {
std::unique_ptr<StallInterface> wbm_stall_;
};
class GetWithTimestampReadCallback : public ReadCallback {
public:
explicit GetWithTimestampReadCallback(SequenceNumber seq)
: ReadCallback(seq) {}
bool IsVisibleFullCheck(SequenceNumber seq) override {
return seq <= max_visible_seq_;
}
};
extern Options SanitizeOptions(const std::string& db, const Options& src,
bool read_only = false);

View File

@ -1651,7 +1651,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
f->file_checksum_func_name, f->min_timestamp, f->max_timestamp);
f->file_checksum_func_name, f->min_timestamp, f->max_timestamp,
f->unique_id);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@ -1856,11 +1857,12 @@ Status DBImpl::RunManualCompaction(
}
}
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"[%s] Manual compaction starting", cfd->GetName().c_str());
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
immutable_db_options_.info_log.get());
ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting",
cfd->GetName().c_str());
// We don't check bg_error_ here, because if we get the error in compaction,
// the compaction will set manual.status to bg_error_ and set manual.done to
// true.
@ -3276,7 +3278,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
f->min_timestamp, f->max_timestamp);
f->min_timestamp, f->max_timestamp, f->unique_id);
ROCKS_LOG_BUFFER(
log_buffer,

View File

@ -137,7 +137,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
f->min_timestamp, f->max_timestamp);
f->min_timestamp, f->max_timestamp, f->unique_id);
}
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),

View File

@ -863,7 +863,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
return min_log_number_to_keep;
}
Status DBImpl::SetDBId(bool read_only) {
Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) {
Status s;
// Happens when immutable_db_options_.write_dbid_to_manifest is set to true
// the very first time.
@ -890,14 +890,14 @@ Status DBImpl::SetDBId(bool read_only) {
}
s = GetDbIdentityFromIdentityFile(&db_id_);
if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
assert(!read_only);
assert(recovery_ctx != nullptr);
assert(versions_->GetColumnFamilySet() != nullptr);
VersionEdit edit;
edit.SetDBId(db_id_);
Options options;
MutableCFOptions mutable_cf_options(options);
versions_->db_id_ = db_id_;
s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
mutable_cf_options, &edit, &mutex_, nullptr,
/* new_descriptor_log */ false);
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
}
} else if (!read_only) {
s = SetIdentityFile(env_, dbname_, db_id_);
@ -905,7 +905,7 @@ Status DBImpl::SetDBId(bool read_only) {
return s;
}
Status DBImpl::DeleteUnreferencedSstFiles() {
Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
mutex_.AssertHeld();
std::vector<std::string> paths;
paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
@ -925,7 +925,6 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
uint64_t next_file_number = versions_->current_next_file_number();
uint64_t largest_file_number = next_file_number;
std::set<std::string> files_to_delete;
Status s;
for (const auto& path : paths) {
std::vector<std::string> files;
@ -943,8 +942,9 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
const std::string normalized_fpath = path + fname;
largest_file_number = std::max(largest_file_number, number);
if (type == kTableFile && number >= next_file_number &&
files_to_delete.find(normalized_fpath) == files_to_delete.end()) {
files_to_delete.insert(normalized_fpath);
recovery_ctx->files_to_delete_.find(normalized_fpath) ==
recovery_ctx->files_to_delete_.end()) {
recovery_ctx->files_to_delete_.emplace(normalized_fpath);
}
}
}
@ -961,21 +961,7 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
assert(versions_->GetColumnFamilySet());
ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
assert(default_cfd);
s = versions_->LogAndApply(
default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
directories_.GetDbDir(), /*new_descriptor_log*/ false);
if (!s.ok()) {
return s;
}
mutex_.Unlock();
for (const auto& fname : files_to_delete) {
s = env_->DeleteFile(fname);
if (!s.ok()) {
break;
}
}
mutex_.Lock();
recovery_ctx->UpdateVersionEdits(default_cfd, edit);
return s;
}

View File

@ -399,7 +399,7 @@ IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
Status DBImpl::Recover(
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
uint64_t* recovered_seq) {
uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
mutex_.AssertHeld();
bool is_new_db = false;
@ -518,9 +518,15 @@ Status DBImpl::Recover(
if (!s.ok()) {
return s;
}
s = SetDBId(read_only);
if (immutable_db_options_.verify_sst_unique_id_in_manifest) {
s = VerifySstUniqueIdInManifest();
if (!s.ok()) {
return s;
}
}
s = SetDBId(read_only, recovery_ctx);
if (s.ok() && !read_only) {
s = DeleteUnreferencedSstFiles();
s = DeleteUnreferencedSstFiles(recovery_ctx);
}
if (immutable_db_options_.paranoid_checks && s.ok()) {
@ -535,10 +541,6 @@ Status DBImpl::Recover(
}
}
}
// DB mutex is already held
if (s.ok() && immutable_db_options_.persist_stats_to_disk) {
s = InitPersistStatsColumnFamily();
}
std::vector<std::string> files_in_wal_dir;
if (s.ok()) {
@ -608,7 +610,10 @@ Status DBImpl::Recover(
WalNumber max_wal_number =
versions_->GetWalSet().GetWals().rbegin()->first;
edit.DeleteWalsBefore(max_wal_number + 1);
s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_);
assert(recovery_ctx != nullptr);
assert(versions_->GetColumnFamilySet() != nullptr);
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), edit);
}
if (!s.ok()) {
return s;
@ -644,8 +649,8 @@ Status DBImpl::Recover(
std::sort(wals.begin(), wals.end());
bool corrupted_wal_found = false;
s = RecoverLogFiles(wals, &next_sequence, read_only,
&corrupted_wal_found);
s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
recovery_ctx);
if (corrupted_wal_found && recovered_seq != nullptr) {
*recovered_seq = next_sequence;
}
@ -698,6 +703,25 @@ Status DBImpl::Recover(
return s;
}
Status DBImpl::VerifySstUniqueIdInManifest() {
mutex_.AssertHeld();
ROCKS_LOG_INFO(
immutable_db_options_.info_log,
"Verifying SST unique id between MANIFEST and SST file table properties");
Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->IsDropped()) {
auto version = cfd->current();
version->Ref();
mutex_.Unlock();
status = version->VerifySstUniqueIds();
mutex_.Lock();
version->Unref();
}
}
return status;
}
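The loop in VerifySstUniqueIdInManifest() above uses a recurring RocksDB idiom: pin the column family's current Version with Ref(), release the DB mutex for the slow table-property reads, then re-acquire it and Unref(). A minimal self-contained sketch of that idiom follows; Resource, DoSlowWork, and VisitAll are hypothetical stand-ins, not RocksDB API.

#include <mutex>
#include <vector>

// Hypothetical stand-ins for ColumnFamilyData/Version; not RocksDB API.
struct Resource {
  int refs = 1;
  void Ref() { ++refs; }
  void Unref() { --refs; }  // the real code frees the object at zero refs
};

void DoSlowWork(Resource*) { /* e.g. read SST table properties */ }

void VisitAll(std::vector<Resource*>& resources, std::mutex& mu) {
  std::unique_lock<std::mutex> lock(mu);
  for (Resource* r : resources) {
    r->Ref();        // pin r so it survives while the mutex is released
    lock.unlock();   // never hold the DB mutex across file I/O
    DoSlowWork(r);
    lock.lock();
    r->Unref();
  }
}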
Status DBImpl::PersistentStatsProcessFormatVersion() {
mutex_.AssertHeld();
Status s;
@ -805,10 +829,30 @@ Status DBImpl::InitPersistStatsColumnFamily() {
return s;
}
Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
mutex_.AssertHeld();
assert(versions_->descriptor_log_ == nullptr);
Status s = versions_->LogAndApply(
recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
mutex_.Unlock();
for (const auto& fname : recovery_ctx.files_to_delete_) {
s = env_->DeleteFile(fname);
if (!s.ok()) {
break;
}
}
mutex_.Lock();
}
return s;
}
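LogAndApplyForRecovery() is where every version edit buffered into the RecoveryContext during SetDBId(), DeleteUnreferencedSstFiles(), Recover(), and RecoverLogFiles() is committed in a single MANIFEST write, and only then are the queued obsolete files actually removed. A hedged sketch of that buffer-then-commit shape; RecoveryBuffer, WriteManifest, RemoveFile, and Commit are illustrative names, not the real API.

#include <string>
#include <vector>

// Illustrative stand-in for RecoveryContext: edits and deletions are
// buffered during recovery and applied in one shot afterwards.
struct RecoveryBuffer {
  std::vector<std::string> edits;            // serialized version edits
  std::vector<std::string> files_to_delete;  // unreferenced SST names
};

// Stubs standing in for VersionSet::LogAndApply and Env::DeleteFile.
bool WriteManifest(const std::vector<std::string>& /*edits*/) {
  return true;  // pretend a single atomic MANIFEST write succeeded
}
bool RemoveFile(const std::string& /*fname*/) { return true; }

bool Commit(const RecoveryBuffer& ctx) {
  // 1) Persist all buffered edits in one new MANIFEST.
  if (!WriteManifest(ctx.edits)) return false;
  // 2) Only after the MANIFEST is durable, drop the obsolete files.
  for (const auto& fname : ctx.files_to_delete) {
    if (!RemoveFile(fname)) return false;
  }
  return true;
}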
// REQUIRES: wal_numbers are sorted in ascending order
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
SequenceNumber* next_sequence, bool read_only,
bool* corrupted_wal_found) {
bool* corrupted_wal_found,
RecoveryContext* recovery_ctx) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
Logger* info_log;
@ -1266,44 +1310,36 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
// VersionSet::next_file_number_ always to be strictly greater than any
// log number
versions_->MarkFileNumberUsed(max_wal_number + 1);
assert(recovery_ctx != nullptr);
autovector<ColumnFamilyData*> cfds;
autovector<const MutableCFOptions*> cf_opts;
autovector<autovector<VersionEdit*>> edit_lists;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
cfds.push_back(cfd);
cf_opts.push_back(cfd->GetLatestMutableCFOptions());
auto iter = version_edits.find(cfd->GetID());
assert(iter != version_edits.end());
edit_lists.push_back({&iter->second});
recovery_ctx->UpdateVersionEdits(cfd, iter->second);
}
std::unique_ptr<VersionEdit> wal_deletion;
if (flushed) {
wal_deletion = std::make_unique<VersionEdit>();
VersionEdit wal_deletion;
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
wal_deletion->DeleteWalsBefore(max_wal_number + 1);
wal_deletion.DeleteWalsBefore(max_wal_number + 1);
}
if (!allow_2pc()) {
// In non-2pc mode, flushing the memtables of the column families
// means we can advance min_log_number_to_keep.
wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1);
wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
}
edit_lists.back().push_back(wal_deletion.get());
assert(versions_->GetColumnFamilySet() != nullptr);
recovery_ctx->UpdateVersionEdits(
versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
}
// write MANIFEST with update
status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_,
directories_.GetDbDir(),
/*new_descriptor_log=*/true);
}
}
if (status.ok()) {
if (data_seen && !flushed) {
status = RestoreAliveLogFiles(wal_numbers);
} else {
// If there's no data in the WAL, or we flushed all the data, still
} else if (!wal_numbers.empty()) { // If there's no data in the WAL, or we
// flushed all the data, still
// truncate the log file. If the process goes into a crash loop before
// the file is deleted, the preallocated space will never get freed.
const bool truncate = !read_only;
@ -1498,13 +1534,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
constexpr int level = 0;
if (s.ok() && has_output) {
edit->AddFile(
level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
meta.smallest, meta.largest, meta.fd.smallest_seqno,
meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
meta.oldest_blob_file_number, meta.oldest_ancester_time,
meta.file_creation_time, meta.file_checksum,
meta.file_checksum_func_name, meta.min_timestamp, meta.max_timestamp);
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
meta.fd.GetFileSize(), meta.smallest, meta.largest,
meta.fd.smallest_seqno, meta.fd.largest_seqno,
meta.marked_for_compaction, meta.temperature,
meta.oldest_blob_file_number, meta.oldest_ancester_time,
meta.file_creation_time, meta.file_checksum,
meta.file_checksum_func_name, meta.min_timestamp,
meta.max_timestamp, meta.unique_id);
for (const auto& blob : blob_file_additions) {
edit->AddBlobFile(blob);
@ -1698,6 +1735,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
*dbptr = nullptr;
assert(handles);
handles->clear();
size_t max_write_buffer_size = 0;
@ -1740,11 +1778,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
RecoveryContext recovery_ctx;
impl->mutex_.Lock();
// Handles create_if_missing, error_if_exists
uint64_t recovered_seq(kMaxSequenceNumber);
s = impl->Recover(column_families, false, false, false, &recovered_seq);
s = impl->Recover(column_families, false, false, false, &recovered_seq,
&recovery_ctx);
if (s.ok()) {
uint64_t new_log_number = impl->versions_->NewFileNumber();
log::Writer* new_log = nullptr;
@ -1761,40 +1801,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
if (s.ok()) {
// set column family handles
for (auto cf : column_families) {
auto cfd =
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
if (cfd != nullptr) {
handles->push_back(
new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
impl->NewThreadStatusCfInfo(cfd);
} else {
if (db_options.create_missing_column_families) {
// missing column family, create it
ColumnFamilyHandle* handle;
impl->mutex_.Unlock();
s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
impl->mutex_.Lock();
if (s.ok()) {
handles->push_back(handle);
} else {
break;
}
} else {
s = Status::InvalidArgument("Column family not found", cf.name);
break;
}
}
}
}
if (s.ok()) {
SuperVersionContext sv_context(/* create_superversion */ true);
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
impl->InstallSuperVersionAndScheduleWork(
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
}
sv_context.Clean();
if (impl->two_write_queues_) {
impl->log_write_mutex_.Lock();
}
@ -1834,6 +1840,53 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
}
}
if (s.ok()) {
s = impl->LogAndApplyForRecovery(recovery_ctx);
}
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
impl->mutex_.AssertHeld();
s = impl->InitPersistStatsColumnFamily();
}
if (s.ok()) {
// set column family handles
for (auto cf : column_families) {
auto cfd =
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
if (cfd != nullptr) {
handles->push_back(
new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
impl->NewThreadStatusCfInfo(cfd);
} else {
if (db_options.create_missing_column_families) {
// missing column family, create it
ColumnFamilyHandle* handle = nullptr;
impl->mutex_.Unlock();
s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
impl->mutex_.Lock();
if (s.ok()) {
handles->push_back(handle);
} else {
break;
}
} else {
s = Status::InvalidArgument("Column family not found", cf.name);
break;
}
}
}
}
if (s.ok()) {
SuperVersionContext sv_context(/* create_superversion */ true);
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
impl->InstallSuperVersionAndScheduleWork(
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
}
sv_context.Clean();
}
if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
// try to read format version
s = impl->PersistentStatsProcessFormatVersion();


@ -33,20 +33,38 @@ DBImplReadOnly::~DBImplReadOnly() {}
Status DBImplReadOnly::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* pinnable_val) {
return Get(read_options, column_family, key, pinnable_val,
/*timestamp*/ nullptr);
}
Status DBImplReadOnly::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* pinnable_val,
std::string* timestamp) {
assert(pinnable_val != nullptr);
// TODO: stopwatch DB_GET needed?, perf timer needed?
PERF_TIMER_GUARD(get_snapshot_time);
assert(column_family);
if (read_options.timestamp) {
const Status s =
FailIfTsSizesMismatch(column_family, *(read_options.timestamp));
if (!s.ok()) {
return s;
}
} else {
const Status s = FailIfCfHasTs(column_family);
if (!s.ok()) {
return s;
}
}
const Comparator* ucmp = column_family->GetComparator();
assert(ucmp);
if (ucmp->timestamp_size() || read_options.timestamp) {
// TODO: support timestamp
return Status::NotSupported();
}
std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
Status s;
SequenceNumber snapshot = versions_->LastSequence();
GetWithTimestampReadCallback read_cb(snapshot);
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
auto cfd = cfh->cfd();
if (tracer_) {
@ -58,19 +76,23 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options,
SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context;
SequenceNumber max_covering_tombstone_seq = 0;
LookupKey lkey(key, snapshot);
LookupKey lkey(key, snapshot, read_options.timestamp);
PERF_TIMER_STOP(get_snapshot_time);
if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
/*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq, read_options)) {
if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), ts, &s,
&merge_context, &max_covering_tombstone_seq,
read_options, &read_cb)) {
pinnable_val->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
} else {
PERF_TIMER_GUARD(get_from_output_files_time);
PinnedIteratorsManager pinned_iters_mgr;
super_version->current->Get(read_options, lkey, pinnable_val,
/*timestamp=*/nullptr, &s, &merge_context,
&max_covering_tombstone_seq, &pinned_iters_mgr);
super_version->current->Get(
read_options, lkey, pinnable_val, ts, &s, &merge_context,
&max_covering_tombstone_seq, &pinned_iters_mgr,
/*value_found*/ nullptr,
/*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
/*is_blob*/ nullptr,
/*do_merge*/ true);
RecordTick(stats_, MEMTABLE_MISS);
}
RecordTick(stats_, NUMBER_KEYS_READ);
@ -84,11 +106,17 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options,
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) {
assert(column_family);
const Comparator* ucmp = column_family->GetComparator();
assert(ucmp);
if (ucmp->timestamp_size() || read_options.timestamp) {
// TODO: support timestamp
return NewErrorIterator(Status::NotSupported());
if (read_options.timestamp) {
const Status s =
FailIfTsSizesMismatch(column_family, *(read_options.timestamp));
if (!s.ok()) {
return NewErrorIterator(s);
}
} else {
const Status s = FailIfCfHasTs(column_family);
if (!s.ok()) {
return NewErrorIterator(s);
}
}
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
auto cfd = cfh->cfd();
@ -118,16 +146,19 @@ Status DBImplReadOnly::NewIterators(
const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) {
if (read_options.timestamp) {
// TODO: support timestamp
return Status::NotSupported();
for (auto* cf : column_families) {
assert(cf);
const Status s = FailIfTsSizesMismatch(cf, *(read_options.timestamp));
if (!s.ok()) {
return s;
}
}
} else {
for (auto* cf : column_families) {
assert(cf);
const Comparator* ucmp = cf->GetComparator();
assert(ucmp);
if (ucmp->timestamp_size()) {
// TODO: support timestamp
return Status::NotSupported();
const Status s = FailIfCfHasTs(cf);
if (!s.ok()) {
return s;
}
}
}
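With the changes above, a read-only DB honors ReadOptions::timestamp when the column family was created with a timestamp-aware comparator, and a timestamp of the wrong size now yields InvalidArgument instead of a blanket NotSupported. A hedged caller-side sketch, assuming a DB previously created at /tmp/ts_db with the built-in u64-timestamp bytewise comparator on a little-endian machine:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.comparator = BytewiseComparatorWithU64Ts();  // 8-byte timestamps
  DB* db = nullptr;
  Status s = DB::OpenForReadOnly(options, "/tmp/ts_db", &db);
  assert(s.ok());

  // Encode timestamp 42 as fixed64 (little-endian machine assumed).
  uint64_t ts_val = 42;
  std::string ts_buf(sizeof(ts_val), '\0');
  std::memcpy(&ts_buf[0], &ts_val, sizeof(ts_val));
  Slice read_ts = ts_buf;
  ReadOptions ro;
  ro.timestamp = &read_ts;

  std::string value, ts_found;
  s = db->Get(ro, db->DefaultColumnFamily(), "key", &value, &ts_found);
  // A read timestamp of any other size would yield InvalidArgument.
  delete db;
  return s.ok() ? 0 : 1;
}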


@ -27,6 +27,9 @@ class DBImplReadOnly : public DBImpl {
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) override;
Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* value,
std::string* timestamp) override;
// TODO: Implement ReadOnly MultiGet?


@ -33,7 +33,8 @@ DBImplSecondary::~DBImplSecondary() {}
Status DBImplSecondary::Recover(
const std::vector<ColumnFamilyDescriptor>& column_families,
bool /*readonly*/, bool /*error_if_wal_file_exists*/,
bool /*error_if_data_exists_in_wals*/, uint64_t*) {
bool /*error_if_data_exists_in_wals*/, uint64_t*,
RecoveryContext* /*recovery_ctx*/) {
mutex_.AssertHeld();
JobContext job_context(0);
@ -772,12 +773,19 @@ Status DBImplSecondary::CompactWithoutInstallation(
const int job_id = next_job_id_.fetch_add(1);
// use primary host's db_id for running the compaction, but db_session_id is
// using the local one, which is to make sure the unique id is unique from
// the remote compactors. Because the id is generated from db_id,
// db_session_id and orig_file_number, unlike the local compaction, remote
// compaction cannot guarantee the uniqueness of orig_file_number, the file
// number is only assigned when compaction is done.
CompactionServiceCompactionJob compaction_job(
job_id, c.get(), immutable_db_options_, mutable_db_options_,
file_options_for_compaction_, versions_.get(), &shutting_down_,
&log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
options.canceled, db_id_, db_session_id_, secondary_path_, input, result);
options.canceled, input.db_id, db_session_id_, secondary_path_, input,
result);
mutex_.Unlock();
s = compaction_job.Run();
@ -832,6 +840,8 @@ Status DB::OpenAndCompact(
override_options.table_factory;
compaction_input.column_family.options.sst_partitioner_factory =
override_options.sst_partitioner_factory;
compaction_input.column_family.options.table_properties_collector_factories =
override_options.table_properties_collector_factories;
compaction_input.db_options.listeners = override_options.listeners;
std::vector<ColumnFamilyDescriptor> column_families;
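The comment in CompactWithoutInstallation() above rests on how an SST's unique id is composed: it is derived from db_id, db_session_id, and the file's original file number, which is why a remote compactor borrows the primary's db_id but keeps its own session id. A hedged sketch that surfaces the resulting ids through the public table-properties helper; the DB path is illustrative:

#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/unique_id.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  if (!DB::Open(options, "/tmp/demo_db", &db).ok()) return 1;

  TablePropertiesCollection props;
  if (db->GetPropertiesOfAllTables(&props).ok()) {
    for (const auto& [file, tp] : props) {
      std::string id;
      // The id mixes db_id, db_session_id and the original file number.
      if (GetUniqueIdFromTableProperties(*tp, &id).ok()) {
        std::cout << file << ": " << id.size() << "-byte unique id\n";
      }
    }
  }
  delete db;
  return 0;
}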


@ -81,8 +81,8 @@ class DBImplSecondary : public DBImpl {
// and log_readers_ to facilitate future operations.
Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
bool read_only, bool error_if_wal_file_exists,
bool error_if_data_exists_in_wals,
uint64_t* = nullptr) override;
bool error_if_data_exists_in_wals, uint64_t* = nullptr,
RecoveryContext* recovery_ctx = nullptr) override;
// Implementations of the DB interface
using DB::Get;


@ -95,6 +95,22 @@ TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) {
ASSERT_FALSE(s.ok());
}
TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) {
Options options;
options.env = env_;
options.verify_sst_unique_id_in_manifest = true;
ImmutableDBOptions db_options(options);
ASSERT_TRUE(db_options.verify_sst_unique_id_in_manifest);
Reopen(options);
ASSERT_TRUE(dbfull()->GetDBOptions().verify_sst_unique_id_in_manifest);
Status s =
dbfull()->SetDBOptions({{"verify_sst_unique_id_in_manifest", "false"}});
ASSERT_FALSE(s.ok());
}
// RocksDB lite doesn't support dynamic options.
#ifndef ROCKSDB_LITE


@ -0,0 +1,331 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_with_timestamp_test_util.h"
#include "test_util/testutil.h"
namespace ROCKSDB_NAMESPACE {
class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase {
public:
DBReadOnlyTestWithTimestamp()
: DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {}
};
#ifndef ROCKSDB_LITE
TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
const std::string write_timestamp = Timestamp(1, 0);
WriteOptions write_opts;
for (uint64_t key = 0; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), write_timestamp,
"value" + std::to_string(key));
ASSERT_OK(s);
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
ReadOptions read_opts;
std::string different_size_read_timestamp;
PutFixed32(&different_size_read_timestamp, 2);
Slice different_size_read_ts = different_size_read_timestamp;
read_opts.timestamp = &different_size_read_ts;
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
ASSERT_FALSE(iter->Valid());
ASSERT_TRUE(iter->status().IsInvalidArgument());
}
for (uint64_t key = 0; key <= kMaxKey; ++key) {
std::string value_from_get;
std::string timestamp;
ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
.IsInvalidArgument());
}
Close();
}
TEST_F(DBReadOnlyTestWithTimestamp,
IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
WriteOptions write_opts;
for (uint64_t key = 0; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
ASSERT_OK(s);
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
ReadOptions read_opts;
const std::string read_timestamp = Timestamp(2, 0);
Slice read_ts = read_timestamp;
read_opts.timestamp = &read_ts;
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
ASSERT_FALSE(iter->Valid());
ASSERT_TRUE(iter->status().IsInvalidArgument());
}
for (uint64_t key = 0; key <= kMaxKey; ++key) {
std::string value_from_get;
std::string timestamp;
ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
.IsInvalidArgument());
}
Close();
}
TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
const std::vector<uint64_t> start_keys = {1, 0};
const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
Timestamp(3, 0)};
const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
Timestamp(4, 0)};
for (size_t i = 0; i < write_timestamps.size(); ++i) {
WriteOptions write_opts;
for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
"value" + std::to_string(i));
ASSERT_OK(s);
}
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
Slice expected_value, std::string expected_ts) {
std::string value_from_get;
std::string timestamp;
ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
ASSERT_EQ(expected_value, value_from_get);
ASSERT_EQ(expected_ts, timestamp);
};
for (size_t i = 0; i < read_timestamps.size(); ++i) {
ReadOptions read_opts;
Slice read_ts = read_timestamps[i];
read_opts.timestamp = &read_ts;
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
int count = 0;
uint64_t key = 0;
// Forward iterate.
for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
it->Next(), ++count, ++key) {
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
get_value_and_check(db_, read_opts, it->key(), it->value(),
write_timestamps[i]);
}
size_t expected_count = kMaxKey - start_keys[i] + 1;
ASSERT_EQ(expected_count, count);
// Backward iterate.
count = 0;
for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
it->Prev(), ++count, --key) {
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
get_value_and_check(db_, read_opts, it->key(), it->value(),
write_timestamps[i]);
}
ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
// SeekToFirst()/SeekToLast() with lower/upper bounds.
// Then iter with lower and upper bounds.
uint64_t l = 0;
uint64_t r = kMaxKey + 1;
while (l < r) {
std::string lb_str = Key1(l);
Slice lb = lb_str;
std::string ub_str = Key1(r);
Slice ub = ub_str;
read_opts.iterate_lower_bound = &lb;
read_opts.iterate_upper_bound = &ub;
it.reset(db_->NewIterator(read_opts));
for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
it->Valid(); it->Next(), ++key, ++count) {
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
get_value_and_check(db_, read_opts, it->key(), it->value(),
write_timestamps[i]);
}
ASSERT_EQ(r - std::max(l, start_keys[i]), count);
for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
it->Valid(); it->Prev(), --key, ++count) {
CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
"value" + std::to_string(i), write_timestamps[i]);
get_value_and_check(db_, read_opts, it->key(), it->value(),
write_timestamps[i]);
}
l += (kMaxKey / 100);
r -= (kMaxKey / 100);
}
}
Close();
}
TEST_F(DBReadOnlyTestWithTimestamp, Iterators) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
const std::string write_timestamp = Timestamp(1, 0);
const std::string read_timestamp = Timestamp(2, 0);
WriteOptions write_opts;
for (uint64_t key = 0; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), write_timestamp,
"value" + std::to_string(key));
ASSERT_OK(s);
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
ReadOptions read_opts;
Slice read_ts = read_timestamp;
read_opts.timestamp = &read_ts;
std::vector<Iterator*> iters;
ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
int count = 0;
uint64_t key = 0;
// Forward iterate.
for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
iters[0]->Next(), ++count, ++key) {
CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
"value" + std::to_string(key), write_timestamp);
}
size_t expected_count = kMaxKey - 0 + 1;
ASSERT_EQ(expected_count, count);
delete iters[0];
Close();
}
TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
const std::string write_timestamp = Timestamp(1, 0);
WriteOptions write_opts;
for (uint64_t key = 0; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), write_timestamp,
"value" + std::to_string(key));
ASSERT_OK(s);
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
ReadOptions read_opts;
std::string different_size_read_timestamp;
PutFixed32(&different_size_read_timestamp, 2);
Slice different_size_read_ts = different_size_read_timestamp;
read_opts.timestamp = &different_size_read_ts;
{
std::vector<Iterator*> iters;
ASSERT_TRUE(
db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
.IsInvalidArgument());
}
Close();
}
TEST_F(DBReadOnlyTestWithTimestamp,
IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
const int kNumKeysPerFile = 128;
const uint64_t kMaxKey = 1024;
Options options = CurrentOptions();
options.env = env_;
options.create_if_missing = true;
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeysPerFile));
DestroyAndReopen(options);
WriteOptions write_opts;
for (uint64_t key = 0; key <= kMaxKey; ++key) {
Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
ASSERT_OK(s);
}
// Reopen the database in read only mode to test its timestamp support.
Close();
ASSERT_OK(ReadOnlyReopen(options));
ReadOptions read_opts;
const std::string read_timestamp = Timestamp(2, 0);
Slice read_ts = read_timestamp;
read_opts.timestamp = &read_ts;
{
std::vector<Iterator*> iters;
ASSERT_TRUE(
db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
.IsInvalidArgument());
}
Close();
}
#endif // !ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}


@ -181,6 +181,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
ASSERT_EQ(input.input_files.size(), 3);
input.output_level = 1;
ASSERT_OK(db_->GetDbIdentity(input.db_id));
Close();
options.max_open_files = -1;
@ -241,6 +242,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
input1.input_files.push_back(meta.levels[1].files[2].name);
input1.output_level = 1;
ASSERT_OK(db_->GetDbIdentity(input1.db_id));
options.max_open_files = -1;
Close();
@ -261,6 +263,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
}
input2.output_level = 2;
input2.db_id = input1.db_id;
ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
OpenAndCompactOptions(), cfh, input2, &result));
ASSERT_OK(result.status);
@ -305,6 +308,7 @@ TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
ASSERT_EQ(input.input_files.size(), 3);
input.output_level = 1;
ASSERT_OK(db_->GetDbIdentity(input.db_id));
// trigger compaction to delete the files for secondary instance compaction
ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
@ -346,6 +350,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
ASSERT_EQ(input.input_files.size(), 3);
input.output_level = 1;
ASSERT_OK(db_->GetDbIdentity(input.db_id));
Close();


@ -1448,7 +1448,9 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
}
TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
for (bool reserve_table_builder_memory : {true, false}) {
for (CacheEntryRoleOptions::Decision charge_table_reader :
{CacheEntryRoleOptions::Decision::kEnabled,
CacheEntryRoleOptions::Decision::kDisabled}) {
// Open DB with infinite max open files
// - First iteration use 1 thread to open files
// - Second iteration use 5 threads to open files
@ -1488,7 +1490,9 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
}
Close();
table_options.reserve_table_reader_memory = reserve_table_builder_memory;
table_options.cache_usage_options.options_overrides.insert(
{CacheEntryRole::kBlockBasedTableReader,
{/*.charged = */ charge_table_reader}});
table_options.block_cache =
NewLRUCache(1024 /* capacity */, 0 /* num_shard_bits */,
true /* strict_capacity_limit */);
@ -1497,8 +1501,13 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
// Reopening the DB will try to load all existing files, conditionally
// subject to memory limit
Status s = TryReopen(options);
if (table_options.reserve_table_reader_memory) {
if (charge_table_reader == CacheEntryRoleOptions::Decision::kEnabled) {
EXPECT_TRUE(s.IsMemoryLimit());
EXPECT_TRUE(s.ToString().find(
kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
CacheEntryRole::kBlockBasedTableReader)]) !=
std::string::npos);
EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
std::string::npos);


@ -1296,6 +1296,7 @@ TEST_F(DBTest2, PresetCompressionDict) {
enum DictionaryTypes : int {
kWithoutDict,
kWithDict,
kWithZSTDfinalizeDict,
kWithZSTDTrainedDict,
kDictEnd,
};
@ -1304,6 +1305,7 @@ TEST_F(DBTest2, PresetCompressionDict) {
options.compression = compression_type;
size_t bytes_without_dict = 0;
size_t bytes_with_dict = 0;
size_t bytes_with_zstd_finalize_dict = 0;
size_t bytes_with_zstd_trained_dict = 0;
for (int i = kWithoutDict; i < kDictEnd; i++) {
// First iteration: compress without preset dictionary
@ -1323,12 +1325,21 @@ TEST_F(DBTest2, PresetCompressionDict) {
options.compression_opts.max_dict_bytes = kBlockSizeBytes;
options.compression_opts.zstd_max_train_bytes = 0;
break;
case kWithZSTDfinalizeDict:
if (compression_type != kZSTD) {
continue;
}
options.compression_opts.max_dict_bytes = kBlockSizeBytes;
options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
options.compression_opts.use_zstd_dict_trainer = false;
break;
case kWithZSTDTrainedDict:
if (compression_type != kZSTD) {
continue;
}
options.compression_opts.max_dict_bytes = kBlockSizeBytes;
options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
options.compression_opts.use_zstd_dict_trainer = true;
break;
default:
assert(false);
@ -1365,6 +1376,8 @@ TEST_F(DBTest2, PresetCompressionDict) {
bytes_without_dict = total_sst_bytes;
} else if (i == kWithDict) {
bytes_with_dict = total_sst_bytes;
} else if (i == kWithZSTDfinalizeDict) {
bytes_with_zstd_finalize_dict = total_sst_bytes;
} else if (i == kWithZSTDTrainedDict) {
bytes_with_zstd_trained_dict = total_sst_bytes;
}
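The new kWithZSTDfinalizeDict arm exercises CompressionOptions::use_zstd_dict_trainer: when false, RocksDB finalizes a dictionary directly from raw samples instead of running the full ZDICT trainer. A hedged configuration sketch; the byte sizes are illustrative, not tuned recommendations:

#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

Options MakeZstdDictOptions(bool use_trainer) {
  Options options;
  options.compression = kZSTD;
  options.compression_opts.max_dict_bytes = 16 * 1024;         // dict size
  options.compression_opts.zstd_max_train_bytes = 128 * 1024;  // sample cap
  // false = finalize a dictionary from raw samples (cheaper);
  // true  = run the full ZDICT trainer.
  options.compression_opts.use_zstd_dict_trainer = use_trainer;
  return options;
}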
@ -1375,6 +1388,13 @@ TEST_F(DBTest2, PresetCompressionDict) {
}
if (i == kWithDict) {
ASSERT_GT(bytes_without_dict, bytes_with_dict);
} else if (i == kWithZSTDfinalizeDict) {
// In zstd compression, it is sometimes possible that using a finalized
// dictionary does not get as good a compression ratio as raw content
// dictionary. But using a dictionary should always get better
// compression ratio than not using one.
ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
bytes_without_dict > bytes_with_zstd_finalize_dict);
} else if (i == kWithZSTDTrainedDict) {
// In zstd compression, it is sometimes possible that using a trained
// dictionary does not get as good a compression ratio as without
@ -3987,12 +4007,14 @@ TEST_F(DBTest2, RateLimitedCompactionReads) {
// should be slightly above 512KB due to non-data blocks read. Arbitrarily
// chose 1MB as the upper bound on the total bytes read.
size_t rate_limited_bytes =
options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL);
// There must be no charges at non-`IO_LOW` priorities.
size_t rate_limited_bytes = static_cast<size_t>(
options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
// The charges can exist for `IO_LOW` and `IO_USER` priorities.
size_t rate_limited_bytes_by_pri =
options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
ASSERT_EQ(rate_limited_bytes,
static_cast<size_t>(
options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
static_cast<size_t>(rate_limited_bytes_by_pri));
// Include the explicit prefetch of the footer in direct I/O case.
size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
ASSERT_GE(
@ -4010,9 +4032,11 @@ TEST_F(DBTest2, RateLimitedCompactionReads) {
}
delete iter;
// bytes read for user iterator shouldn't count against the rate limit.
rate_limited_bytes_by_pri =
options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
ASSERT_EQ(rate_limited_bytes,
static_cast<size_t>(
options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW)));
static_cast<size_t>(rate_limited_bytes_by_pri));
}
}
}
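The reworked assertions above reflect that compaction reads can now be charged against the rate limiter at Env::IO_USER as well as Env::IO_LOW. A hedged sketch of attaching a limiter and reading its per-priority counters; the 1 MB/s cap and DB path are illustrative:

#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/rate_limiter.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  options.rate_limiter.reset(
      NewGenericRateLimiter(1 << 20 /* 1 MB/s for background I/O */));
  DB* db = nullptr;
  if (!DB::Open(options, "/tmp/demo_db", &db).ok()) return 1;
  // ... write data and wait for flushes/compactions here ...
  std::cout << "IO_LOW:  "
            << options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) << "\n"
            << "IO_USER: "
            << options.rate_limiter->GetTotalBytesThrough(Env::IO_USER) << "\n"
            << "TOTAL:   "
            << options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL)
            << "\n";
  delete db;
  return 0;
}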
@ -6305,115 +6329,118 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
#ifndef ROCKSDB_LITE
TEST_F(DBTest2, AutoPrefixMode1) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.statistics = CreateDBStatistics();
do {
// create a DB with block prefix index
Options options = CurrentOptions();
BlockBasedTableOptions table_options =
*options.table_factory->GetOptions<BlockBasedTableOptions>();
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
options.statistics = CreateDBStatistics();
Reopen(options);
Reopen(options);
Random rnd(301);
std::string large_value = rnd.RandomString(500);
Random rnd(301);
std::string large_value = rnd.RandomString(500);
ASSERT_OK(Put("a1", large_value));
ASSERT_OK(Put("x1", large_value));
ASSERT_OK(Put("y1", large_value));
ASSERT_OK(Flush());
ASSERT_OK(Put("a1", large_value));
ASSERT_OK(Put("x1", large_value));
ASSERT_OK(Put("y1", large_value));
ASSERT_OK(Flush());
ReadOptions ro;
ro.total_order_seek = false;
ro.auto_prefix_mode = true;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ReadOptions ro;
ro.total_order_seek = false;
ro.auto_prefix_mode = true;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
std::string ub_str = "b9";
Slice ub(ub_str);
ro.iterate_upper_bound = &ub;
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "z";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "c";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
// The same queries without recreating iterator
{
ub_str = "b9";
ub = Slice(ub_str);
std::string ub_str = "b9";
Slice ub(ub_str);
ro.iterate_upper_bound = &ub;
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "z";
ub = Slice(ub_str);
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
ub_str = "c";
ub = Slice(ub_str);
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
}
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
// The same queries without recreating iterator
{
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekForPrev("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ASSERT_OK(iterator->status());
ub_str = "zz";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekToLast();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("y1", iterator->key().ToString());
ub_str = "z";
ub = Slice(ub_str);
iterator->SeekToFirst();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
}
iterator->Seek("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "c";
ub = Slice(ub_str);
iterator->Seek("b1");
ASSERT_FALSE(iterator->Valid());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "b9";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekForPrev("b1");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
ub_str = "zz";
ub = Slice(ub_str);
ro.iterate_upper_bound = &ub;
iterator->SeekToLast();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("y1", iterator->key().ToString());
iterator->SeekToFirst();
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
}
} while (ChangeOptions(kSkipPlainTable));
}
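The test above exercises ReadOptions::auto_prefix_mode, which lets an iterator use the prefix bloom filter whenever the seek key and iterate_upper_bound make that provably safe, without the caller committing to prefix_same_as_start. A hedged usage sketch with the same 1-byte prefix extractor; the DB path is illustrative:

#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
  options.create_if_missing = true;

  DB* db = nullptr;
  if (!DB::Open(options, "/tmp/prefix_db", &db).ok()) return 1;

  ReadOptions ro;
  ro.total_order_seek = false;
  ro.auto_prefix_mode = true;  // filter by prefix only when provably safe
  Slice ub("b9");              // same 1-byte prefix as the seek key below
  ro.iterate_upper_bound = &ub;
  std::unique_ptr<Iterator> it(db->NewIterator(ro));
  it->Seek("b1");  // eligible for a bloom check on prefix "b"
  it.reset();
  delete db;
  return 0;
}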
class RenameCurrentTest : public DBTestBase,
@ -7145,6 +7172,112 @@ TEST_F(DBTest2, RenameDirectory) {
dbname_ = old_dbname;
}
TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
const int kNumSst = 3;
const int kLevel0Trigger = 4;
auto options = CurrentOptions();
options.level0_file_num_compaction_trigger = kLevel0Trigger;
options.statistics = CreateDBStatistics();
// Existing manifest doesn't have unique id
SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
auto unique_id = static_cast<UniqueId64x2*>(arg);
// remove id before writing it to manifest
(*unique_id)[0] = 0;
(*unique_id)[1] = 0;
});
std::atomic_int skipped = 0;
SyncPoint::GetInstance()->SetCallBack("Version::VerifySstUniqueIds::Skipped",
[&](void* /*arg*/) { skipped++; });
SyncPoint::GetInstance()->EnableProcessing();
// generate a few SSTs
for (int i = 0; i < kNumSst; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
// Reopen without verification
Reopen(options);
// Reopen with verification, but it's skipped because manifest doesn't have id
options.verify_sst_unique_id_in_manifest = true;
Reopen(options);
ASSERT_EQ(skipped, kNumSst);
// test compaction generated Sst
for (int i = kNumSst; i < kLevel0Trigger; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
#ifndef ROCKSDB_LITE
ASSERT_EQ("0,1", FilesPerLevel(0));
#endif // ROCKSDB_LITE
// Reopen with verification again; the compaction output also has no unique
// id in the manifest, so verification is skipped once rather than failing
options.verify_sst_unique_id_in_manifest = true;
skipped = 0;
Reopen(options);
ASSERT_EQ(skipped, 1);
}
TEST_F(DBTest2, SstUniqueIdVerify) {
const int kNumSst = 3;
const int kLevel0Trigger = 4;
auto options = CurrentOptions();
options.level0_file_num_compaction_trigger = kLevel0Trigger;
SyncPoint::GetInstance()->SetCallBack(
"PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
auto props = static_cast<TableProperties*>(props_vs);
// update table property session_id to a different one
props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
});
SyncPoint::GetInstance()->EnableProcessing();
// generate a few SSTs
for (int i = 0; i < kNumSst; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
// Reopen with verification should report corruption
options.verify_sst_unique_id_in_manifest = true;
auto s = TryReopen(options);
ASSERT_TRUE(s.IsCorruption());
// Reopen without verification should be fine
options.verify_sst_unique_id_in_manifest = false;
Reopen(options);
// test compaction generated Sst
for (int i = kNumSst; i < kLevel0Trigger; i++) {
for (int j = 0; j < 100; j++) {
ASSERT_OK(Put(Key(i * 10 + j), "value"));
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
#ifndef ROCKSDB_LITE
ASSERT_EQ("0,1", FilesPerLevel(0));
#endif // ROCKSDB_LITE
// Reopen with verification should fail
options.verify_sst_unique_id_in_manifest = true;
s = TryReopen(options);
ASSERT_TRUE(s.IsCorruption());
}
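Outside the test harness, the new check is a single immutable option: with verify_sst_unique_id_in_manifest set, DB::Open() recomputes each SST's unique id from its table properties and fails with Corruption if it disagrees with the id recorded in the MANIFEST. A hedged open-time sketch; the DB path is illustrative:

#include <iostream>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.verify_sst_unique_id_in_manifest = true;
  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/demo_db", &db);
  if (s.IsCorruption()) {
    // An SST's recorded unique id did not match its table properties,
    // e.g. a file was swapped in from a different DB.
    std::cerr << "unique id mismatch: " << s.ToString() << "\n";
    return 1;
  }
  delete db;
  return s.ok() ? 0 : 1;
}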
#ifndef ROCKSDB_LITE
TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
Destroy(last_options_);


@ -9,6 +9,7 @@
#include "db/db_test_util.h"
#include "cache/cache_reservation_manager.h"
#include "db/forward_iterator.h"
#include "env/mock_env.h"
#include "port/lang.h"
@ -829,10 +830,12 @@ std::string DBTestBase::Get(int cf, const std::string& k,
std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
const std::vector<std::string>& k,
const Snapshot* snapshot,
const bool batched) {
const bool batched,
const bool async) {
ReadOptions options;
options.verify_checksums = true;
options.snapshot = snapshot;
options.async_io = async;
std::vector<ColumnFamilyHandle*> handles;
std::vector<Slice> keys;
std::vector<std::string> result;
@ -874,10 +877,12 @@ std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
}
std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
const Snapshot* snapshot) {
const Snapshot* snapshot,
const bool async) {
ReadOptions options;
options.verify_checksums = true;
options.snapshot = snapshot;
options.async_io = async;
std::vector<Slice> keys;
std::vector<std::string> result(k.size());
std::vector<Status> statuses(k.size());
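The new async parameter in these helpers just forwards to ReadOptions::async_io, which lets MultiGet overlap its file reads through the FileSystem's ReadAsync path. A hedged sketch of the batched MultiGet form a caller would use; keys and DB path are illustrative:

#include <string>
#include <vector>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  if (!DB::Open(options, "/tmp/demo_db", &db).ok()) return 1;

  ReadOptions ro;
  ro.async_io = true;  // let the FileSystem overlap the reads

  std::vector<Slice> keys = {"k1", "k2", "k3"};
  std::vector<PinnableSlice> values(keys.size());
  std::vector<Status> statuses(keys.size());
  db->MultiGet(ro, db->DefaultColumnFamily(), keys.size(), keys.data(),
               values.data(), statuses.data());
  delete db;
  return 0;
}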
@ -1683,4 +1688,61 @@ void VerifySstUniqueIds(const TablePropertiesCollection& props) {
}
}
template <CacheEntryRole R>
TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
std::shared_ptr<Cache> target)
: CacheWrapper(std::move(target)),
cur_cache_charge_(0),
cache_charge_peak_(0),
cache_charge_increment_(0),
last_peak_tracked_(false),
cache_charge_increments_sum_(0) {}
template <CacheEntryRole R>
Status TargetCacheChargeTrackingCache<R>::Insert(
const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value), Handle** handle,
Priority priority) {
Status s = target_->Insert(key, value, charge, deleter, handle, priority);
if (deleter == kNoopDeleter) {
if (last_peak_tracked_) {
cache_charge_peak_ = 0;
cache_charge_increment_ = 0;
last_peak_tracked_ = false;
}
if (s.ok()) {
cur_cache_charge_ += charge;
}
cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
cache_charge_increment_ += charge;
}
return s;
}
template <CacheEntryRole R>
bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
bool erase_if_last_ref) {
auto deleter = GetDeleter(handle);
if (deleter == kNoopDeleter) {
if (!last_peak_tracked_) {
cache_charge_peaks_.push_back(cache_charge_peak_);
cache_charge_increments_sum_ += cache_charge_increment_;
last_peak_tracked_ = true;
}
cur_cache_charge_ -= GetCharge(handle);
}
bool is_successful = target_->Release(handle, erase_if_last_ref);
return is_successful;
}
template <CacheEntryRole R>
const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
template class TargetCacheChargeTrackingCache<
CacheEntryRole::kFilterConstruction>;
template class TargetCacheChargeTrackingCache<
CacheEntryRole::kBlockBasedTableReader>;
} // namespace ROCKSDB_NAMESPACE


@ -952,6 +952,51 @@ class CacheWrapper : public Cache {
std::shared_ptr<Cache> target_;
};
/*
 * A cache wrapper that tracks certain CacheEntryRole's cache charge, its
 * peaks and increments
 *
 *        p0
 *       /  \   p1
 *      /    \  /\
 *     /      \/  \
 *  a /        b   \
 * peaks = {p0, p1}
 * increments = {p0 - a, p1 - b}
 */
template <CacheEntryRole R>
class TargetCacheChargeTrackingCache : public CacheWrapper {
public:
explicit TargetCacheChargeTrackingCache(std::shared_ptr<Cache> target);
using Cache::Insert;
Status Insert(const Slice& key, void* value, size_t charge,
void (*deleter)(const Slice& key, void* value),
Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
using Cache::Release;
bool Release(Handle* handle, bool erase_if_last_ref = false) override;
std::size_t GetCacheCharge() { return cur_cache_charge_; }
std::deque<std::size_t> GetChargedCachePeaks() { return cache_charge_peaks_; }
std::size_t GetChargedCacheIncrementSum() {
return cache_charge_increments_sum_;
}
private:
static const Cache::DeleterFn kNoopDeleter;
std::size_t cur_cache_charge_;
std::size_t cache_charge_peak_;
std::size_t cache_charge_increment_;
bool last_peak_tracked_;
std::deque<std::size_t> cache_charge_peaks_;
std::size_t cache_charge_increments_sum_;
};
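The wrapper recognizes reservation entries by the role's no-op deleter, so only CacheReservationManager charges for role R are counted. A hedged usage sketch, assuming the class above plus rocksdb/cache.h; the capacity is illustrative:

// Wrap an LRU cache so charges for kBlockBasedTableReader reservations
// are tracked; plug the result into BlockBasedTableOptions::block_cache.
static std::shared_ptr<Cache> MakeTrackedBlockCache() {
  auto tracker = std::make_shared<
      TargetCacheChargeTrackingCache<CacheEntryRole::kBlockBasedTableReader>>(
      NewLRUCache(1 << 20 /* 1 MiB capacity */));
  // After opening the DB and running reads, inspect
  // tracker->GetCacheCharge() and tracker->GetChargedCachePeaks().
  return tracker;
}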
class DBTestBase : public testing::Test {
public:
// Sequence of option configurations to try
@ -1154,10 +1199,12 @@ class DBTestBase : public testing::Test {
std::vector<std::string> MultiGet(std::vector<int> cfs,
const std::vector<std::string>& k,
const Snapshot* snapshot,
const bool batched);
const bool batched,
const bool async = false);
std::vector<std::string> MultiGet(const std::vector<std::string>& k,
const Snapshot* snapshot = nullptr);
const Snapshot* snapshot = nullptr,
const bool async = false);
uint64_t GetNumSnapshots();


@ -7,7 +7,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "db/db_with_timestamp_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/utilities/debug.h"
@ -20,176 +20,6 @@
#include "utilities/fault_injection_env.h"
namespace ROCKSDB_NAMESPACE {
class DBBasicTestWithTimestampBase : public DBTestBase {
public:
explicit DBBasicTestWithTimestampBase(const std::string& dbname)
: DBTestBase(dbname, /*env_do_fsync=*/true) {}
protected:
static std::string Key1(uint64_t k) {
std::string ret;
PutFixed64(&ret, k);
std::reverse(ret.begin(), ret.end());
return ret;
}
static std::string KeyWithPrefix(std::string prefix, uint64_t k) {
std::string ret;
PutFixed64(&ret, k);
std::reverse(ret.begin(), ret.end());
return prefix + ret;
}
static std::vector<Slice> ConvertStrToSlice(
std::vector<std::string>& strings) {
std::vector<Slice> ret;
for (const auto& s : strings) {
ret.emplace_back(s);
}
return ret;
}
class TestComparator : public Comparator {
private:
const Comparator* cmp_without_ts_;
public:
explicit TestComparator(size_t ts_sz)
: Comparator(ts_sz), cmp_without_ts_(nullptr) {
cmp_without_ts_ = BytewiseComparator();
}
const char* Name() const override { return "TestComparator"; }
void FindShortSuccessor(std::string*) const override {}
void FindShortestSeparator(std::string*, const Slice&) const override {}
int Compare(const Slice& a, const Slice& b) const override {
int r = CompareWithoutTimestamp(a, b);
if (r != 0 || 0 == timestamp_size()) {
return r;
}
return -CompareTimestamp(
Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
}
using Comparator::CompareWithoutTimestamp;
int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
bool b_has_ts) const override {
if (a_has_ts) {
assert(a.size() >= timestamp_size());
}
if (b_has_ts) {
assert(b.size() >= timestamp_size());
}
Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a;
Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b;
return cmp_without_ts_->Compare(lhs, rhs);
}
int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
if (!ts1.data() && !ts2.data()) {
return 0;
} else if (ts1.data() && !ts2.data()) {
return 1;
} else if (!ts1.data() && ts2.data()) {
return -1;
}
assert(ts1.size() == ts2.size());
uint64_t low1 = 0;
uint64_t low2 = 0;
uint64_t high1 = 0;
uint64_t high2 = 0;
const size_t kSize = ts1.size();
std::unique_ptr<char[]> ts1_buf(new char[kSize]);
memcpy(ts1_buf.get(), ts1.data(), ts1.size());
std::unique_ptr<char[]> ts2_buf(new char[kSize]);
memcpy(ts2_buf.get(), ts2.data(), ts2.size());
Slice ts1_copy = Slice(ts1_buf.get(), kSize);
Slice ts2_copy = Slice(ts2_buf.get(), kSize);
auto* ptr1 = const_cast<Slice*>(&ts1_copy);
auto* ptr2 = const_cast<Slice*>(&ts2_copy);
if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
!GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
assert(false);
}
if (high1 < high2) {
return -1;
} else if (high1 > high2) {
return 1;
}
if (low1 < low2) {
return -1;
} else if (low1 > low2) {
return 1;
}
return 0;
}
};
std::string Timestamp(uint64_t low, uint64_t high) {
std::string ts;
PutFixed64(&ts, low);
PutFixed64(&ts, high);
return ts;
}
void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
ValueType expected_value_type,
const Slice& expected_value,
const Slice& expected_ts) const {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
ASSERT_EQ(expected_key, it->key());
if (kTypeValue == expected_value_type) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
SequenceNumber expected_seq, ValueType expected_val_type,
const Slice& expected_value, const Slice& expected_ts) {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(
ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(expected_seq, parsed_ikey.sequence);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
ValueType expected_val_type, const Slice& expected_value,
const Slice& expected_ts) {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(
ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
};
class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
public:
DBBasicTestWithTimestamp()


@ -0,0 +1,96 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_with_timestamp_test_util.h"
namespace ROCKSDB_NAMESPACE {
std::string DBBasicTestWithTimestampBase::Key1(uint64_t k) {
std::string ret;
PutFixed64(&ret, k);
std::reverse(ret.begin(), ret.end());
return ret;
}
std::string DBBasicTestWithTimestampBase::KeyWithPrefix(std::string prefix,
uint64_t k) {
std::string ret;
PutFixed64(&ret, k);
std::reverse(ret.begin(), ret.end());
return prefix + ret;
}
std::vector<Slice> DBBasicTestWithTimestampBase::ConvertStrToSlice(
std::vector<std::string>& strings) {
std::vector<Slice> ret;
for (const auto& s : strings) {
ret.emplace_back(s);
}
return ret;
}
std::string DBBasicTestWithTimestampBase::Timestamp(uint64_t low,
uint64_t high) {
std::string ts;
PutFixed64(&ts, low);
PutFixed64(&ts, high);
return ts;
}
void DBBasicTestWithTimestampBase::CheckIterUserEntry(
const Iterator* it, const Slice& expected_key,
ValueType expected_value_type, const Slice& expected_value,
const Slice& expected_ts) const {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
ASSERT_EQ(expected_key, it->key());
if (kTypeValue == expected_value_type) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
void DBBasicTestWithTimestampBase::CheckIterEntry(
const Iterator* it, const Slice& expected_ukey, SequenceNumber expected_seq,
ValueType expected_val_type, const Slice& expected_value,
const Slice& expected_ts) const {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(expected_seq, parsed_ikey.sequence);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
void DBBasicTestWithTimestampBase::CheckIterEntry(
const Iterator* it, const Slice& expected_ukey, ValueType expected_val_type,
const Slice& expected_value, const Slice& expected_ts) const {
ASSERT_TRUE(it->Valid());
ASSERT_OK(it->status());
std::string ukey_and_ts;
ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
ASSERT_EQ(expected_val_type, parsed_ikey.type);
ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
if (expected_val_type == kTypeValue) {
ASSERT_EQ(expected_value, it->value());
}
ASSERT_EQ(expected_ts, it->timestamp());
}
} // namespace ROCKSDB_NAMESPACE


@ -0,0 +1,126 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "test_util/testutil.h"
namespace ROCKSDB_NAMESPACE {
class DBBasicTestWithTimestampBase : public DBTestBase {
public:
explicit DBBasicTestWithTimestampBase(const std::string& dbname)
: DBTestBase(dbname, /*env_do_fsync=*/true) {}
protected:
static std::string Key1(uint64_t k);
static std::string KeyWithPrefix(std::string prefix, uint64_t k);
static std::vector<Slice> ConvertStrToSlice(
std::vector<std::string>& strings);
class TestComparator : public Comparator {
private:
const Comparator* cmp_without_ts_;
public:
explicit TestComparator(size_t ts_sz)
: Comparator(ts_sz), cmp_without_ts_(nullptr) {
cmp_without_ts_ = BytewiseComparator();
}
const char* Name() const override { return "TestComparator"; }
void FindShortSuccessor(std::string*) const override {}
void FindShortestSeparator(std::string*, const Slice&) const override {}
int Compare(const Slice& a, const Slice& b) const override {
int r = CompareWithoutTimestamp(a, b);
if (r != 0 || 0 == timestamp_size()) {
return r;
}
return -CompareTimestamp(
Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
}
using Comparator::CompareWithoutTimestamp;
int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
bool b_has_ts) const override {
if (a_has_ts) {
assert(a.size() >= timestamp_size());
}
if (b_has_ts) {
assert(b.size() >= timestamp_size());
}
Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a;
Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b;
return cmp_without_ts_->Compare(lhs, rhs);
}
int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
if (!ts1.data() && !ts2.data()) {
return 0;
} else if (ts1.data() && !ts2.data()) {
return 1;
} else if (!ts1.data() && ts2.data()) {
return -1;
}
assert(ts1.size() == ts2.size());
uint64_t low1 = 0;
uint64_t low2 = 0;
uint64_t high1 = 0;
uint64_t high2 = 0;
const size_t kSize = ts1.size();
std::unique_ptr<char[]> ts1_buf(new char[kSize]);
memcpy(ts1_buf.get(), ts1.data(), ts1.size());
std::unique_ptr<char[]> ts2_buf(new char[kSize]);
memcpy(ts2_buf.get(), ts2.data(), ts2.size());
Slice ts1_copy = Slice(ts1_buf.get(), kSize);
Slice ts2_copy = Slice(ts2_buf.get(), kSize);
auto* ptr1 = const_cast<Slice*>(&ts1_copy);
auto* ptr2 = const_cast<Slice*>(&ts2_copy);
if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
!GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
assert(false);
}
if (high1 < high2) {
return -1;
} else if (high1 > high2) {
return 1;
}
if (low1 < low2) {
return -1;
} else if (low1 > low2) {
return 1;
}
return 0;
}
};
std::string Timestamp(uint64_t low, uint64_t high);
void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
ValueType expected_value_type,
const Slice& expected_value,
const Slice& expected_ts) const;
void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
SequenceNumber expected_seq, ValueType expected_val_type,
const Slice& expected_value,
const Slice& expected_ts) const;
void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
ValueType expected_val_type, const Slice& expected_value,
const Slice& expected_ts) const;
};
} // namespace ROCKSDB_NAMESPACE
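The comparator above decodes each 16-byte timestamp as two little-endian fixed64 words (low first, then high, matching the declaration `Timestamp(uint64_t low, uint64_t high)`) and compares the high word before the low word; `Compare()` then negates the result so keys with newer timestamps sort first. A minimal self-contained sketch of that layout, mirroring `PutFixed64` rather than calling RocksDB:

```cpp
// Sketch only: reproduces the 16-byte test timestamp layout used by
// DBBasicTestWithTimestampBase::Timestamp(low, high).
#include <cstdint>
#include <string>

std::string MakeTestTimestamp(uint64_t low, uint64_t high) {
  std::string ts;
  for (uint64_t v : {low, high}) {  // low word first, high word second
    for (int i = 0; i < 8; ++i) {   // little-endian, as in PutFixed64
      ts.push_back(static_cast<char>((v >> (8 * i)) & 0xff));
    }
  }
  return ts;  // 16 bytes: |low (8B)|high (8B)|
}

// Ordering used by TestComparator::CompareTimestamp(): high word, then low.
int CompareTestTimestamps(uint64_t low1, uint64_t high1, uint64_t low2,
                          uint64_t high2) {
  if (high1 != high2) return high1 < high2 ? -1 : 1;
  return low1 == low2 ? 0 : (low1 < low2 ? -1 : 1);
}
```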

View File

@ -112,7 +112,7 @@ Status UpdateManifestForFilesState(
lf->oldest_blob_file_number,
lf->oldest_ancester_time, lf->file_creation_time,
lf->file_checksum, lf->file_checksum_func_name,
lf->min_timestamp, lf->max_timestamp);
lf->min_timestamp, lf->max_timestamp, lf->unique_id);
}
}
} else {

View File

@ -1866,6 +1866,63 @@ TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) {
ASSERT_OK(db_->VerifyChecksum());
}
TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) {
const std::string kPutVal = "put_val";
const std::string kIngestedVal = "ingested_val";
ASSERT_OK(Put("k", kPutVal, WriteOptions()));
ASSERT_OK(Flush());
std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
{
SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
ASSERT_OK(sst_file_writer.Open(external_file));
ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
ASSERT_OK(sst_file_writer.Finish());
}
ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
IngestExternalFileOptions()));
auto options = CurrentOptions();
options.verify_sst_unique_id_in_manifest = true;
Reopen(options);
// Test ingesting a file without session_id and db_id (for example, a file
// generated by an older version of sst_writer)
SyncPoint::GetInstance()->SetCallBack(
"PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
auto props = static_cast<TableProperties*>(props_vs);
// clear the table's session_id and db_id to simulate an older writer
props->db_session_id = "";
props->db_id = "";
});
std::atomic_int skipped = 0;
SyncPoint::GetInstance()->SetCallBack("Version::VerifySstUniqueIds::Skipped",
[&](void* /*arg*/) { skipped++; });
SyncPoint::GetInstance()->EnableProcessing();
external_file = sst_files_dir_ + "/file_to_ingest2.sst";
{
SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
ASSERT_OK(sst_file_writer.Open(external_file));
ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
ASSERT_OK(sst_file_writer.Finish());
}
ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
IngestExternalFileOptions()));
options.statistics = CreateDBStatistics();
options.verify_sst_unique_id_in_manifest = true;
ASSERT_EQ(skipped, 0);
Reopen(options);
// only one SST file skips verification, because its unique_id is missing
ASSERT_EQ(skipped, 1);
}
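Outside the test harness, the verification exercised by this test is enabled with a single option. A minimal usage sketch (the path and surrounding options are illustrative):

```cpp
// Minimal sketch: open a DB with SST unique-id verification enabled, as the
// test above does via Reopen(). The path is illustrative.
#include "rocksdb/db.h"

rocksdb::Status OpenWithUniqueIdVerify(rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Check each table's unique id against the MANIFEST on open; files lacking
  // db_id/db_session_id (e.g. from an older SstFileWriter) are skipped, as
  // the test verifies via the "Skipped" sync point.
  options.verify_sst_unique_id_in_manifest = true;
  return rocksdb::DB::Open(options, "/tmp/unique_id_demo", db);
}
```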
INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
testing::Values(std::make_tuple(true, true),
std::make_tuple(true, false),

View File

@ -22,6 +22,7 @@
#include "table/scoped_arena_iterator.h"
#include "table/sst_file_writer_collectors.h"
#include "table/table_builder.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/stop_watch.h"
@ -142,6 +143,9 @@ Status ExternalSstFileIngestionJob::Prepare(
ingestion_options_.failed_move_fall_back_to_copy) {
// Original file is on a different FS, use copy instead of hard linking.
f.copy_file = true;
ROCKS_LOG_INFO(db_options_.info_log,
"Triy to link file %s but it's not supported : %s",
path_outside_db.c_str(), status.ToString().c_str());
}
} else {
f.copy_file = true;
@ -446,8 +450,8 @@ Status ExternalSstFileIngestionJob::Run() {
f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
oldest_ancester_time, current_time, f.file_checksum,
f.file_checksum_func_name, kDisableUserTimestamp,
kDisableUserTimestamp);
f.file_checksum_func_name, kDisableUserTimestamp, kDisableUserTimestamp,
f.unique_id);
f_metadata.temperature = f.file_temperature;
edit_.AddFile(f.picked_level, f_metadata);
}
@ -727,6 +731,16 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
file_to_ingest->table_properties = *props;
auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
props->orig_file_number,
&(file_to_ingest->unique_id));
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"Failed to get SST unique id for file %s",
file_to_ingest->internal_file_path.c_str());
file_to_ingest->unique_id = kNullUniqueId64x2;
}
return status;
}
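Both ingestion (here) and import (below) follow the same fallback: derive the internal id from three table properties, and degrade to the null id, which `VersionEdit::EncodeTo()` later omits, when the properties are missing or malformed. A condensed sketch of that pattern, assuming the call shape shown above:

```cpp
// Condensed sketch of the fallback above. kNullUniqueId64x2 means "no id";
// EncodeTo() skips the kUniqueId custom field for such files.
UniqueId64x2 unique_id = kNullUniqueId64x2;
Status uid_s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
                                      props->orig_file_number, &unique_id);
if (!uid_s.ok()) {
  // e.g. an older SstFileWriter left db_id/db_session_id empty: keep the
  // null id and log a warning rather than failing the ingestion.
  unique_id = kNullUniqueId64x2;
}
```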

View File

@ -70,6 +70,8 @@ struct IngestedFileInfo {
std::string file_checksum_func_name;
// The temperature of the file to be ingested
Temperature file_temperature = Temperature::kUnknown;
// Unique id of the file to be ingested
UniqueId64x2 unique_id{};
};
class ExternalSstFileIngestionJob {

View File

@ -9,9 +9,8 @@
#include "db/flush_job.h"
#include <cinttypes>
#include <algorithm>
#include <cinttypes>
#include <vector>
#include "db/builder.h"
@ -810,6 +809,7 @@ Status FlushJob::WriteLevel0Table() {
{
auto write_hint = cfd_->CalculateSSTWriteHint(0);
Env::IOPriority io_priority = GetRateLimiterPriorityForWrite();
db_mutex_->Unlock();
if (log_buffer_) {
log_buffer_->FlushBufferToLog();
@ -925,7 +925,7 @@ Status FlushJob::WriteLevel0Table() {
snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
cfd_->internal_stats(), &io_s, io_tracer_,
BlobFileCreationReason::kFlush, event_logger_, job_context_->job_id,
Env::IO_HIGH, &table_properties_, write_hint, full_history_ts_low,
io_priority, &table_properties_, write_hint, full_history_ts_low,
blob_callback_, &num_input_entries, &memtable_payload_bytes,
&memtable_garbage_bytes);
// TODO: Cleanup io_status in BuildTable and table builders
@ -951,14 +951,14 @@ Status FlushJob::WriteLevel0Table() {
}
LogFlush(db_options_.info_log);
}
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
" bytes %s"
"%s",
cfd_->GetName().c_str(), job_context_->job_id,
meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
s.ToString().c_str(),
meta_.marked_for_compaction ? " (needs compaction)" : "");
ROCKS_LOG_BUFFER(log_buffer_,
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
" bytes %s"
"%s",
cfd_->GetName().c_str(), job_context_->job_id,
meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
s.ToString().c_str(),
meta_.marked_for_compaction ? " (needs compaction)" : "");
if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
s = output_file_directory_->FsyncWithDirOptions(
@ -988,7 +988,7 @@ Status FlushJob::WriteLevel0Table() {
meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
meta_.file_creation_time, meta_.file_checksum,
meta_.file_checksum_func_name, meta_.min_timestamp,
meta_.max_timestamp);
meta_.max_timestamp, meta_.unique_id);
edit_->SetBlobFileAdditions(std::move(blob_file_additions));
}
@ -1032,6 +1032,19 @@ Status FlushJob::WriteLevel0Table() {
return s;
}
Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() {
if (versions_ && versions_->GetColumnFamilySet() &&
versions_->GetColumnFamilySet()->write_controller()) {
WriteController* write_controller =
versions_->GetColumnFamilySet()->write_controller();
if (write_controller->IsStopped() || write_controller->NeedsDelay()) {
return Env::IO_USER;
}
}
return Env::IO_HIGH;
}
#ifndef ROCKSDB_LITE
std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
db_mutex_->AssertHeld();
@ -1064,7 +1077,6 @@ std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
}
return info;
}
#endif // !ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
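The new priority policy is small enough to restate on its own; a hypothetical distillation (not a RocksDB helper) of the rule above: a flush that is needed to clear a write stall should not itself be throttled behind other high-priority IO.

```cpp
// Hypothetical distillation of GetRateLimiterPriorityForWrite() above:
// promote flush writes from IO_HIGH to IO_USER while the WriteController
// reports a stop or delay, since completing the flush is what clears it.
Env::IOPriority PickFlushWritePriority(const WriteController& wc) {
  return (wc.IsStopped() || wc.NeedsDelay()) ? Env::IO_USER : Env::IO_HIGH;
}
```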

View File

@ -94,6 +94,8 @@ class FlushJob {
#endif // !ROCKSDB_LITE
private:
friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test;
void ReportStartedFlush();
void ReportFlushInputSize(const autovector<MemTable*>& mems);
void RecordFlushIOStats();
@ -121,6 +123,8 @@ class FlushJob {
// process has not matured yet.
Status MemPurge();
bool MemPurgeDecider();
// The rate limiter priority (io_priority) is determined dynamically here.
Env::IOPriority GetRateLimiterPriorityForWrite();
#ifndef ROCKSDB_LITE
std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
#endif // !ROCKSDB_LITE

View File

@ -528,6 +528,72 @@ TEST_F(FlushJobTest, Snapshots) {
job_context.Clean();
}
TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
// Prepare a FlushJob that flush MemTables of Single Column Family.
const size_t num_mems = 2;
const size_t num_mems_to_flush = 1;
const size_t num_keys_per_table = 100;
JobContext job_context(0);
ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
std::vector<uint64_t> memtable_ids;
std::vector<MemTable*> new_mems;
for (size_t i = 0; i != num_mems; ++i) {
MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
kMaxSequenceNumber);
mem->SetID(i);
mem->Ref();
new_mems.emplace_back(mem);
memtable_ids.push_back(mem->GetID());
for (size_t j = 0; j < num_keys_per_table; ++j) {
std::string key(std::to_string(j + i * num_keys_per_table));
std::string value("value" + key);
ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
key, value, nullptr /* kv_prot_info */));
}
}
autovector<MemTable*> to_delete;
for (auto mem : new_mems) {
cfd->imm()->Add(mem, &to_delete);
}
EventLogger event_logger(db_options_.info_log.get());
SnapshotChecker* snapshot_checker = nullptr;  // not relevant
assert(memtable_ids.size() == num_mems);
uint64_t smallest_memtable_id = memtable_ids.front();
uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
FlushJob flush_job(
dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
*cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
db_options_.statistics.get(), &event_logger, true,
true /* sync_output_directory */, true /* write_manifest */,
Env::Priority::USER, nullptr /*IOTracer*/);
// When the state from WriteController is normal.
ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH);
WriteController* write_controller =
flush_job.versions_->GetColumnFamilySet()->write_controller();
{
// When the state from WriteController is Delayed.
std::unique_ptr<WriteControllerToken> delay_token =
write_controller->GetDelayToken(1000000);
ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
}
{
// When the state from WriteController is Stopped.
std::unique_ptr<WriteControllerToken> stop_token =
write_controller->GetStopToken();
ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
}
}
class FlushJobTimestampTest : public FlushJobTestBase {
public:
FlushJobTimestampTest()

View File

@ -15,6 +15,7 @@
#include "table/scoped_arena_iterator.h"
#include "table/sst_file_writer_collectors.h"
#include "table/table_builder.h"
#include "table/unique_id_impl.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
@ -97,6 +98,9 @@ Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
if (status.IsNotSupported()) {
// Original file is on a different FS, use copy instead of hard linking
hardlink_files = false;
ROCKS_LOG_INFO(db_options_.info_log,
"Try to link file %s but it's not supported : %s",
f.internal_file_path.c_str(), status.ToString().c_str());
}
}
if (!hardlink_files) {
@ -156,7 +160,7 @@ Status ImportColumnFamilyJob::Run() {
file_metadata.largest_seqno, false, file_metadata.temperature,
kInvalidBlobFileNumber, oldest_ancester_time, current_time,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, f.unique_id);
// If incoming sequence number is higher, update local sequence number.
if (file_metadata.largest_seqno > versions_->LastSequence()) {
@ -285,6 +289,15 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
file_to_import->table_properties = *props;
auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
props->orig_file_number,
&(file_to_import->unique_id));
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"Failed to get SST unique id for file %s",
file_to_import->internal_file_path.c_str());
}
return status;
}

View File

@ -130,6 +130,12 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value));
ASSERT_EQ(value, "V2");
}
EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
import_cfh_ = nullptr;
// verify sst unique id during reopen
options.verify_sst_unique_id_in_manifest = true;
ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options);
}
TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {

View File

@ -82,6 +82,7 @@
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/scoped_arena_iterator.h"
#include "table/unique_id_impl.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@ -505,6 +506,15 @@ class Repairer {
t->meta.fd, &props);
}
if (status.ok()) {
auto s =
GetSstInternalUniqueId(props->db_id, props->db_session_id,
props->orig_file_number, &t->meta.unique_id);
if (!s.ok()) {
ROCKS_LOG_WARN(db_options_.info_log,
"Table #%" PRIu64
": unable to get unique id, default to Unknown.",
t->meta.fd.GetNumber());
}
t->column_family_id = static_cast<uint32_t>(props->column_family_id);
if (t->column_family_id ==
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
@ -639,7 +649,8 @@ class Repairer {
table->meta.temperature, table->meta.oldest_blob_file_number,
table->meta.oldest_ancester_time, table->meta.file_creation_time,
table->meta.file_checksum, table->meta.file_checksum_func_name,
table->meta.min_timestamp, table->meta.max_timestamp);
table->meta.min_timestamp, table->meta.max_timestamp,
table->meta.unique_id);
}
assert(next_file_number_ > 0);
vset_.MarkFileNumberUsed(next_file_number_ - 1);

View File

@ -43,6 +43,23 @@ class RepairTest : public DBTestBase {
}
return s;
}
void ReopenWithSstIdVerify() {
std::atomic_int verify_passed{0};
SyncPoint::GetInstance()->SetCallBack(
"Version::VerifySstUniqueIds::Passed", [&](void* arg) {
// count each successful unique id verification
auto id = static_cast<std::string*>(arg);
assert(!id->empty());
verify_passed++;
});
SyncPoint::GetInstance()->EnableProcessing();
auto options = CurrentOptions();
options.verify_sst_unique_id_in_manifest = true;
Reopen(options);
ASSERT_GT(verify_passed, 0);
}
};
TEST_F(RepairTest, LostManifest) {
@ -61,7 +78,7 @@ TEST_F(RepairTest, LostManifest) {
ASSERT_OK(env_->FileExists(manifest_path));
ASSERT_OK(env_->DeleteFile(manifest_path));
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
ASSERT_EQ(Get("key"), "val");
ASSERT_EQ(Get("key2"), "val2");
@ -88,7 +105,9 @@ TEST_F(RepairTest, LostManifestMoreDbFeatures) {
ASSERT_OK(env_->FileExists(manifest_path));
ASSERT_OK(env_->DeleteFile(manifest_path));
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
// repair from sst should work with unique_id verification
ReopenWithSstIdVerify();
ASSERT_EQ(Get("key"), "val");
ASSERT_EQ(Get("key2"), "NOT_FOUND");
@ -113,7 +132,8 @@ TEST_F(RepairTest, CorruptManifest) {
ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah",
false /* use_fsync */));
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
ASSERT_EQ(Get("key"), "val");
ASSERT_EQ(Get("key2"), "val2");
@ -139,7 +159,8 @@ TEST_F(RepairTest, IncompleteManifest) {
// Replace the manifest with one that is only aware of the first SST file.
CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
ASSERT_EQ(Get("key"), "val");
ASSERT_EQ(Get("key2"), "val2");
@ -157,7 +178,8 @@ TEST_F(RepairTest, PostRepairSstFileNumbering) {
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
ASSERT_GE(post_repair_file_num, pre_repair_file_num);
}
@ -176,7 +198,7 @@ TEST_F(RepairTest, LostSst) {
Close();
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
// Exactly one of the key-value pairs should be in the DB now.
ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
@ -198,7 +220,7 @@ TEST_F(RepairTest, CorruptSst) {
Close();
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
// Exactly one of the key-value pairs should be in the DB now.
ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
@ -226,7 +248,7 @@ TEST_F(RepairTest, UnflushedSst) {
ASSERT_OK(env_->FileExists(manifest_path));
ASSERT_OK(env_->DeleteFile(manifest_path));
ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
ASSERT_EQ(wal_files.size(), 0);
@ -265,7 +287,7 @@ TEST_F(RepairTest, SeparateWalDir) {
// make sure that all WALs are converted to SSTables.
options.wal_dir = "";
Reopen(options);
ReopenWithSstIdVerify();
ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
ASSERT_EQ(wal_files.size(), 0);
{
@ -398,7 +420,7 @@ TEST_F(RepairTest, DbNameContainsTrailingSlash) {
Close();
ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
Reopen(CurrentOptions());
ReopenWithSstIdVerify();
ASSERT_EQ(Get("key"), "val");
}
#endif // ROCKSDB_LITE

View File

@ -32,14 +32,31 @@
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
namespace {
template <class T>
static void DeleteEntry(const Slice& /*key*/, void* value) {
T* typed_value = reinterpret_cast<T*>(value);
delete typed_value;
}
} // namespace
} // namespace ROCKSDB_NAMESPACE
// Generate the regular and coroutine versions of some methods by
// including table_cache_sync_and_async.h twice
// Macros in the header will expand differently based on whether
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
// clang-format off
#define WITHOUT_COROUTINES
#include "db/table_cache_sync_and_async.h"
#undef WITHOUT_COROUTINES
#define WITH_COROUTINES
#include "db/table_cache_sync_and_async.h"
#undef WITH_COROUTINES
// clang-format on
namespace ROCKSDB_NAMESPACE {
namespace {
static void UnrefEntry(void* arg1, void* arg2) {
Cache* cache = reinterpret_cast<Cache*>(arg1);
@ -484,131 +501,6 @@ Status TableCache::Get(
return s;
}
// Batched version of TableCache::MultiGet.
Status TableCache::MultiGet(
const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, int level) {
auto& fd = file_meta.fd;
Status s;
TableReader* t = fd.table_reader;
Cache::Handle* handle = nullptr;
MultiGetRange table_range(*mget_range, mget_range->begin(),
mget_range->end());
#ifndef ROCKSDB_LITE
autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
IterKey row_cache_key;
size_t row_cache_key_prefix_size = 0;
KeyContext& first_key = *table_range.begin();
bool lookup_row_cache =
ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
// Check row cache if enabled. Since row cache does not currently store
// sequence numbers, we cannot use it if we need to fetch the sequence.
if (lookup_row_cache) {
GetContext* first_context = first_key.get_context;
CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
row_cache_key);
row_cache_key_prefix_size = row_cache_key.Size();
for (auto miter = table_range.begin(); miter != table_range.end();
++miter) {
const Slice& user_key = miter->ukey_with_ts;
GetContext* get_context = miter->get_context;
if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
get_context)) {
table_range.SkipKey(miter);
} else {
row_cache_entries.emplace_back();
get_context->SetReplayLog(&(row_cache_entries.back()));
}
}
}
#endif // ROCKSDB_LITE
// Check that table_range is not empty. It's possible all keys were found
// in the row cache, in which case the range is now empty
if (s.ok() && !table_range.empty()) {
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, fd, &handle,
prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
assert(t);
}
}
if (s.ok() && !options.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
t->NewRangeTombstoneIterator(options));
if (range_del_iter != nullptr) {
for (auto iter = table_range.begin(); iter != table_range.end();
++iter) {
SequenceNumber* max_covering_tombstone_seq =
iter->get_context->max_covering_tombstone_seq();
*max_covering_tombstone_seq = std::max(
*max_covering_tombstone_seq,
range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
}
}
}
if (s.ok()) {
t->MultiGet(options, &table_range, prefix_extractor.get(), skip_filters);
} else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
Status* status = iter->s;
if (status->IsIncomplete()) {
// Couldn't find Table in cache but treat as kFound if no_io set
iter->get_context->MarkKeyMayExist();
s = Status::OK();
}
}
}
}
#ifndef ROCKSDB_LITE
if (lookup_row_cache) {
size_t row_idx = 0;
for (auto miter = table_range.begin(); miter != table_range.end();
++miter) {
std::string& row_cache_entry = row_cache_entries[row_idx++];
const Slice& user_key = miter->ukey_with_ts;
GetContext* get_context = miter->get_context;
get_context->SetReplayLog(nullptr);
// Compute row cache key.
row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
user_key.size());
// Put the replay log in row cache only if something was found.
if (s.ok() && !row_cache_entry.empty()) {
size_t charge = row_cache_entry.capacity() + sizeof(std::string);
void* row_ptr = new std::string(std::move(row_cache_entry));
// If row cache is full, it's OK.
ioptions_.row_cache
->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
&DeleteEntry<std::string>)
.PermitUncheckedError();
}
}
}
#endif // ROCKSDB_LITE
if (handle != nullptr) {
ReleaseHandle(handle);
}
return s;
}
Status TableCache::GetTableProperties(
const FileOptions& file_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,

View File

@ -24,6 +24,7 @@
#include "rocksdb/table.h"
#include "table/table_reader.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coro_utils.h"
namespace ROCKSDB_NAMESPACE {
@ -115,8 +116,8 @@ class TableCache {
// in the embedded GetContext
// @param skip_filters Disables loading/accessing the filter block
// @param level The level this table is at, -1 for "not set / don't know"
Status MultiGet(
const ReadOptions& options,
DECLARE_SYNC_AND_ASYNC(
Status, MultiGet, const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,

View File

@ -0,0 +1,140 @@
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "util/coro_utils.h"
#if defined(WITHOUT_COROUTINES) || \
(defined(USE_COROUTINES) && defined(WITH_COROUTINES))
namespace ROCKSDB_NAMESPACE {
#if defined(WITHOUT_COROUTINES)
#endif
// Batched version of TableCache::MultiGet.
DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
const std::shared_ptr<const SliceTransform>& prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters, int level) {
auto& fd = file_meta.fd;
Status s;
TableReader* t = fd.table_reader;
Cache::Handle* handle = nullptr;
MultiGetRange table_range(*mget_range, mget_range->begin(),
mget_range->end());
#ifndef ROCKSDB_LITE
autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
IterKey row_cache_key;
size_t row_cache_key_prefix_size = 0;
KeyContext& first_key = *table_range.begin();
bool lookup_row_cache =
ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
// Check row cache if enabled. Since row cache does not currently store
// sequence numbers, we cannot use it if we need to fetch the sequence.
if (lookup_row_cache) {
GetContext* first_context = first_key.get_context;
CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
row_cache_key);
row_cache_key_prefix_size = row_cache_key.Size();
for (auto miter = table_range.begin(); miter != table_range.end();
++miter) {
const Slice& user_key = miter->ukey_with_ts;
GetContext* get_context = miter->get_context;
if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
get_context)) {
table_range.SkipKey(miter);
} else {
row_cache_entries.emplace_back();
get_context->SetReplayLog(&(row_cache_entries.back()));
}
}
}
#endif // ROCKSDB_LITE
// Check that table_range is not empty. It's possible all keys were found
// in the row cache, in which case the range is now empty
if (s.ok() && !table_range.empty()) {
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, fd, &handle,
prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record_read_stats */, file_read_hist, skip_filters,
level, true /* prefetch_index_and_filter_in_cache */,
0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
if (s.ok()) {
t = GetTableReaderFromHandle(handle);
assert(t);
}
}
if (s.ok() && !options.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
t->NewRangeTombstoneIterator(options));
if (range_del_iter != nullptr) {
for (auto iter = table_range.begin(); iter != table_range.end();
++iter) {
SequenceNumber* max_covering_tombstone_seq =
iter->get_context->max_covering_tombstone_seq();
*max_covering_tombstone_seq = std::max(
*max_covering_tombstone_seq,
range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts));
}
}
}
if (s.ok()) {
CO_AWAIT(t->MultiGet)
(options, &table_range, prefix_extractor.get(), skip_filters);
} else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
Status* status = iter->s;
if (status->IsIncomplete()) {
// Couldn't find Table in cache but treat as kFound if no_io set
iter->get_context->MarkKeyMayExist();
s = Status::OK();
}
}
}
}
#ifndef ROCKSDB_LITE
if (lookup_row_cache) {
size_t row_idx = 0;
for (auto miter = table_range.begin(); miter != table_range.end();
++miter) {
std::string& row_cache_entry = row_cache_entries[row_idx++];
const Slice& user_key = miter->ukey_with_ts;
GetContext* get_context = miter->get_context;
get_context->SetReplayLog(nullptr);
// Compute row cache key.
row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
user_key.size());
// Put the replay log in row cache only if something was found.
if (s.ok() && !row_cache_entry.empty()) {
size_t charge = row_cache_entry.capacity() + sizeof(std::string);
void* row_ptr = new std::string(std::move(row_cache_entry));
// If row cache is full, it's OK.
ioptions_.row_cache
->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
&DeleteEntry<std::string>)
.PermitUncheckedError();
}
}
}
#endif // ROCKSDB_LITE
if (handle != nullptr) {
ReleaseHandle(handle);
}
CO_RETURN s;
}
} // namespace ROCKSDB_NAMESPACE
#endif
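This file is compiled twice by `db/table_cache.cc`, once with `WITHOUT_COROUTINES` and once with `WITH_COROUTINES`, so a single body yields both the blocking `MultiGet` and its coroutine variant. A simplified, self-contained illustration of the idea; the macro names below are made up for the sketch, while the real ones (`DECLARE_SYNC_AND_ASYNC`, `DEFINE_SYNC_AND_ASYNC`, `CO_AWAIT`, `CO_RETURN`) live in `util/coro_utils.h` and target `folly::coro::Task`:

```cpp
// mini_sync_and_async.h -- illustrative only, not RocksDB's coro_utils.h.
// Include this file twice from a .cc, defining WITHOUT_COROUTINES and then
// WITH_COROUTINES, exactly as db/table_cache.cc does above.
#undef DEF_SYNC_AND_ASYNC
#undef MAYBE_AWAIT
#undef MAYBE_RETURN
#if defined(WITHOUT_COROUTINES)
#define DEF_SYNC_AND_ASYNC(ret, name) ret name  // plain blocking function
#define MAYBE_AWAIT(fn) fn                      // plain call
#define MAYBE_RETURN return
#elif defined(WITH_COROUTINES)
#define DEF_SYNC_AND_ASYNC(ret, name) \
  folly::coro::Task<ret> name##Async            // coroutine variant
#define MAYBE_AWAIT(fn) co_await fn##Async      // awaited call
#define MAYBE_RETURN co_return
#endif

// One body, expanded twice. Fetch / FetchAsync are assumed to exist in the
// sync and async builds respectively.
DEF_SYNC_AND_ASYNC(int, Lookup)(int key) {
  int v = MAYBE_AWAIT(Fetch)(key);  // sync: Fetch(key); async: co_await FetchAsync(key)
  MAYBE_RETURN v + 1;
}
```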

View File

@ -12,6 +12,7 @@
#include "db/version_edit.h"
#include "db/version_set.h"
#include "rocksdb/advanced_options.h"
#include "table/unique_id_impl.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"
@ -72,7 +73,7 @@ class VersionBuilderTest : public testing::Test {
oldest_blob_file_number, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
f->compensated_file_size = file_size;
f->num_entries = num_entries;
f->num_deletions = num_deletions;
@ -128,13 +129,13 @@ class VersionBuilderTest : public testing::Test {
constexpr SequenceNumber largest_seqno = 300;
constexpr bool marked_for_compaction = false;
edit->AddFile(level, table_file_number, path_id, file_size,
GetInternalKey(smallest), GetInternalKey(largest),
smallest_seqno, largest_seqno, marked_for_compaction,
Temperature::kUnknown, blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
edit->AddFile(
level, table_file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
}
void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) {
@ -175,12 +176,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.DeleteFile(3, 27U);
EnvOptions env_options;
@ -219,12 +220,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
@ -266,12 +267,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
version_edit.DeleteFile(4, 6U);
@ -303,36 +304,36 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
UpdateVersionStorageInfo();
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
EnvOptions env_options;
constexpr TableCache* table_cache = nullptr;
@ -367,53 +368,53 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
kCompactionStyleLevel, nullptr, false);
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit.AddFile(
2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
ASSERT_OK(version_builder.Apply(&version_edit));
VersionEdit version_edit2;
version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
GetInternalKey("950"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
version_edit2.DeleteFile(2, 616);
version_edit2.DeleteFile(2, 636);
version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
GetInternalKey("850"), 200, 200, false,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
version_edit.AddFile(
2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
ASSERT_OK(version_builder.Apply(&version_edit2));
ASSERT_OK(version_builder.SaveTo(&new_vstorage));
@ -525,7 +526,7 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) {
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&addition));
@ -575,7 +576,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) {
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
const Status s = builder.Apply(&edit);
ASSERT_TRUE(s.IsCorruption());
@ -612,7 +613,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&edit));
@ -620,13 +621,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
constexpr int new_level = 2;
other_edit.AddFile(new_level, file_number, path_id, file_size,
GetInternalKey(smallest), GetInternalKey(largest),
smallest_seqno, largest_seqno, marked_for_compaction,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
other_edit.AddFile(
new_level, file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
const Status s = builder.Apply(&other_edit);
ASSERT_TRUE(s.IsCorruption());
@ -657,13 +658,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) {
VersionEdit addition;
addition.AddFile(level, file_number, path_id, file_size,
GetInternalKey(smallest), GetInternalKey(largest),
smallest_seqno, largest_seqno, marked_for_compaction,
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
addition.AddFile(
level, file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
ASSERT_OK(builder.Apply(&addition));
@ -1227,12 +1228,13 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) {
constexpr uint64_t total_blob_count = 234;
constexpr uint64_t total_blob_bytes = 1 << 22;
edit.AddFile(
level, table_file_number, path_id, file_size, GetInternalKey(smallest),
GetInternalKey(largest), smallest_seqno, largest_seqno,
marked_for_compaction, Temperature::kUnknown, blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime, checksum_value,
checksum_method, kDisableUserTimestamp, kDisableUserTimestamp);
edit.AddFile(level, table_file_number, path_id, file_size,
GetInternalKey(smallest), GetInternalKey(largest),
smallest_seqno, largest_seqno, marked_for_compaction,
Temperature::kUnknown, blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
checksum_value, checksum_method, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
checksum_method, checksum_value);
@ -1320,7 +1322,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
/* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
/* file_size */ 100, /* smallest */ GetInternalKey("801"),
@ -1330,7 +1332,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
/* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
/* total_blob_bytes */ 200000,
/* checksum_method */ std::string(),
@ -1552,7 +1554,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
/* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
// Add an SST that does not reference any blob files.
edit.AddFile(
@ -1563,7 +1565,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
// Delete a file that references a blob file.
edit.DeleteFile(/* level */ 1, /* file_number */ 6);
@ -1586,7 +1588,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
/* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
// Trivially move a file that does not reference any blob files.
edit.DeleteFile(/* level */ 1, /* file_number */ 13);
@ -1598,7 +1600,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
// Add one more SST file that references a blob file, then promptly
// delete it in a second version edit before the new version gets saved.
@ -1612,7 +1614,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
/* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
VersionEdit edit2;

View File

@ -13,6 +13,7 @@
#include "db/version_set.h"
#include "logging/event_logger.h"
#include "rocksdb/slice.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/string_util.h"
@ -221,6 +222,14 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
}
UniqueId64x2 unique_id = f.unique_id;
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
if (unique_id != kNullUniqueId64x2) {
PutVarint32(dst, NewFileCustomTag::kUniqueId);
std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
PutLengthPrefixedSlice(dst, Slice(unique_id_str));
}
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
dst);
@ -392,6 +401,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
case kMaxTimestamp:
f.max_timestamp = field.ToString();
break;
case kUniqueId:
if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
f.unique_id = kNullUniqueId64x2;
return "invalid unique id";
}
break;
default:
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
// Should not proceed if cannot understand it
@ -819,6 +834,11 @@ std::string VersionEdit::DebugString(bool hex_key) const {
// permanent
r.append(std::to_string(static_cast<int>(f.temperature)));
}
if (f.unique_id != kNullUniqueId64x2) {
r.append(" unique_id(internal): ");
UniqueId64x2 id = f.unique_id;
r.append(InternalUniqueIdToHumanString(&id));
}
}
for (const auto& blob_file_addition : blob_file_additions_) {

View File

@ -85,6 +85,7 @@ enum NewFileCustomTag : uint32_t {
kTemperature = 9,
kMinTimestamp = 10,
kMaxTimestamp = 11,
kUniqueId = 12,
// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
@ -102,6 +103,8 @@ constexpr uint64_t kUnknownFileCreationTime = 0;
extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
using UniqueId64x2 = std::array<uint64_t, 2>;
// A copyable structure contains information needed to read data from an SST
// file. It can contain a pointer to a table reader opened for the file, or
// file number and size, which can be used to create a new table reader for it.
@ -217,6 +220,9 @@ struct FileMetaData {
// Max (newest) timestamp of keys in this file
std::string max_timestamp;
// SST unique id
UniqueId64x2 unique_id{};
FileMetaData() = default;
FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
@ -227,7 +233,8 @@ struct FileMetaData {
uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
const std::string& _file_checksum,
const std::string& _file_checksum_func_name,
std::string _min_timestamp, std::string _max_timestamp)
std::string _min_timestamp, std::string _max_timestamp,
UniqueId64x2 _unique_id)
: fd(file, file_path_id, file_size, smallest_seq, largest_seq),
smallest(smallest_key),
largest(largest_key),
@ -239,7 +246,8 @@ struct FileMetaData {
file_checksum(_file_checksum),
file_checksum_func_name(_file_checksum_func_name),
min_timestamp(std::move(_min_timestamp)),
max_timestamp(std::move(_max_timestamp)) {
max_timestamp(std::move(_max_timestamp)),
unique_id(std::move(_unique_id)) {
TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
}
@ -408,7 +416,8 @@ class VersionEdit {
const std::string& file_checksum,
const std::string& file_checksum_func_name,
const std::string& min_timestamp,
const std::string& max_timestamp) {
const std::string& max_timestamp,
const UniqueId64x2& unique_id) {
assert(smallest_seqno <= largest_seqno);
new_files_.emplace_back(
level,
@ -416,7 +425,7 @@ class VersionEdit {
smallest_seqno, largest_seqno, marked_for_compaction,
temperature, oldest_blob_file_number, oldest_ancester_time,
file_creation_time, file_checksum, file_checksum_func_name,
min_timestamp, max_timestamp));
min_timestamp, max_timestamp, unique_id));
if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
SetLastSequence(largest_seqno);
}
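On the wire, the unique id is just one more NewFile4 custom field: a varint32 tag followed by a length-prefixed payload, with unknown tags skippable unless their bit in `kCustomTagNonSafeIgnoreMask` is set (see the enum above). A sketch of the encode side using the `util/coding.h` helpers from the diff; the `kTerminate` tag closing the field list is assumed from the full enum in `version_edit.h`:

```cpp
// Sketch of the NewFile4 custom-field format used by EncodeTo() above:
// (varint32 tag, length-prefixed payload) pairs, ended by a terminator tag.
std::string dst;
PutVarint32(&dst, NewFileCustomTag::kUniqueId);      // tag 12, added here
PutLengthPrefixedSlice(&dst, Slice(unique_id_str));  // varint32 len + bytes
PutVarint32(&dst, NewFileCustomTag::kTerminate);     // assumed terminator tag
// DecodeNewFile4From() mirrors this: read a tag, read the payload, dispatch
// in a switch, and fail only when an unknown tag has its
// non-safe-to-ignore bit set.
```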

View File

@ -11,6 +11,7 @@
#include "db/blob/blob_index.h"
#include "rocksdb/advanced_options.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
@ -43,7 +44,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
kInvalidBlobFileNumber, 888, 678, "234", "crc32c", "123",
"345");
"345", kNullUniqueId64x2);
edit.DeleteFile(4, kBig + 700 + i);
}
@ -62,26 +63,25 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123",
"234");
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", "234",
kNullUniqueId64x2);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345",
"543");
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345", "543",
kNullUniqueId64x2);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
"456", "567");
"456", "567", kNullUniqueId64x2);
edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
kBig + 603, true, Temperature::kUnknown, 1001,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678",
"789");
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678", "789",
kNullUniqueId64x2);
edit.DeleteFile(4, 700);
@ -129,13 +129,13 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123",
"234");
kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", "234",
kNullUniqueId64x2);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
686, 868, "234", "crc32c", kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
@ -188,7 +188,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
@ -219,7 +219,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) {
Temperature::kUnknown, kInvalidBlobFileNumber,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
std::string buffer;
ASSERT_TRUE(!edit.EncodeTo(&buffer));
}

View File

@ -38,6 +38,10 @@
#include "db/table_cache.h"
#include "db/version_builder.h"
#include "db/version_edit_handler.h"
#if USE_COROUTINES
#include "folly/experimental/coro/BlockingWait.h"
#include "folly/experimental/coro/Collect.h"
#endif
#include "file/filename.h"
#include "file/random_access_file_reader.h"
#include "file/read_write_util.h"
@ -59,13 +63,28 @@
#include "table/plain/plain_table_factory.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "table/unique_id_impl.h"
#include "test_util/sync_point.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/coro_utils.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/user_comparator_wrapper.h"
// Generate the regular and coroutine versions of some methods by
// including version_set_sync_and_async.h twice
// Macros in the header will expand differently based on whether
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
// clang-format off
#define WITHOUT_COROUTINES
#include "db/version_set_sync_and_async.h"
#undef WITHOUT_COROUTINES
#define WITH_COROUTINES
#include "db/version_set_sync_and_async.h"
#undef WITH_COROUTINES
// clang-format on
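The double-include trick above generates two bodies from one source: with WITHOUT_COROUTINES defined, the macros in util/coro_utils.h expand to plain function definitions, and with WITH_COROUTINES they expand to folly coroutine variants. A minimal self-contained sketch of the idea (simplified stand-in macros, not the real coro_utils.h definitions):

```
// Minimal sketch of the sync/async double-expansion idea. In RocksDB the
// shared body lives in version_set_sync_and_async.h and is included twice;
// these stand-in macros only model the WITHOUT_COROUTINES expansion.
#include <iostream>

#define DEFINE_SYNC_AND_ASYNC(ret_type, func_name) ret_type func_name
#define CO_AWAIT(func) func  // with coroutines: co_await func(...)
#define CO_RETURN return     // with coroutines: co_return

// One body serves both variants; under WITH_COROUTINES the real macros would
// emit something like folly::coro::Task<int> TimesTwoCoroutine instead.
DEFINE_SYNC_AND_ASYNC(int, TimesTwo)(int key) {
  CO_RETURN key * 2;
}

int main() {
  std::cout << TimesTwo(21) << "\n";  // prints 42
  return 0;
}
```

The real macros additionally rename the coroutine variant (as seen with MultiGetFromSSTCoroutine later in this diff), so both expansions can coexist in one translation unit.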
namespace ROCKSDB_NAMESPACE {
namespace {
@ -504,68 +523,63 @@ class FilePickerMultiGet {
return file_hit;
}
FdWithKeyRange* GetNextFile() {
while (!search_ended_) {
// Start searching next level.
if (batch_iter_ == current_level_range_.end()) {
search_ended_ = !PrepareNextLevel();
continue;
} else {
if (maybe_repeat_key_) {
maybe_repeat_key_ = false;
// Check if we found the final value for the last key in the
// previous lookup range. If we did, then there's no need to look
// any further for that key, so advance batch_iter_. Else, keep
// batch_iter_ positioned on that key so we look it up again in
// the next file
// For L0, always advance the key because we will look in the next
// file regardless for all keys not found yet
if (current_level_range_.CheckKeyDone(batch_iter_) ||
curr_level_ == 0) {
batch_iter_ = upper_key_;
}
}
// batch_iter_prev_ will become the start key for the next file
// lookup
batch_iter_prev_ = batch_iter_;
}
void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
current_level_range_.end());
size_t curr_file_index =
(batch_iter_ != current_level_range_.end())
? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
: curr_file_level_->num_files;
FdWithKeyRange* f;
bool is_last_key_in_file;
if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
&is_last_key_in_file)) {
search_ended_ = !PrepareNextLevel();
} else {
if (is_last_key_in_file) {
// Since cmp_largest is 0, batch_iter_ still points to the last key
// that falls in this file, instead of the next one. Increment
// the file index for all keys between batch_iter_ and upper_key_
auto tmp_iter = batch_iter_;
while (tmp_iter != upper_key_) {
++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
++tmp_iter;
}
maybe_repeat_key_ = true;
FdWithKeyRange* GetNextFileInLevel() {
if (batch_iter_ == current_level_range_.end() || search_ended_) {
return nullptr;
} else {
if (maybe_repeat_key_) {
maybe_repeat_key_ = false;
// Check if we found the final value for the last key in the
// previous lookup range. If we did, then there's no need to look
// any further for that key, so advance batch_iter_. Else, keep
// batch_iter_ positioned on that key so we look it up again in
// the next file
// For L0, always advance the key because we will look in the next
// file regardless for all keys not found yet
if (current_level_range_.CheckKeyDone(batch_iter_) ||
curr_level_ == 0) {
batch_iter_ = upper_key_;
}
// Set the range for this file
current_file_range_ =
MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
returned_file_level_ = curr_level_;
hit_file_level_ = curr_level_;
is_hit_file_last_in_level_ =
curr_file_index == curr_file_level_->num_files - 1;
return f;
}
// batch_iter_prev_ will become the start key for the next file
// lookup
batch_iter_prev_ = batch_iter_;
}
// Search ended
return nullptr;
MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
current_level_range_.end());
size_t curr_file_index =
(batch_iter_ != current_level_range_.end())
? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
: curr_file_level_->num_files;
FdWithKeyRange* f;
bool is_last_key_in_file;
if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
&is_last_key_in_file)) {
return nullptr;
} else {
if (is_last_key_in_file) {
// Since cmp_largest is 0, batch_iter_ still points to the last key
// that falls in this file, instead of the next one. Increment
// the file index for all keys between batch_iter_ and upper_key_
auto tmp_iter = batch_iter_;
while (tmp_iter != upper_key_) {
++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
++tmp_iter;
}
maybe_repeat_key_ = true;
}
// Set the range for this file
current_file_range_ =
MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
returned_file_level_ = curr_level_;
hit_file_level_ = curr_level_;
is_hit_file_last_in_level_ =
curr_file_index == curr_file_level_->num_files - 1;
return f;
}
}
// getter for current file level
@ -576,8 +590,16 @@ class FilePickerMultiGet {
// GetNextFile()) is at the last index in its level.
bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
bool IsSearchEnded() { return search_ended_; }
const MultiGetRange& CurrentFileRange() { return current_file_range_; }
bool RemainingOverlapInLevel() {
return !current_level_range_.Suffix(current_file_range_).empty();
}
private:
unsigned int num_levels_;
unsigned int curr_level_;
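With this refactor the picker no longer loops across levels internally: callers step through one level with GetNextFileInLevel() and advance levels explicitly via PrepareNextLevelForSearch(), polling IsSearchEnded(). A rough standalone sketch of the new driver shape (StubPicker is invented for illustration; the real loop appears in Version::MultiGet later in this diff):

```
// Standalone sketch of the new driver loop shape. Empty levels yield a null
// file until PrepareNextLevelForSearch() moves past them.
#include <iostream>
#include <vector>

struct StubPicker {
  std::vector<std::vector<int>> levels{{1, 2}, {}, {3}};  // file ids per level
  size_t level = 0;
  size_t idx = 0;
  bool ended = false;
  int* GetNextFileInLevel() {
    if (ended || idx >= levels[level].size()) {
      return nullptr;  // current level exhausted
    }
    return &levels[level][idx++];
  }
  void PrepareNextLevelForSearch() {
    ++level;
    idx = 0;
    ended = (level >= levels.size());
  }
  bool IsSearchEnded() const { return ended; }
};

int main() {
  StubPicker fp;
  int* f = fp.GetNextFileInLevel();
  while (!fp.IsSearchEnded()) {
    if (f != nullptr) {
      std::cout << "batched lookup in file " << *f << "\n";
      f = fp.GetNextFileInLevel();
    }
    if (f == nullptr) {
      fp.PrepareNextLevelForSearch();  // reached the end of this level
      if (!fp.IsSearchEnded()) {
        f = fp.GetNextFileInLevel();  // may still be null on an empty level
      }
    }
  }
  return 0;
}
```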
@ -1056,7 +1078,15 @@ void LevelIterator::Seek(const Slice& target) {
if (file_iter_.iter() != nullptr) {
file_iter_.Seek(target);
// Status::TryAgain indicates asynchronous request for retrieval of data
// blocks has been submitted. So it should return at this point and Seek
// should be called again to retrieve the requested block and execute the
// remaining code.
if (file_iter_.status() == Status::TryAgain()) {
return;
}
}
if (SkipEmptyFileForward() && prefix_extractor_ != nullptr &&
!read_options_.total_order_seek && !read_options_.auto_prefix_mode &&
file_iter_.iter() != nullptr && file_iter_.Valid()) {
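Per the comment in this hunk, a Status::TryAgain from Seek only means the asynchronous block read was submitted; the caller is expected to call Seek again once the read completes. A toy sketch of that retry contract (FakeIter and its string status are invented for illustration, not the RocksDB iterator API):

```
// Toy model of the TryAgain retry contract for an async Seek.
#include <iostream>
#include <string>

struct FakeIter {
  bool block_ready = false;
  std::string status = "OK";
  void Seek(const std::string& target) {
    if (!block_ready) {
      status = "TryAgain";  // async block read submitted; return early
      block_ready = true;   // pretend the read completes before the retry
      return;
    }
    status = "OK";
    std::cout << "positioned at first key >= " << target << "\n";
  }
};

int main() {
  FakeIter it;
  it.Seek("key42");
  if (it.status == "TryAgain") {
    it.Seek("key42");  // re-issue the Seek to finish positioning
  }
  return 0;
}
```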
@ -1534,6 +1564,40 @@ void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
*creation_time = oldest_time;
}
Status Version::VerifySstUniqueIds() const {
for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
if (meta->unique_id != kNullUniqueId64x2) {
std::shared_ptr<const TableProperties> props;
Status s =
GetTableProperties(&props, meta);  // may open the file if it's not cached
if (!s.ok()) {
return s;
}
UniqueId64x2 id;
s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
props->orig_file_number, &id);
if (!s.ok() || id != meta->unique_id) {
std::ostringstream oss;
oss << "SST #" << meta->fd.GetNumber() << " unique ID mismatch. ";
oss << "Manifest: "
<< InternalUniqueIdToHumanString(&(meta->unique_id)) << ", ";
if (s.ok()) {
oss << "Table Properties: " << InternalUniqueIdToHumanString(&id);
} else {
oss << "Failed to get Table Properties: " << s.ToString();
}
return Status::Corruption("VersionSet", oss.str());
}
TEST_SYNC_POINT_CALLBACK("Version::VerifySstUniqueIds::Passed", &id);
} else {
TEST_SYNC_POINT_CALLBACK("Version::VerifySstUniqueIds::Skipped", meta);
}
}
}
return Status::OK();
}
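For context, this check is reached when a DB is opened with the verify_sst_unique_id_in_manifest option added on this branch (see the stress-test flag later in this diff). A minimal usage sketch, assuming the public RocksDB API with error handling trimmed:

```
// Hedged usage sketch: opening a DB with unique-id verification enabled.
#include <cassert>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Opt in: DB open runs Version::VerifySstUniqueIds() and fails with
  // Status::Corruption on a MANIFEST/table-properties unique id mismatch.
  options.verify_sst_unique_id_in_manifest = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/unique_id_demo", &db);
  assert(s.ok());
  delete db;
  return 0;
}
```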
uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
// Estimation will be inaccurate when:
// (1) there exist merge keys
@ -2183,7 +2247,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
&file_picker_range,
&storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_,
&storage_info_.file_indexer_, user_comparator(), internal_comparator());
FdWithKeyRange* f = fp.GetNextFile();
FdWithKeyRange* f = fp.GetNextFileInLevel();
Status s;
uint64_t num_index_read = 0;
uint64_t num_filter_read = 0;
@ -2193,164 +2257,92 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
// blob_file => [[blob_idx, it], ...]
std::unordered_map<uint64_t, BlobReadRequests> blob_rqs;
int level = -1;
int prev_level = -1;
while (f != nullptr) {
MultiGetRange file_range = fp.CurrentFileRange();
bool timer_enabled =
GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
get_perf_context()->per_level_perf_context_enabled;
while (!fp.IsSearchEnded()) {
// This will be set to true later if we actually look up in a file in L0.
// For per level stats purposes, an L0 file is treated as a level
bool dump_stats_for_l0_file = false;
// Report MultiGet stats per level.
if (level >= 0 && level != (int)fp.GetHitFileLevel()) {
// Dump the stats if the search has moved to the next level and
// reset for next level.
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
num_filter_read = 0;
num_index_read = 0;
num_data_read = 0;
num_sst_read = 0;
level = fp.GetHitFileLevel();
}
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
s = table_cache_->MultiGet(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
fp.IsHitFileLastInLevel()),
fp.GetHitFileLevel());
// TODO: examine the behavior for corrupted key
if (timer_enabled) {
PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
fp.GetHitFileLevel());
}
if (!s.ok()) {
// TODO: Set status for individual keys appropriately
for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
*iter->s = s;
file_range.MarkKeyDone(iter);
}
return;
}
uint64_t batch_size = 0;
for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
++iter) {
GetContext& get_context = *iter->get_context;
Status* status = iter->s;
// The Status in the KeyContext takes precedence over GetContext state
// Status may be an error if there were any IO errors in the table
// reader. We never expect Status to be NotFound(), as that is
// determined by get_context
assert(!status->IsNotFound());
if (!status->ok()) {
file_range.MarkKeyDone(iter);
continue;
}
if (get_context.sample()) {
sample_file_read_inc(f->file_metadata);
}
batch_size++;
num_index_read += get_context.get_context_stats_.num_index_read;
num_filter_read += get_context.get_context_stats_.num_filter_read;
num_data_read += get_context.get_context_stats_.num_data_read;
num_sst_read += get_context.get_context_stats_.num_sst_read;
// Reset these stats since they're specific to a level
get_context.get_context_stats_.num_index_read = 0;
get_context.get_context_stats_.num_filter_read = 0;
get_context.get_context_stats_.num_data_read = 0;
get_context.get_context_stats_.num_sst_read = 0;
// report the counters before returning
if (get_context.State() != GetContext::kNotFound &&
get_context.State() != GetContext::kMerge &&
db_statistics_ != nullptr) {
get_context.ReportCounters();
} else {
if (iter->max_covering_tombstone_seq > 0) {
// The remaining files we look at will only contain covered keys, so
// we stop here for this key
file_picker_range.SkipKey(iter);
// Avoid using the coroutine version if we're looking in a L0 file, since
// L0 files won't be parallelized anyway. The regular synchronous version
// is faster.
if (!read_options.async_io || !using_coroutines() ||
fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
if (f) {
// Call MultiGetFromSST for looking up a single file
s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
fp.GetHitFileLevel(), fp.IsHitFileLastInLevel(), f,
blob_rqs, num_filter_read, num_index_read,
num_data_read, num_sst_read);
if (fp.GetHitFileLevel() == 0) {
dump_stats_for_l0_file = true;
}
}
switch (get_context.State()) {
case GetContext::kNotFound:
// Keep searching in other files
break;
case GetContext::kMerge:
// TODO: update per-level perfcontext user_key_return_count for kMerge
break;
case GetContext::kFound:
if (fp.GetHitFileLevel() == 0) {
RecordTick(db_statistics_, GET_HIT_L0);
} else if (fp.GetHitFileLevel() == 1) {
RecordTick(db_statistics_, GET_HIT_L1);
} else if (fp.GetHitFileLevel() >= 2) {
RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
}
PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
fp.GetHitFileLevel());
file_range.MarkKeyDone(iter);
if (iter->is_blob_index) {
if (iter->value) {
TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
&(*iter));
const Slice& blob_index_slice = *(iter->value);
BlobIndex blob_index;
Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
if (tmp_s.ok()) {
const uint64_t blob_file_num = blob_index.file_number();
blob_rqs[blob_file_num].emplace_back(
std::make_pair(blob_index, std::cref(*iter)));
} else {
*(iter->s) = tmp_s;
}
}
} else {
file_range.AddValueSize(iter->value->size());
if (file_range.GetValueSize() >
read_options.value_size_soft_limit) {
s = Status::Aborted();
break;
}
}
continue;
case GetContext::kDeleted:
// Use empty error message for speed
*status = Status::NotFound();
file_range.MarkKeyDone(iter);
continue;
case GetContext::kCorrupt:
*status =
Status::Corruption("corrupted key for ", iter->lkey->user_key());
file_range.MarkKeyDone(iter);
continue;
case GetContext::kUnexpectedBlobIndex:
ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
*status = Status::NotSupported(
"Encounter unexpected blob index. Please open DB with "
"ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
file_range.MarkKeyDone(iter);
continue;
if (s.ok()) {
f = fp.GetNextFileInLevel();
}
}
#if USE_COROUTINES
} else {
std::vector<folly::coro::Task<Status>> mget_tasks;
while (f != nullptr) {
mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
read_options, fp.CurrentFileRange(), fp.GetHitFileLevel(),
fp.IsHitFileLastInLevel(), f, blob_rqs, num_filter_read,
num_index_read, num_data_read, num_sst_read));
if (fp.KeyMaySpanNextFile()) {
break;
}
f = fp.GetNextFileInLevel();
}
if (mget_tasks.size() > 0) {
// Collect all results so far
std::vector<Status> statuses = folly::coro::blockingWait(
folly::coro::collectAllRange(std::move(mget_tasks))
.scheduleOn(&range->context()->executor()));
for (Status stat : statuses) {
if (!stat.ok()) {
s = stat;
}
}
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
if (s.ok() && fp.KeyMaySpanNextFile()) {
f = fp.GetNextFileInLevel();
}
}
#endif // USE_COROUTINES
}
// If bad status or we found final result for all the keys
if (!s.ok() || file_picker_range.empty()) {
break;
}
f = fp.GetNextFile();
if (!f) {
// Reached the end of this level. Prepare the next level
fp.PrepareNextLevelForSearch();
if (!fp.IsSearchEnded()) {
// It's possible there is no overlap on this level and f is nullptr
f = fp.GetNextFileInLevel();
}
if (dump_stats_for_l0_file ||
(prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
// Dump the stats if the search has moved to the next level and
// reset for next level.
if (num_sst_read || (num_filter_read + num_index_read)) {
RecordInHistogram(db_statistics_,
NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
num_index_read + num_filter_read);
RecordInHistogram(db_statistics_, NUM_DATA_BLOCKS_READ_PER_LEVEL,
num_data_read);
RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
num_sst_read);
}
num_filter_read = 0;
num_index_read = 0;
num_data_read = 0;
num_sst_read = 0;
}
prev_level = fp.GetHitFileLevel();
}
}
// Dump stats for most recent level
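The coroutine branch above batches one task per overlapping file and then blocks once for the whole batch. A minimal folly sketch of that fan-out/collect pattern (assumes folly built with coroutine support; the real code schedules on the executor carried in the MultiGet context rather than a local thread pool):

```
// Fan out per-file lookup tasks, then block once for all results.
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/experimental/coro/BlockingWait.h>
#include <folly/experimental/coro/Collect.h>
#include <folly/experimental/coro/Task.h>
#include <iostream>
#include <vector>

folly::coro::Task<int> LookupFile(int file_id) {
  co_return file_id * 10;  // stand-in for one per-file MultiGet batch
}

int main() {
  folly::CPUThreadPoolExecutor executor(2);
  std::vector<folly::coro::Task<int>> tasks;
  for (int f : {1, 2, 3}) {
    tasks.emplace_back(LookupFile(f));
  }
  std::vector<int> results = folly::coro::blockingWait(
      folly::coro::collectAllRange(std::move(tasks)).scheduleOn(&executor));
  for (int r : results) {
    std::cout << r << "\n";  // 10 20 30
  }
  return 0;
}
```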
@ -5492,13 +5484,14 @@ Status VersionSet::WriteCurrentStateToManifest(
for (const auto& f : level_files) {
assert(f);
edit.AddFile(
level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
f->smallest, f->largest, f->fd.smallest_seqno,
f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->file_checksum,
f->file_checksum_func_name, f->min_timestamp, f->max_timestamp);
edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction, f->temperature,
f->oldest_blob_file_number, f->oldest_ancester_time,
f->file_creation_time, f->file_checksum,
f->file_checksum_func_name, f->min_timestamp,
f->max_timestamp, f->unique_id);
}
}

View File

@ -54,6 +54,7 @@
#include "table/get_context.h"
#include "table/multiget_context.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coro_utils.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {
@ -851,6 +852,8 @@ class Version {
const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
Status VerifySstUniqueIds() const;
private:
Env* env_;
SystemClock* clock_;
@ -882,6 +885,14 @@ class Version {
// This accumulated stats will be used in compaction.
void UpdateAccumulatedStats();
DECLARE_SYNC_AND_ASYNC(
/* ret_type */ Status, /* func_name */ MultiGetFromSST,
const ReadOptions& read_options, MultiGetRange file_range,
int hit_file_level, bool is_hit_file_last_in_level, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
uint64_t& num_filter_read, uint64_t& num_index_read,
uint64_t& num_data_read, uint64_t& num_sst_read);
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
Logger* info_log_;
Statistics* db_statistics_;

View File

@ -0,0 +1,154 @@
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "util/coro_utils.h"
#if defined(WITHOUT_COROUTINES) || \
(defined(USE_COROUTINES) && defined(WITH_COROUTINES))
namespace ROCKSDB_NAMESPACE {
// Lookup a batch of keys in a single SST file
DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
bool is_hit_file_last_in_level, FdWithKeyRange* f,
std::unordered_map<uint64_t, BlobReadRequests>& blob_rqs,
uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_data_read,
uint64_t& num_sst_read) {
bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
get_perf_context()->per_level_perf_context_enabled;
Status s;
StopWatchNano timer(clock_, timer_enabled /* auto_start */);
s = CO_AWAIT(table_cache_->MultiGet)(
read_options, *internal_comparator(), *f->file_metadata, &file_range,
mutable_cf_options_.prefix_extractor,
cfd_->internal_stats()->GetFileReadHist(hit_file_level),
IsFilterSkipped(static_cast<int>(hit_file_level),
is_hit_file_last_in_level),
hit_file_level);
// TODO: examine the behavior for corrupted key
if (timer_enabled) {
PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
hit_file_level);
}
if (!s.ok()) {
// TODO: Set status for individual keys appropriately
for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
*iter->s = s;
file_range.MarkKeyDone(iter);
}
CO_RETURN s;
}
uint64_t batch_size = 0;
for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
++iter) {
GetContext& get_context = *iter->get_context;
Status* status = iter->s;
// The Status in the KeyContext takes precedence over GetContext state
// Status may be an error if there were any IO errors in the table
// reader. We never expect Status to be NotFound(), as that is
// determined by get_context
assert(!status->IsNotFound());
if (!status->ok()) {
file_range.MarkKeyDone(iter);
continue;
}
if (get_context.sample()) {
sample_file_read_inc(f->file_metadata);
}
batch_size++;
num_index_read += get_context.get_context_stats_.num_index_read;
num_filter_read += get_context.get_context_stats_.num_filter_read;
num_data_read += get_context.get_context_stats_.num_data_read;
num_sst_read += get_context.get_context_stats_.num_sst_read;
// Reset these stats since they're specific to a level
get_context.get_context_stats_.num_index_read = 0;
get_context.get_context_stats_.num_filter_read = 0;
get_context.get_context_stats_.num_data_read = 0;
get_context.get_context_stats_.num_sst_read = 0;
// report the counters before returning
if (get_context.State() != GetContext::kNotFound &&
get_context.State() != GetContext::kMerge &&
db_statistics_ != nullptr) {
get_context.ReportCounters();
} else {
if (iter->max_covering_tombstone_seq > 0) {
// The remaining files we look at will only contain covered keys, so
// we stop here for this key
file_range.SkipKey(iter);
}
}
switch (get_context.State()) {
case GetContext::kNotFound:
// Keep searching in other files
break;
case GetContext::kMerge:
// TODO: update per-level perfcontext user_key_return_count for kMerge
break;
case GetContext::kFound:
if (hit_file_level == 0) {
RecordTick(db_statistics_, GET_HIT_L0);
} else if (hit_file_level == 1) {
RecordTick(db_statistics_, GET_HIT_L1);
} else if (hit_file_level >= 2) {
RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
}
PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
file_range.MarkKeyDone(iter);
if (iter->is_blob_index) {
if (iter->value) {
TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
&(*iter));
const Slice& blob_index_slice = *(iter->value);
BlobIndex blob_index;
Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
if (tmp_s.ok()) {
const uint64_t blob_file_num = blob_index.file_number();
blob_rqs[blob_file_num].emplace_back(
std::make_pair(blob_index, std::cref(*iter)));
} else {
*(iter->s) = tmp_s;
}
}
} else {
file_range.AddValueSize(iter->value->size());
if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
s = Status::Aborted();
break;
}
}
continue;
case GetContext::kDeleted:
// Use empty error message for speed
*status = Status::NotFound();
file_range.MarkKeyDone(iter);
continue;
case GetContext::kCorrupt:
*status =
Status::Corruption("corrupted key for ", iter->lkey->user_key());
file_range.MarkKeyDone(iter);
continue;
case GetContext::kUnexpectedBlobIndex:
ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
*status = Status::NotSupported(
"Encounter unexpected blob index. Please open DB with "
"ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
file_range.MarkKeyDone(iter);
continue;
}
}
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
CO_RETURN s;
}
} // namespace ROCKSDB_NAMESPACE
#endif

View File

@ -18,6 +18,7 @@
#include "rocksdb/file_system.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/mock_table.h"
#include "table/unique_id_impl.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"
@ -49,7 +50,7 @@ class GenerateLevelFilesBriefTest : public testing::Test {
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp);
kDisableUserTimestamp, kNullUniqueId64x2);
files_.push_back(f);
}
@ -158,7 +159,7 @@ class VersionStorageInfoTestBase : public testing::Test {
Temperature::kUnknown, oldest_blob_file_number,
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
f->compensated_file_size = file_size;
vstorage_.AddFile(level, f);
}
@ -3222,11 +3223,11 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
ASSERT_OK(s);
ASSERT_NE(0, file_size);
file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
0, kUnknownFileChecksum,
kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
file_metas->emplace_back(
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
Temperature::kUnknown, 0, 0, 0, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kDisableUserTimestamp,
kDisableUserTimestamp, kNullUniqueId64x2);
}
}
@ -3282,7 +3283,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
added_files.emplace_back(0, meta);
}
WriteFileAdditionAndDeletionToManifest(
@ -3338,7 +3339,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
kUnknownFileChecksum, kUnknownFileChecksumFuncName,
kDisableUserTimestamp, kDisableUserTimestamp);
kDisableUserTimestamp, kDisableUserTimestamp, kNullUniqueId64x2);
added_files.emplace_back(0, meta);
}
WriteFileAdditionAndDeletionToManifest(

View File

@ -52,7 +52,7 @@ class WriteController {
bool IsStopped() const;
bool NeedsDelay() const { return total_delayed_.load() > 0; }
bool NeedSpeedupCompaction() const {
return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0;
return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
}
// return how many microseconds the caller needs to sleep after the call
// num_bytes: how many number of bytes to put into the DB.

View File

@ -108,6 +108,7 @@ DECLARE_double(memtable_prefix_bloom_size_ratio);
DECLARE_bool(memtable_whole_key_filtering);
DECLARE_int32(open_files);
DECLARE_int64(compressed_cache_size);
DECLARE_int32(compressed_cache_numshardbits);
DECLARE_int32(compaction_style);
DECLARE_int32(num_levels);
DECLARE_int32(level0_file_num_compaction_trigger);
@ -135,7 +136,9 @@ DECLARE_int32(set_in_place_one_in);
DECLARE_int64(cache_size);
DECLARE_int32(cache_numshardbits);
DECLARE_bool(cache_index_and_filter_blocks);
DECLARE_bool(reserve_table_reader_memory);
DECLARE_bool(charge_compression_dictionary_building_buffer);
DECLARE_bool(charge_filter_construction);
DECLARE_bool(charge_table_reader);
DECLARE_int32(top_level_index_pinning);
DECLARE_int32(partition_pinning);
DECLARE_int32(unpartitioned_pinning);
@ -218,6 +221,7 @@ DECLARE_int32(compression_max_dict_bytes);
DECLARE_int32(compression_zstd_max_train_bytes);
DECLARE_int32(compression_parallel_threads);
DECLARE_uint64(compression_max_dict_buffer_bytes);
DECLARE_bool(compression_use_zstd_dict_trainer);
DECLARE_string(checksum_type);
DECLARE_string(env_uri);
DECLARE_string(fs_uri);
@ -289,6 +293,7 @@ DECLARE_uint64(wp_commit_cache_bits);
DECLARE_bool(adaptive_readahead);
DECLARE_bool(async_io);
DECLARE_string(wal_compression);
DECLARE_bool(verify_sst_unique_id_in_manifest);
constexpr long KB = 1024;
constexpr int kRandomValueMaxFactor = 3;

View File

@ -187,9 +187,15 @@ DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files,
"Maximum number of files to keep open at the same time "
"(use default if == 0)");
DEFINE_int64(compressed_cache_size, -1,
DEFINE_int64(compressed_cache_size, 0,
"Number of bytes to use as a cache of compressed data."
" Negative means use default settings.");
" 0 means use default settings.");
DEFINE_int32(
compressed_cache_numshardbits, -1,
"Number of shards for the compressed block cache is 2 ** "
"compressed_cache_numshardbits. Negative value means default settings. "
"This is applied only if compressed_cache_size is greater than 0.");
DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style,
"");
@ -304,10 +310,20 @@ DEFINE_int32(cache_numshardbits, 6,
DEFINE_bool(cache_index_and_filter_blocks, false,
"True if indexes/filters should be cached in block cache.");
DEFINE_bool(reserve_table_reader_memory, false,
"A dynamically updating charge to block cache, loosely based on "
"the actual memory usage of table reader, will occur to account "
"the memory, if block cache available.");
DEFINE_bool(charge_compression_dictionary_building_buffer, false,
"Setting for "
"CacheEntryRoleOptions::charged of"
"CacheEntryRole::kCompressionDictionaryBuildingBuffer");
DEFINE_bool(charge_filter_construction, false,
"Setting for "
"CacheEntryRoleOptions::charged of"
"CacheEntryRole::kFilterConstruction");
DEFINE_bool(charge_table_reader, false,
"Setting for "
"CacheEntryRoleOptions::charged of"
"CacheEntryRole::kBlockBasedTableReader");
DEFINE_int32(
top_level_index_pinning,
@ -736,6 +752,13 @@ DEFINE_uint64(compression_max_dict_buffer_bytes, 0,
"Buffering limit for SST file data to sample for dictionary "
"compression.");
DEFINE_bool(
compression_use_zstd_dict_trainer, true,
"Use zstd's trainer to generate dictionary. If the options is false, "
"zstd's finalizeDictionary() API is used to generate dictionary. "
"ZSTD 1.4.5+ is required. If ZSTD 1.4.5+ is not linked with the binary, "
"this flag will have the default value true.");
DEFINE_string(bottommost_compression_type, "disable",
"Algorithm to use to compress bottommost level of the database. "
"\"disable\" means disabling the feature");
@ -936,4 +959,10 @@ DEFINE_bool(
DEFINE_string(wal_compression, "none",
"Algorithm to use for WAL compression. none to disable.");
DEFINE_bool(
verify_sst_unique_id_in_manifest, false,
"Enable DB options `verify_sst_unique_id_in_manifest`, if true, during "
"DB-open try verifying the SST unique id between MANIFEST and SST "
"properties.");
#endif // GFLAGS

View File

@ -173,7 +173,10 @@ void DbStressListener::VerifyTableFileUniqueId(
const TableProperties& new_file_properties, const std::string& file_path) {
// Verify unique ID
std::string id;
Status s = GetUniqueIdFromTableProperties(new_file_properties, &id);
// Unit tests verify that GetUniqueIdFromTableProperties returns just a
// substring of this, and we're only going to pull out 64 bits, so using
// GetExtendedUniqueIdFromTableProperties is arguably stronger testing here.
Status s = GetExtendedUniqueIdFromTableProperties(new_file_properties, &id);
if (!s.ok()) {
fprintf(stderr, "Error getting SST unique id for %s: %s\n",
file_path.c_str(), s.ToString().c_str());
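A small sketch of the "pull out 64 bits" step mentioned in the comment above (the memcpy-a-prefix detail is an assumption for illustration, not a quote of the listener code):

```
// Extract the first 64 bits of a unique id string for a uniqueness check.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

int main() {
  std::string id = "0123456789abcdefXYZW";  // stand-in extended unique id
  uint64_t prefix = 0;
  static_assert(sizeof(prefix) == 8, "need 8 bytes for the 64-bit prefix");
  std::memcpy(&prefix, id.data(), sizeof(prefix));  // first 64 bits only
  std::cout << std::hex << prefix << "\n";
  return 0;
}
```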

View File

@ -12,14 +12,6 @@
#include "db_stress_tool/db_stress_shared_state.h"
namespace ROCKSDB_NAMESPACE {
#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL)
#if defined(OS_SOLARIS)
__thread bool SharedState::ignore_read_error;
#else
thread_local bool SharedState::ignore_read_error;
#endif // OS_SOLARIS
#else
bool SharedState::ignore_read_error;
#endif // ROCKSDB_SUPPORT_THREAD_LOCAL
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS

View File

@ -53,15 +53,7 @@ class SharedState {
// local variable updated via sync points to keep track of errors injected
// while reading filter blocks in order to ignore the Get/MultiGet result
// for those calls
#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL)
#if defined(OS_SOLARIS)
static __thread bool ignore_read_error;
#else
static thread_local bool ignore_read_error;
#endif // OS_SOLARIS
#else
static bool ignore_read_error;
#endif // ROCKSDB_SUPPORT_THREAD_LOCAL
SharedState(Env* /*env*/, StressTest* stress_test)
: cv_(&mu_),

View File

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
#include "util/compression.h"
#ifdef GFLAGS
#include "db_stress_tool/db_stress_common.h"
#include "db_stress_tool/db_stress_compaction_filter.h"
@ -57,7 +58,8 @@ std::shared_ptr<const FilterPolicy> CreateFilterPolicy() {
StressTest::StressTest()
: cache_(NewCache(FLAGS_cache_size, FLAGS_cache_numshardbits)),
compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size)),
compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size,
FLAGS_compressed_cache_numshardbits)),
filter_policy_(CreateFilterPolicy()),
db_(nullptr),
#ifndef ROCKSDB_LITE
@ -2314,6 +2316,8 @@ void StressTest::PrintEnv() const {
static_cast<int>(FLAGS_user_timestamp_size));
fprintf(stdout, "WAL compression : %s\n",
FLAGS_wal_compression.c_str());
fprintf(stdout, "Try verify sst unique id : %d\n",
static_cast<int>(FLAGS_verify_sst_unique_id_in_manifest));
fprintf(stdout, "------------------------------------------------\n");
}
@ -2325,185 +2329,11 @@ void StressTest::Open(SharedState* shared) {
#else
(void)shared;
#endif
if (FLAGS_options_file.empty()) {
BlockBasedTableOptions block_based_options;
block_based_options.block_cache = cache_;
block_based_options.cache_index_and_filter_blocks =
FLAGS_cache_index_and_filter_blocks;
block_based_options.metadata_cache_options.top_level_index_pinning =
static_cast<PinningTier>(FLAGS_top_level_index_pinning);
block_based_options.metadata_cache_options.partition_pinning =
static_cast<PinningTier>(FLAGS_partition_pinning);
block_based_options.metadata_cache_options.unpartitioned_pinning =
static_cast<PinningTier>(FLAGS_unpartitioned_pinning);
block_based_options.block_cache_compressed = compressed_cache_;
block_based_options.checksum = checksum_type_e;
block_based_options.block_size = FLAGS_block_size;
block_based_options.reserve_table_reader_memory =
FLAGS_reserve_table_reader_memory;
block_based_options.format_version =
static_cast<uint32_t>(FLAGS_format_version);
block_based_options.index_block_restart_interval =
static_cast<int32_t>(FLAGS_index_block_restart_interval);
block_based_options.filter_policy = filter_policy_;
block_based_options.partition_filters = FLAGS_partition_filters;
block_based_options.optimize_filters_for_memory =
FLAGS_optimize_filters_for_memory;
block_based_options.detect_filter_construct_corruption =
FLAGS_detect_filter_construct_corruption;
block_based_options.index_type =
static_cast<BlockBasedTableOptions::IndexType>(FLAGS_index_type);
block_based_options.prepopulate_block_cache =
static_cast<BlockBasedTableOptions::PrepopulateBlockCache>(
FLAGS_prepopulate_block_cache);
options_.table_factory.reset(
NewBlockBasedTableFactory(block_based_options));
options_.db_write_buffer_size = FLAGS_db_write_buffer_size;
options_.write_buffer_size = FLAGS_write_buffer_size;
options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
options_.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
options_.max_write_buffer_number_to_maintain =
FLAGS_max_write_buffer_number_to_maintain;
options_.max_write_buffer_size_to_maintain =
FLAGS_max_write_buffer_size_to_maintain;
options_.memtable_prefix_bloom_size_ratio =
FLAGS_memtable_prefix_bloom_size_ratio;
options_.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
options_.disable_auto_compactions = FLAGS_disable_auto_compactions;
options_.max_background_compactions = FLAGS_max_background_compactions;
options_.max_background_flushes = FLAGS_max_background_flushes;
options_.compaction_style =
static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(FLAGS_compaction_style);
if (FLAGS_prefix_size >= 0) {
options_.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size));
}
options_.max_open_files = FLAGS_open_files;
options_.statistics = dbstats;
options_.env = db_stress_env;
options_.use_fsync = FLAGS_use_fsync;
options_.bytes_per_sync = FLAGS_bytes_per_sync;
options_.wal_bytes_per_sync = FLAGS_wal_bytes_per_sync;
options_.compaction_readahead_size = FLAGS_compaction_readahead_size;
options_.allow_mmap_reads = FLAGS_mmap_read;
options_.allow_mmap_writes = FLAGS_mmap_write;
options_.use_direct_reads = FLAGS_use_direct_reads;
options_.use_direct_io_for_flush_and_compaction =
FLAGS_use_direct_io_for_flush_and_compaction;
options_.recycle_log_file_num =
static_cast<size_t>(FLAGS_recycle_log_file_num);
options_.target_file_size_base = FLAGS_target_file_size_base;
options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
options_.max_bytes_for_level_multiplier =
FLAGS_max_bytes_for_level_multiplier;
options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
options_.level0_slowdown_writes_trigger =
FLAGS_level0_slowdown_writes_trigger;
options_.level0_file_num_compaction_trigger =
FLAGS_level0_file_num_compaction_trigger;
options_.compression = compression_type_e;
options_.bottommost_compression = bottommost_compression_type_e;
options_.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
options_.compression_opts.zstd_max_train_bytes =
FLAGS_compression_zstd_max_train_bytes;
options_.compression_opts.parallel_threads =
FLAGS_compression_parallel_threads;
options_.compression_opts.max_dict_buffer_bytes =
FLAGS_compression_max_dict_buffer_bytes;
options_.create_if_missing = true;
options_.max_manifest_file_size = FLAGS_max_manifest_file_size;
options_.inplace_update_support = FLAGS_in_place_update;
options_.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
options_.allow_concurrent_memtable_write =
FLAGS_allow_concurrent_memtable_write;
options_.experimental_mempurge_threshold =
FLAGS_experimental_mempurge_threshold;
options_.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
options_.ttl = FLAGS_compaction_ttl;
options_.enable_pipelined_write = FLAGS_enable_pipelined_write;
options_.enable_write_thread_adaptive_yield =
FLAGS_enable_write_thread_adaptive_yield;
options_.compaction_options_universal.size_ratio =
FLAGS_universal_size_ratio;
options_.compaction_options_universal.min_merge_width =
FLAGS_universal_min_merge_width;
options_.compaction_options_universal.max_merge_width =
FLAGS_universal_max_merge_width;
options_.compaction_options_universal.max_size_amplification_percent =
FLAGS_universal_max_size_amplification_percent;
options_.atomic_flush = FLAGS_atomic_flush;
options_.avoid_unnecessary_blocking_io =
FLAGS_avoid_unnecessary_blocking_io;
options_.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest;
options_.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
options_.max_write_batch_group_size_bytes =
FLAGS_max_write_batch_group_size_bytes;
options_.level_compaction_dynamic_level_bytes =
FLAGS_level_compaction_dynamic_level_bytes;
options_.file_checksum_gen_factory =
GetFileChecksumImpl(FLAGS_file_checksum_impl);
options_.track_and_verify_wals_in_manifest = true;
// Integrated BlobDB
options_.enable_blob_files = FLAGS_enable_blob_files;
options_.min_blob_size = FLAGS_min_blob_size;
options_.blob_file_size = FLAGS_blob_file_size;
options_.blob_compression_type =
StringToCompressionType(FLAGS_blob_compression_type.c_str());
options_.enable_blob_garbage_collection =
FLAGS_enable_blob_garbage_collection;
options_.blob_garbage_collection_age_cutoff =
FLAGS_blob_garbage_collection_age_cutoff;
options_.blob_garbage_collection_force_threshold =
FLAGS_blob_garbage_collection_force_threshold;
options_.blob_compaction_readahead_size =
FLAGS_blob_compaction_readahead_size;
options_.wal_compression =
StringToCompressionType(FLAGS_wal_compression.c_str());
} else {
#ifdef ROCKSDB_LITE
fprintf(stderr, "--options_file not supported in lite mode\n");
exit(1);
#else
DBOptions db_options;
std::vector<ColumnFamilyDescriptor> cf_descriptors;
Status s = LoadOptionsFromFile(FLAGS_options_file, db_stress_env,
&db_options, &cf_descriptors);
db_options.env = new DbStressEnvWrapper(db_stress_env);
if (!s.ok()) {
fprintf(stderr, "Unable to load options file %s --- %s\n",
FLAGS_options_file.c_str(), s.ToString().c_str());
exit(1);
}
options_ = Options(db_options, cf_descriptors[0].options);
#endif // ROCKSDB_LITE
}
if (FLAGS_rate_limiter_bytes_per_sec > 0) {
options_.rate_limiter.reset(NewGenericRateLimiter(
FLAGS_rate_limiter_bytes_per_sec, 1000 /* refill_period_us */,
10 /* fairness */,
FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
: RateLimiter::Mode::kWritesOnly));
}
if (FLAGS_sst_file_manager_bytes_per_sec > 0 ||
FLAGS_sst_file_manager_bytes_per_truncate > 0) {
Status status;
options_.sst_file_manager.reset(NewSstFileManager(
db_stress_env, options_.info_log, "" /* trash_dir */,
static_cast<int64_t>(FLAGS_sst_file_manager_bytes_per_sec),
true /* delete_existing_trash */, &status,
0.25 /* max_trash_db_ratio */,
FLAGS_sst_file_manager_bytes_per_truncate));
if (!status.ok()) {
fprintf(stderr, "SstFileManager creation failed: %s\n",
status.ToString().c_str());
exit(1);
}
if (!InitializeOptionsFromFile(options_)) {
InitializeOptionsFromFlags(cache_, compressed_cache_, filter_policy_,
options_);
}
InitializeOptionsGeneral(cache_, compressed_cache_, filter_policy_, options_);
if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) {
fprintf(stderr,
@ -2515,40 +2345,6 @@ void StressTest::Open(SharedState* shared) {
"WARNING: prefix_size is non-zero but "
"memtablerep != prefix_hash\n");
}
switch (FLAGS_rep_factory) {
case kSkipList:
// no need to do anything
break;
#ifndef ROCKSDB_LITE
case kHashSkipList:
options_.memtable_factory.reset(NewHashSkipListRepFactory(10000));
break;
case kVectorRep:
options_.memtable_factory.reset(new VectorRepFactory());
break;
#else
default:
fprintf(stderr,
"RocksdbLite only supports skip list mem table. Skip "
"--rep_factory\n");
#endif // ROCKSDB_LITE
}
if (FLAGS_use_full_merge_v1) {
options_.merge_operator = MergeOperators::CreateDeprecatedPutOperator();
} else {
options_.merge_operator = MergeOperators::CreatePutOperator();
}
if (FLAGS_enable_compaction_filter) {
options_.compaction_filter_factory =
std::make_shared<DbStressCompactionFilterFactory>();
}
options_.table_properties_collector_factories.emplace_back(
std::make_shared<DbStressTablePropertiesCollectorFactory>());
options_.best_efforts_recovery = FLAGS_best_efforts_recovery;
options_.paranoid_file_checks = FLAGS_paranoid_file_checks;
options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error;
if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
FLAGS_allow_setting_blob_options_dynamically) &&
@ -2576,10 +2372,6 @@ void StressTest::Open(SharedState* shared) {
Status s;
if (FLAGS_user_timestamp_size > 0) {
CheckAndSetOptionsForUserTimestamp();
}
if (FLAGS_ttl == -1) {
std::vector<std::string> existing_column_families;
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
@ -2628,13 +2420,14 @@ void StressTest::Open(SharedState* shared) {
cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
column_family_names_.push_back(name);
}
options_.listeners.clear();
#ifndef ROCKSDB_LITE
options_.listeners.emplace_back(new DbStressListener(
FLAGS_db, options_.db_paths, cf_descriptors, db_stress_listener_env));
#endif // !ROCKSDB_LITE
RegisterAdditionalListeners();
options_.create_missing_column_families = true;
if (!FLAGS_use_txn) {
// Determine whether we need to ingest file metadata write failures
// during DB reopen. If it does, enable it.
@ -2940,7 +2733,7 @@ void StressTest::Reopen(ThreadState* thread) {
}
}
void StressTest::CheckAndSetOptionsForUserTimestamp() {
void CheckAndSetOptionsForUserTimestamp(Options& options) {
assert(FLAGS_user_timestamp_size > 0);
const Comparator* const cmp = test::BytewiseComparatorWithU64TsWrapper();
assert(cmp);
@ -2998,7 +2791,295 @@ void StressTest::CheckAndSetOptionsForUserTimestamp() {
fprintf(stderr, "Bulk loading may not support timestamp yet.\n");
exit(1);
}
options_.comparator = cmp;
options.comparator = cmp;
}
bool InitializeOptionsFromFile(Options& options) {
#ifndef ROCKSDB_LITE
DBOptions db_options;
std::vector<ColumnFamilyDescriptor> cf_descriptors;
if (!FLAGS_options_file.empty()) {
Status s = LoadOptionsFromFile(FLAGS_options_file, db_stress_env,
&db_options, &cf_descriptors);
if (!s.ok()) {
fprintf(stderr, "Unable to load options file %s --- %s\n",
FLAGS_options_file.c_str(), s.ToString().c_str());
exit(1);
}
db_options.env = new DbStressEnvWrapper(db_stress_env);
options = Options(db_options, cf_descriptors[0].options);
return true;
}
#else
(void)options;
fprintf(stderr, "--options_file not supported in lite mode\n");
exit(1);
#endif  // !ROCKSDB_LITE
return false;
}
void InitializeOptionsFromFlags(
const std::shared_ptr<Cache>& cache,
const std::shared_ptr<Cache>& block_cache_compressed,
const std::shared_ptr<const FilterPolicy>& filter_policy,
Options& options) {
BlockBasedTableOptions block_based_options;
block_based_options.block_cache = cache;
block_based_options.cache_index_and_filter_blocks =
FLAGS_cache_index_and_filter_blocks;
block_based_options.metadata_cache_options.top_level_index_pinning =
static_cast<PinningTier>(FLAGS_top_level_index_pinning);
block_based_options.metadata_cache_options.partition_pinning =
static_cast<PinningTier>(FLAGS_partition_pinning);
block_based_options.metadata_cache_options.unpartitioned_pinning =
static_cast<PinningTier>(FLAGS_unpartitioned_pinning);
block_based_options.block_cache_compressed = block_cache_compressed;
block_based_options.checksum = checksum_type_e;
block_based_options.block_size = FLAGS_block_size;
block_based_options.cache_usage_options.options_overrides.insert(
{CacheEntryRole::kCompressionDictionaryBuildingBuffer,
{/*.charged = */ FLAGS_charge_compression_dictionary_building_buffer
? CacheEntryRoleOptions::Decision::kEnabled
: CacheEntryRoleOptions::Decision::kDisabled}});
block_based_options.cache_usage_options.options_overrides.insert(
{CacheEntryRole::kFilterConstruction,
{/*.charged = */ FLAGS_charge_filter_construction
? CacheEntryRoleOptions::Decision::kEnabled
: CacheEntryRoleOptions::Decision::kDisabled}});
block_based_options.cache_usage_options.options_overrides.insert(
{CacheEntryRole::kBlockBasedTableReader,
{/*.charged = */ FLAGS_charge_table_reader
? CacheEntryRoleOptions::Decision::kEnabled
: CacheEntryRoleOptions::Decision::kDisabled}});
block_based_options.format_version =
static_cast<uint32_t>(FLAGS_format_version);
block_based_options.index_block_restart_interval =
static_cast<int32_t>(FLAGS_index_block_restart_interval);
block_based_options.filter_policy = filter_policy;
block_based_options.partition_filters = FLAGS_partition_filters;
block_based_options.optimize_filters_for_memory =
FLAGS_optimize_filters_for_memory;
block_based_options.detect_filter_construct_corruption =
FLAGS_detect_filter_construct_corruption;
block_based_options.index_type =
static_cast<BlockBasedTableOptions::IndexType>(FLAGS_index_type);
block_based_options.prepopulate_block_cache =
static_cast<BlockBasedTableOptions::PrepopulateBlockCache>(
FLAGS_prepopulate_block_cache);
options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
options.db_write_buffer_size = FLAGS_db_write_buffer_size;
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge =
FLAGS_min_write_buffer_number_to_merge;
options.max_write_buffer_number_to_maintain =
FLAGS_max_write_buffer_number_to_maintain;
options.max_write_buffer_size_to_maintain =
FLAGS_max_write_buffer_size_to_maintain;
options.memtable_prefix_bloom_size_ratio =
FLAGS_memtable_prefix_bloom_size_ratio;
options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
options.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style =
static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(FLAGS_compaction_style);
if (FLAGS_prefix_size >= 0) {
options.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
}
options.max_open_files = FLAGS_open_files;
options.statistics = dbstats;
options.env = db_stress_env;
options.use_fsync = FLAGS_use_fsync;
options.compaction_readahead_size = FLAGS_compaction_readahead_size;
options.allow_mmap_reads = FLAGS_mmap_read;
options.allow_mmap_writes = FLAGS_mmap_write;
options.use_direct_reads = FLAGS_use_direct_reads;
options.use_direct_io_for_flush_and_compaction =
FLAGS_use_direct_io_for_flush_and_compaction;
options.recycle_log_file_num =
static_cast<size_t>(FLAGS_recycle_log_file_num);
options.target_file_size_base = FLAGS_target_file_size_base;
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier;
options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
options.level0_slowdown_writes_trigger = FLAGS_level0_slowdown_writes_trigger;
options.level0_file_num_compaction_trigger =
FLAGS_level0_file_num_compaction_trigger;
options.compression = compression_type_e;
options.bottommost_compression = bottommost_compression_type_e;
options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
options.compression_opts.zstd_max_train_bytes =
FLAGS_compression_zstd_max_train_bytes;
options.compression_opts.parallel_threads =
FLAGS_compression_parallel_threads;
options.compression_opts.max_dict_buffer_bytes =
FLAGS_compression_max_dict_buffer_bytes;
if (ZSTD_FinalizeDictionarySupported()) {
options.compression_opts.use_zstd_dict_trainer =
FLAGS_compression_use_zstd_dict_trainer;
} else if (!FLAGS_compression_use_zstd_dict_trainer) {
fprintf(
stderr,
"WARNING: use_zstd_dict_trainer is false but zstd finalizeDictionary "
"cannot be used because ZSTD 1.4.5+ is not linked with the binary."
" zstd dictionary trainer will be used.\n");
}
options.max_manifest_file_size = FLAGS_max_manifest_file_size;
options.inplace_update_support = FLAGS_in_place_update;
options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
options.allow_concurrent_memtable_write =
FLAGS_allow_concurrent_memtable_write;
options.experimental_mempurge_threshold =
FLAGS_experimental_mempurge_threshold;
options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
options.ttl = FLAGS_compaction_ttl;
options.enable_pipelined_write = FLAGS_enable_pipelined_write;
options.enable_write_thread_adaptive_yield =
FLAGS_enable_write_thread_adaptive_yield;
options.compaction_options_universal.size_ratio = FLAGS_universal_size_ratio;
options.compaction_options_universal.min_merge_width =
FLAGS_universal_min_merge_width;
options.compaction_options_universal.max_merge_width =
FLAGS_universal_max_merge_width;
options.compaction_options_universal.max_size_amplification_percent =
FLAGS_universal_max_size_amplification_percent;
options.atomic_flush = FLAGS_atomic_flush;
options.avoid_unnecessary_blocking_io = FLAGS_avoid_unnecessary_blocking_io;
options.write_dbid_to_manifest = FLAGS_write_dbid_to_manifest;
options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
options.max_write_batch_group_size_bytes =
FLAGS_max_write_batch_group_size_bytes;
options.level_compaction_dynamic_level_bytes =
FLAGS_level_compaction_dynamic_level_bytes;
options.track_and_verify_wals_in_manifest = true;
options.verify_sst_unique_id_in_manifest =
FLAGS_verify_sst_unique_id_in_manifest;
// Integrated BlobDB
options.enable_blob_files = FLAGS_enable_blob_files;
options.min_blob_size = FLAGS_min_blob_size;
options.blob_file_size = FLAGS_blob_file_size;
options.blob_compression_type =
StringToCompressionType(FLAGS_blob_compression_type.c_str());
options.enable_blob_garbage_collection = FLAGS_enable_blob_garbage_collection;
options.blob_garbage_collection_age_cutoff =
FLAGS_blob_garbage_collection_age_cutoff;
options.blob_garbage_collection_force_threshold =
FLAGS_blob_garbage_collection_force_threshold;
options.blob_compaction_readahead_size = FLAGS_blob_compaction_readahead_size;
options.wal_compression =
StringToCompressionType(FLAGS_wal_compression.c_str());
switch (FLAGS_rep_factory) {
case kSkipList:
// no need to do anything
break;
#ifndef ROCKSDB_LITE
case kHashSkipList:
options.memtable_factory.reset(NewHashSkipListRepFactory(10000));
break;
case kVectorRep:
options.memtable_factory.reset(new VectorRepFactory());
break;
#else
default:
fprintf(stderr,
"RocksdbLite only supports skip list mem table. Skip "
"--rep_factory\n");
#endif // ROCKSDB_LITE
}
if (FLAGS_use_full_merge_v1) {
options.merge_operator = MergeOperators::CreateDeprecatedPutOperator();
} else {
options.merge_operator = MergeOperators::CreatePutOperator();
}
if (FLAGS_enable_compaction_filter) {
options.compaction_filter_factory =
std::make_shared<DbStressCompactionFilterFactory>();
}
options.best_efforts_recovery = FLAGS_best_efforts_recovery;
options.paranoid_file_checks = FLAGS_paranoid_file_checks;
options.fail_if_options_file_error = FLAGS_fail_if_options_file_error;
if (FLAGS_user_timestamp_size > 0) {
CheckAndSetOptionsForUserTimestamp(options);
}
}
void InitializeOptionsGeneral(
const std::shared_ptr<Cache>& cache,
const std::shared_ptr<Cache>& block_cache_compressed,
const std::shared_ptr<const FilterPolicy>& filter_policy,
Options& options) {
options.create_missing_column_families = true;
options.create_if_missing = true;
if (!options.statistics) {
options.statistics = dbstats;
}
if (options.env == Options().env) {
options.env = db_stress_env;
}
assert(options.table_factory);
auto table_options =
options.table_factory->GetOptions<BlockBasedTableOptions>();
if (table_options) {
if (FLAGS_cache_size > 0) {
table_options->block_cache = cache;
}
if (!table_options->block_cache_compressed &&
FLAGS_compressed_cache_size > 0) {
table_options->block_cache_compressed = block_cache_compressed;
}
if (!table_options->filter_policy) {
table_options->filter_policy = filter_policy;
}
}
// TODO: row_cache, thread-pool IO priority, CPU priority.
if (!options.rate_limiter) {
if (FLAGS_rate_limiter_bytes_per_sec > 0) {
options.rate_limiter.reset(NewGenericRateLimiter(
FLAGS_rate_limiter_bytes_per_sec, 1000 /* refill_period_us */,
10 /* fairness */,
FLAGS_rate_limit_bg_reads ? RateLimiter::Mode::kReadsOnly
: RateLimiter::Mode::kWritesOnly));
}
}
if (!options.file_checksum_gen_factory) {
options.file_checksum_gen_factory =
GetFileChecksumImpl(FLAGS_file_checksum_impl);
}
if (FLAGS_sst_file_manager_bytes_per_sec > 0 ||
FLAGS_sst_file_manager_bytes_per_truncate > 0) {
Status status;
options.sst_file_manager.reset(NewSstFileManager(
db_stress_env, options.info_log, "" /* trash_dir */,
static_cast<int64_t>(FLAGS_sst_file_manager_bytes_per_sec),
true /* delete_existing_trash */, &status,
0.25 /* max_trash_db_ratio */,
FLAGS_sst_file_manager_bytes_per_truncate));
if (!status.ok()) {
fprintf(stderr, "SstFileManager creation failed: %s\n",
status.ToString().c_str());
exit(1);
}
}
options.table_properties_collector_factories.emplace_back(
std::make_shared<DbStressTablePropertiesCollectorFactory>());
}
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS

View File

@ -223,8 +223,6 @@ class StressTest {
void Reopen(ThreadState* thread);
void CheckAndSetOptionsForUserTimestamp();
virtual void RegisterAdditionalListeners() {}
#ifndef ROCKSDB_LITE
@ -259,5 +257,49 @@ class StressTest {
bool is_db_stopped_;
};
// Load options from OPTIONS file and populate `options`.
extern bool InitializeOptionsFromFile(Options& options);
// Initialize `options` using command line arguments.
// When this function is called, `cache`, `block_cache_compressed`,
// `filter_policy` have all been initialized. Therefore, we just pass them as
// input arguments.
extern void InitializeOptionsFromFlags(
const std::shared_ptr<Cache>& cache,
const std::shared_ptr<Cache>& block_cache_compressed,
const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options);
// Initialize `options` on which `InitializeOptionsFromFile()` and
// `InitializeOptionsFromFlags()` have both been called already.
// There are two cases.
// Case 1: OPTIONS file is not specified. Command line arguments have been used
// to initialize `options`. InitializeOptionsGeneral() will use
// `cache`, `block_cache_compressed` and `filter_policy` to initialize
// corresponding fields of `options`. InitializeOptionsGeneral() will
// also set up other fields of `options` so that stress test can run.
// Examples include `create_if_missing` and
// `create_missing_column_families`, etc.
// Case 2: OPTIONS file is specified. It is possible that, after loading from
// the given OPTIONS files, some shared object fields are still not
// initialized because they are not set in the OPTIONS file. In this
// case, if command line arguments indicate that the user wants to set
// up such shared objects, e.g. block cache, compressed block cache,
// row cache, filter policy, then InitializeOptionsGeneral() will honor
// the user's choice, thus passing `cache`, `block_cache_compressed`,
// `filter_policy` as input arguments.
//
// InitializeOptionsGeneral() must not overwrite fields of `options` loaded
// from OPTIONS file.
extern void InitializeOptionsGeneral(
const std::shared_ptr<Cache>& cache,
const std::shared_ptr<Cache>& block_cache_compressed,
const std::shared_ptr<const FilterPolicy>& filter_policy, Options& options);
// If no OPTIONS file is specified, set up `options` so that we can test
// user-defined timestamp which requires `-user_timestamp_size=8`.
// This function also checks for known (currently) incompatible features with
// user-defined timestamp.
extern void CheckAndSetOptionsForUserTimestamp(Options& options);
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS
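Taken together, the comments above imply the call order used in StressTest::Open(): try the OPTIONS file first, fall back to flags, then apply the general fixups. A stub sketch of that flow (Options here is a stand-in struct; the real functions take rocksdb::Options plus the shared cache, compressed cache, and filter policy objects):

```
// Stub sketch of the three-phase options initialization order.
#include <iostream>
#include <string>

struct Options {
  std::string source = "unset";
};

bool InitializeOptionsFromFile(Options& options) {
  const bool have_options_file = false;  // pretend --options_file was empty
  if (have_options_file) {
    options.source = "OPTIONS file";
    return true;
  }
  return false;
}

void InitializeOptionsFromFlags(Options& options) { options.source = "flags"; }

void InitializeOptionsGeneral(Options& options) {
  // Fill only what the file/flags left unset (e.g. create_if_missing);
  // must not overwrite fields loaded from the OPTIONS file.
  (void)options;
}

int main() {
  Options options;
  if (!InitializeOptionsFromFile(options)) {
    InitializeOptionsFromFlags(options);
  }
  InitializeOptionsGeneral(options);
  std::cout << "options initialized from: " << options.source << "\n";
  return 0;
}
```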

View File

@ -232,7 +232,7 @@ GEM
jekyll-seo-tag (~> 2.1)
minitest (5.15.0)
multipart-post (2.1.1)
nokogiri (1.13.4)
nokogiri (1.13.6)
mini_portile2 (~> 2.8.0)
racc (~> 1.4)
octokit (4.22.0)

env/env_posix.cc

@ -134,8 +134,8 @@ class PosixClock : public SystemClock {
const char* NickName() const override { return kClassName(); }
uint64_t NowMicros() override {
struct timeval tv;
gettimeofday(&tv, nullptr);
port::TimeVal tv;
port::GetTimeOfDay(&tv, nullptr);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
@ -200,7 +200,7 @@ class PosixClock : public SystemClock {
dummy.reserve(maxsize);
dummy.resize(maxsize);
char* p = &dummy[0];
localtime_r(&seconds, &t);
port::LocalTimeR(&seconds, &t);
snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
return dummy;

env/env_test.cc

@ -1604,9 +1604,9 @@ class TestLogger : public Logger {
if (new_format[0] == '[') {
// "[DEBUG] "
ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(struct timeval))));
ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(port::TimeVal))));
} else {
ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(struct timeval))));
ASSERT_TRUE(n <= 48 + (512 - static_cast<int>(sizeof(port::TimeVal))));
}
va_end(backup_ap);
}
@ -1674,9 +1674,9 @@ class TestLogger2 : public Logger {
va_copy(backup_ap, ap);
int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
// 48 bytes for extra information + bytes allocated
ASSERT_TRUE(
n <= 48 + static_cast<int>(max_log_size_ - sizeof(struct timeval)));
ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(struct timeval)));
ASSERT_TRUE(n <=
48 + static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
ASSERT_TRUE(n > static_cast<int>(max_log_size_ - sizeof(port::TimeVal)));
va_end(backup_ap);
}
}


@ -338,6 +338,51 @@ IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset,
return s;
}
IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
FSReadRequest& req, const IOOptions& opts,
std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
// Create a callback and populate info.
auto read_async_callback =
std::bind(&FSRandomAccessFileTracingWrapper::ReadAsyncCallback, this,
std::placeholders::_1, std::placeholders::_2);
ReadAsyncCallbackInfo* read_async_cb_info = new ReadAsyncCallbackInfo;
read_async_cb_info->cb_ = cb;
read_async_cb_info->cb_arg_ = cb_arg;
read_async_cb_info->start_time_ = clock_->NowNanos();
read_async_cb_info->file_op_ = __func__;
IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
read_async_cb_info, io_handle, del_fn, dbg);
if (!s.ok()) {
delete read_async_cb_info;
}
return s;
}
void FSRandomAccessFileTracingWrapper::ReadAsyncCallback(
const FSReadRequest& req, void* cb_arg) {
ReadAsyncCallbackInfo* read_async_cb_info =
static_cast<ReadAsyncCallbackInfo*>(cb_arg);
assert(read_async_cb_info);
assert(read_async_cb_info->cb_);
uint64_t elapsed = clock_->NowNanos() - read_async_cb_info->start_time_;
uint64_t io_op_data = 0;
io_op_data |= (1 << IOTraceOp::kIOLen);
io_op_data |= (1 << IOTraceOp::kIOOffset);
IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data,
read_async_cb_info->file_op_, elapsed,
req.status.ToString(), file_name_, req.result.size(),
req.offset);
io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/);
// call the underlying callback.
read_async_cb_info->cb_(req, read_async_cb_info->cb_arg_);
delete read_async_cb_info;
}
IOStatus FSWritableFileTracingWrapper::Append(const Slice& data,
const IOOptions& options,
IODebugContext* dbg) {


@ -228,11 +228,25 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileOwnerWrapper {
IOStatus InvalidateCache(size_t offset, size_t length) override;
IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
std::function<void(const FSReadRequest&, void*)> cb,
void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
IODebugContext* dbg) override;
void ReadAsyncCallback(const FSReadRequest& req, void* cb_arg);
private:
std::shared_ptr<IOTracer> io_tracer_;
SystemClock* clock_;
// Stores file name instead of full path.
std::string file_name_;
struct ReadAsyncCallbackInfo {
uint64_t start_time_;
std::function<void(const FSReadRequest&, void*)> cb_;
void* cb_arg_;
std::string file_op_;
};
};
// The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage

env/mock_env.cc

@ -509,13 +509,13 @@ class TestMemLogger : public Logger {
char* p = base;
char* limit = base + bufsize;
struct timeval now_tv;
gettimeofday(&now_tv, nullptr);
port::TimeVal now_tv;
port::GetTimeOfDay(&now_tv, nullptr);
const time_t seconds = now_tv.tv_sec;
struct tm t;
memset(&t, 0, sizeof(t));
struct tm* ret __attribute__((__unused__));
ret = localtime_r(&seconds, &t);
ret = port::LocalTimeR(&seconds, &t);
assert(ret);
p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ",
t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,


@ -194,34 +194,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
}
}
// If async_read = true:
// async_read is enabled in case of sequential reads. So when
// buffers are switched, we clear the curr_ buffer as we assume the data has
// been consumed because of sequential reads.
//
// Scenarios for prefetching asynchronously:
// Case1: If both buffers are empty, prefetch n bytes
// synchronously in curr_
// and prefetch readahead_size_/2 async in second buffer.
// Case2: If second buffer has partial or full data, make it current and
// prefetch readahead_size_/2 async in second buffer. In case of
// partial data, prefetch remaining bytes from size n synchronously to
// fulfill the requested bytes request.
// Case3: If curr_ has partial data, prefetch remaining bytes from size n
// synchronously in curr_ to fulfill the requested bytes request and
// prefetch readahead_size_/2 bytes async in second buffer.
// Case4: If data is in both buffers, copy requested data from curr_ and second
// buffer to third buffer. If all requested bytes have been copied, do
// the asynchronous prefetching in second buffer.
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t length,
size_t readahead_size,
Env::IOPriority rate_limiter_priority,
bool& copy_to_third_buffer) {
if (!enable_) {
return Status::OK();
}
void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
if (async_read_in_progress_ && fs_ != nullptr) {
// Wait for prefetch data to complete.
// No mutex is needed as PrefetchAsyncCallback updates the result in second
@ -242,11 +215,6 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
del_fn_ = nullptr;
}
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
Status s;
size_t prefetch_size = length + readahead_size;
size_t alignment = reader->file()->GetRequiredBufferAlignment();
// Index of second buffer.
uint32_t second = curr_ ^ 1;
@ -273,17 +241,55 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
// outdated data and switch the buffers.
bufs_[curr_].buffer_.Clear();
curr_ = curr_ ^ 1;
second = curr_ ^ 1;
}
// After the swap, check whether all the requested bytes are in curr_; if so,
// it will go for async prefetching only.
}
// If async_read = true:
// async_read is enabled in case of sequential reads. So when
// buffers are switched, we clear the curr_ buffer as we assume the data has
// been consumed because of sequential reads.
//
// Scenarios for prefetching asynchronously:
// Case1: If both buffers are empty, prefetch n bytes
// synchronously in curr_
// and prefetch readahead_size_/2 async in second buffer.
// Case2: If second buffer has partial or full data, make it current and
// prefetch readahead_size_/2 async in second buffer. In case of
// partial data, prefetch remaining bytes from size n synchronously to
// fulfill the requested bytes request.
// Case3: If curr_ has partial data, prefetch remaining bytes from size n
// synchronously in curr_ to fulfill the requested bytes request and
// prefetch readahead_size_/2 bytes async in second buffer.
// Case4: If data is in both buffers, copy requested data from curr_ and second
// buffer to third buffer. If all requested bytes have been copied, do
// the asynchronous prefetching in second buffer.
Status FilePrefetchBuffer::PrefetchAsyncInternal(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
bool& copy_to_third_buffer) {
if (!enable_) {
return Status::OK();
}
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
PollAndUpdateBuffersIfNeeded(offset);
// If all the requested bytes are in curr_, it will go for async prefetching
// only.
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
offset + length <=
bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
offset += length;
length = 0;
prefetch_size = readahead_size;
}
Status s;
size_t prefetch_size = length + readahead_size;
size_t alignment = reader->file()->GetRequiredBufferAlignment();
// Index of second buffer.
uint32_t second = curr_ ^ 1;
// Data is overlapping i.e. some of the data is in curr_ buffer and remaining
// in second buffer.
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
@ -315,9 +321,8 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
prefetch_size = length + readahead_size;
}
// Update second again if swap happened.
second = curr_ ^ 1;
size_t _offset = static_cast<size_t>(offset);
second = curr_ ^ 1;
// offset and size alignment for curr_ buffer with synchronous prefetching
uint64_t rounddown_start1 = Rounddown(_offset, alignment);
@ -442,12 +447,23 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
bool FilePrefetchBuffer::TryReadFromCacheAsync(
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status,
Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */
) {
Env::IOPriority rate_limiter_priority) {
assert(async_io_);
if (track_min_offset_ && offset < min_offset_read_) {
min_offset_read_ = static_cast<size_t>(offset);
}
if (!enable_ || (offset < bufs_[curr_].offset_)) {
if (!enable_) {
return false;
}
// In case of async_io_, offset can be less than bufs_[curr_].offset_ because
// reads may not be sequential: PrefetchAsync can be called for any block, and
// RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for the
// requested bytes.
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_ &&
prev_len_ != 0) {
return false;
}
@ -459,35 +475,25 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
// If readahead is not enabled: return false.
TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
&readahead_size_);
if (offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
if (offset < bufs_[curr_].offset_ ||
offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
if (readahead_size_ > 0) {
Status s;
assert(reader != nullptr);
assert(max_readahead_size_ >= readahead_size_);
if (for_compaction) {
s = Prefetch(opts, reader, offset, std::max(n, readahead_size_),
rate_limiter_priority);
} else {
if (implicit_auto_readahead_) {
if (!IsEligibleForPrefetch(offset, n)) {
// Ignore status as Prefetch is not called.
s.PermitUncheckedError();
return false;
}
}
// async prefetching is enabled if it's implicit_auto_readahead_ or
// explicit readahead_size_ is passed along with ReadOptions.async_io =
// true.
if (async_io_) {
// Prefetch n + readahead_size_/2 synchronously as remaining
// readahead_size_/2 will be prefetched asynchronously.
s = PrefetchAsync(opts, reader, offset, n, readahead_size_ / 2,
rate_limiter_priority, copy_to_third_buffer);
} else {
s = Prefetch(opts, reader, offset, n + readahead_size_,
rate_limiter_priority);
if (implicit_auto_readahead_) {
if (!IsEligibleForPrefetch(offset, n)) {
// Ignore status as Prefetch is not called.
s.PermitUncheckedError();
return false;
}
}
// Prefetch n + readahead_size_/2 synchronously as remaining
// readahead_size_/2 will be prefetched asynchronously.
s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
rate_limiter_priority, copy_to_third_buffer);
if (!s.ok()) {
if (status) {
*status = s;
@ -544,4 +550,92 @@ void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
bufs_[index].buffer_.Size(current_size + req.result.size());
}
}
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
RandomAccessFileReader* reader,
uint64_t offset, size_t n,
Env::IOPriority rate_limiter_priority,
Slice* result) {
assert(reader != nullptr);
if (!enable_) {
return Status::NotSupported();
}
TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
PollAndUpdateBuffersIfNeeded(offset);
// Index of second buffer.
uint32_t second = curr_ ^ 1;
// Since PrefetchAsync can be called for non-sequential reads, offset can be
// less than the buffers' offsets. In that case it clears the buffer and
// prefetches that block.
if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
bufs_[curr_].buffer_.Clear();
}
// All requested bytes are already in the curr_ buffer. So no need to Read
// again.
if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
*result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
return Status::OK();
}
Status s;
size_t alignment = reader->file()->GetRequiredBufferAlignment();
// TODO akanksha: Handle the scenario if data is overlapping in 2 buffers.
// Currently, it covers 2 scenarios: either one buffer (curr_) has no data, or
// it has partial data. It ignores the contents of the second buffer
// (overlapping data in 2 buffers) and sends the request to re-read that data.
// Clear the second buffer in order to do asynchronous prefetching.
bufs_[second].buffer_.Clear();
size_t offset_to_read = static_cast<size_t>(offset);
uint64_t rounddown_start = 0;
uint64_t roundup_end = 0;
if (bufs_[curr_].buffer_.CurrentSize() == 0) {
// Prefetch full data.
rounddown_start = Rounddown(offset_to_read, alignment);
roundup_end = Roundup(offset_to_read + n, alignment);
} else {
// Prefetch remaining data.
size_t rem_length = n - (bufs_[curr_].buffer_.CurrentSize() -
(offset - bufs_[curr_].offset_));
rounddown_start = bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
roundup_end = Roundup(rounddown_start + rem_length, alignment);
}
uint64_t roundup_len = roundup_end - rounddown_start;
assert(roundup_len >= alignment);
assert(roundup_len % alignment == 0);
uint64_t chunk_len = 0;
CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, second, false,
chunk_len);
// Update the buffer offset.
bufs_[second].offset_ = rounddown_start;
assert(roundup_len >= chunk_len);
size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
s = ReadAsync(opts, reader, rate_limiter_priority, read_len, chunk_len,
rounddown_start, second);
if (!s.ok()) {
return s;
}
// Update the read pattern so that TryReadFromCacheAsync can be called to
// Poll the data. It will return without polling if blocks are not sequential.
UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
prev_len_ = 0;
return Status::TryAgain();
}
} // namespace ROCKSDB_NAMESPACE
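The return at the end of PrefetchAsync() above fixes its caller-visible contract: data already buffered yields OK with `result` pointing into curr_, otherwise an async read is queued into the second buffer and TryAgain comes back. A hedged caller-side sketch follows; the header paths are assumed internal RocksDB paths, and the call site is illustrative.
```
#include "file/file_prefetch_buffer.h"       // assumed internal header path
#include "file/random_access_file_reader.h"  // assumed internal header path

using namespace ROCKSDB_NAMESPACE;

Status DemoPrefetchAsync(FilePrefetchBuffer& buf, const IOOptions& opts,
                         RandomAccessFileReader* reader, uint64_t offset,
                         size_t n) {
  Slice result;
  Status s =
      buf.PrefetchAsync(opts, reader, offset, n, Env::IO_TOTAL, &result);
  if (s.ok()) {
    // All n bytes were already in curr_; `result` points into that buffer.
  } else if (s.IsTryAgain()) {
    // An async read was submitted into the second buffer; a later
    // TryReadFromCacheAsync() call polls for completion and returns the data.
  }
  // Otherwise: NotSupported (prefetching disabled) or an I/O error.
  return s;
}
```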


@ -89,6 +89,7 @@ class FilePrefetchBuffer {
// while curr_ is being consumed. If data is overlapping in two buffers, it is
// copied to the third buffer to return one continuous buffer.
bufs_.resize(3);
(void)async_io_;
}
~FilePrefetchBuffer() {
@ -131,10 +132,21 @@ class FilePrefetchBuffer {
uint64_t offset, size_t n,
Env::IOPriority rate_limiter_priority);
// Request for reading the data from a file asynchronously.
// If data already exists in the buffer, result will be updated.
// reader : the file reader.
// offset : the file offset to start reading from.
// n : the number of bytes to read.
// rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
// bypass.
// result : if data already exists in the buffer, result will
// be updated with the data.
//
// If the data already exists in the buffer, it will return Status::OK;
// otherwise it will send an asynchronous request and return Status::TryAgain.
Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
uint64_t offset, size_t length, size_t readahead_size,
Env::IOPriority rate_limiter_priority,
bool& copy_to_third_buffer);
uint64_t offset, size_t n,
Env::IOPriority rate_limiter_priority, Slice* result);
// Tries returning the data for a file read from this buffer if that data is
// in the buffer.
@ -159,8 +171,7 @@ class FilePrefetchBuffer {
bool TryReadFromCacheAsync(const IOOptions& opts,
RandomAccessFileReader* reader, uint64_t offset,
size_t n, Slice* result, Status* status,
Env::IOPriority rate_limiter_priority,
bool for_compaction /* = false */);
Env::IOPriority rate_limiter_priority);
// The minimum `offset` ever passed to TryReadFromCache(). This will only be
// tracked if track_min_offset = true.
@ -207,22 +218,6 @@ class FilePrefetchBuffer {
}
}
bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
// Prefetch only if this read is sequential otherwise reset readahead_size_
// to initial value.
if (!IsBlockSequential(offset)) {
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
ResetValues();
return false;
}
num_file_reads_++;
if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) {
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
return false;
}
return true;
}
// Callback function passed to underlying FS in case of asynchronous reads.
void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg);
@ -234,6 +229,17 @@ class FilePrefetchBuffer {
size_t roundup_len, size_t index, bool refit_tail,
uint64_t& chunk_len);
// Calls the Poll API if there is any pending asynchronous request. It then
// checks if data is in any buffer, clears the outdated data, and swaps the
// buffers if required.
void PollAndUpdateBuffersIfNeeded(uint64_t offset);
Status PrefetchAsyncInternal(const IOOptions& opts,
RandomAccessFileReader* reader, uint64_t offset,
size_t length, size_t readahead_size,
Env::IOPriority rate_limiter_priority,
bool& copy_to_third_buffer);
Status Read(const IOOptions& opts, RandomAccessFileReader* reader,
Env::IOPriority rate_limiter_priority, uint64_t read_len,
uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
@ -256,6 +262,22 @@ class FilePrefetchBuffer {
readahead_size_ = initial_auto_readahead_size_;
}
bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
// Prefetch only if this read is sequential otherwise reset readahead_size_
// to initial value.
if (!IsBlockSequential(offset)) {
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
ResetValues();
return false;
}
num_file_reads_++;
if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) {
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
return false;
}
return true;
}
std::vector<BufferInfo> bufs_;
// curr_ represents the index for bufs_ indicating which buffer is being
// consumed currently.
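IsEligibleForPrefetch() above is the entire implicit-readahead gate: prefetch only once reads have looked sequential for more than kMinNumFileReadsToStartAutoReadahead reads. A standalone restatement of the heuristic is sketched below; the threshold value and the exact sequentiality test are assumptions based on the code above, not taken from this diff.
```
#include <cstddef>
#include <cstdint>

// Assumed threshold; the real constant lives in FilePrefetchBuffer.
constexpr uint64_t kMinNumFileReadsToStartAutoReadahead = 2;

struct ReadaheadGate {
  uint64_t prev_offset = 0;
  size_t prev_len = 0;  // prev_len == 0 marks "no read seen yet"
  uint64_t num_file_reads = 0;

  bool IsBlockSequential(uint64_t offset) const {
    return prev_len == 0 || offset == prev_offset + prev_len;
  }

  void Update(uint64_t offset, size_t n) {
    prev_offset = offset;
    prev_len = n;
  }

  bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
    if (!IsBlockSequential(offset)) {
      num_file_reads = 0;  // mirrors ResetValues(): restart the streak
      Update(offset, n);
      return false;
    }
    Update(offset, n);
    return ++num_file_reads > kMinNumFileReadsToStartAutoReadahead;
  }
};
```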

View File

@ -78,6 +78,8 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
(!opts.timeout.count() || ro.io_timeout < opts.timeout)) {
opts.timeout = ro.io_timeout;
}
opts.rate_limiter_priority = ro.rate_limiter_priority;
return IOStatus::OK();
}

View File

@ -5,6 +5,9 @@
#include "db/db_test_util.h"
#include "test_util/sync_point.h"
#ifdef GFLAGS
#include "tools/io_tracer_parser_tool.h"
#endif
namespace ROCKSDB_NAMESPACE {
@ -534,15 +537,24 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
* initially (2 more data blocks).
*/
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1000));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1004)); // Prefetch Data
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1008));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1015)); // Prefetch Data
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019));
ASSERT_TRUE(iter->Valid());
// Missed 2 blocks but they are already in buffer so no reset.
iter->Seek(BuildKey(103)); // Already in buffer.
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1033)); // Prefetch Data
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 3);
fs->ClearPrefetchCount();
@ -558,10 +570,15 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
*/
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1008));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1033));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1048));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 0);
fs->ClearPrefetchCount();
@ -576,9 +593,13 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
*/
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(10));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(100));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 0);
fs->ClearPrefetchCount();
@ -596,14 +617,21 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
*/
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1000));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1008));
ASSERT_TRUE(iter->Valid());
iter->Seek(
BuildKey(996)); // Reseek won't prefetch any data and
// readahead_size will be initialized to 8*1024.
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(992));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(989));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 1);
fs->ClearPrefetchCount();
@ -615,11 +643,17 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
// Read sequentially to confirm readahead_size is reset to initial value (2
// more data blocks)
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1015));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019)); // Prefetch Data
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1022));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1026));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(103)); // Prefetch Data
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 2);
fs->ClearPrefetchCount();
@ -634,12 +668,19 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
*/
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1167));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1499));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1667));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1847));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1999));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 1);
fs->ClearPrefetchCount();
@ -766,8 +807,11 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
// Warm up the cache
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1015));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 1);
fs->ClearPrefetchCount();
@ -780,20 +824,31 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
// After caching, blocks will be read from cache (Sequential blocks)
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
iter->Seek(BuildKey(0));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1000));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1004)); // Prefetch data (not in cache).
ASSERT_TRUE(iter->Valid());
// Missed one sequential block but the next one is already in the buffer so
// readahead will not be reset.
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
// Prefetch data but blocks are in cache so no prefetch and reset.
iter->Seek(BuildKey(1015));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1022));
ASSERT_TRUE(iter->Valid());
// Prefetch data with readahead_size = 4 blocks.
iter->Seek(BuildKey(1026));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(103));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1033));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1037));
ASSERT_TRUE(iter->Valid());
if (support_prefetch && !use_direct_io) {
ASSERT_EQ(fs->GetPrefetchCount(), 3);
@ -881,7 +936,7 @@ TEST_P(PrefetchTest1, DBIterLevelReadAhead) {
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::PrefetchAsync:Start",
"FilePrefetchBuffer::PrefetchAsyncInternal:Start",
[&](void*) { buff_async_prefetch_count++; });
// The callback checks, since reads are sequential, readahead_size doesn't
@ -955,7 +1010,7 @@ class PrefetchTest2 : public DBTestBase,
INSTANTIATE_TEST_CASE_P(PrefetchTest2, PrefetchTest2, ::testing::Bool());
#ifndef ROCKSDB_LITE
TEST_P(PrefetchTest2, NonSequentialReads) {
TEST_P(PrefetchTest2, NonSequentialReadsWithAdaptiveReadahead) {
const int kNumKeys = 1000;
// Set options
std::shared_ptr<MockFS> fs =
@ -1002,9 +1057,8 @@ TEST_P(PrefetchTest2, NonSequentialReads) {
int set_readahead = 0;
size_t readahead_size = 0;
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::PrefetchAsync:Start",
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack(
"BlockPrefetcher::SetReadaheadState",
[&](void* /*arg*/) { set_readahead++; });
@ -1018,13 +1072,15 @@ TEST_P(PrefetchTest2, NonSequentialReads) {
// Iterate until prefetch is done.
ReadOptions ro;
ro.adaptive_readahead = true;
// TODO akanksha: Remove after adding new unit tests.
ro.async_io = true;
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
while (iter->Valid() && buff_prefetch_count == 0) {
iter->Next();
}
ASSERT_EQ(readahead_size, 8 * 1024);
ASSERT_EQ(buff_prefetch_count, 1);
ASSERT_EQ(set_readahead, 0);
@ -1033,9 +1089,12 @@ TEST_P(PrefetchTest2, NonSequentialReads) {
// Move to last file and check readahead size fallbacks to 8KB. So next
// readahead size after prefetch should be 8 * 1024;
iter->Seek(BuildKey(4004));
ASSERT_TRUE(iter->Valid());
while (iter->Valid() && buff_prefetch_count == 0) {
iter->Next();
}
ASSERT_EQ(readahead_size, 8 * 1024);
ASSERT_EQ(set_readahead, 0);
ASSERT_EQ(buff_prefetch_count, 1);
@ -1099,7 +1158,7 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
size_t decrease_readahead_size = 8 * 1024;
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::PrefetchAsync:Start",
"FilePrefetchBuffer::PrefetchAsyncInternal:Start",
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
@ -1120,8 +1179,11 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
// Warm up the cache
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1015));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1019));
ASSERT_TRUE(iter->Valid());
buff_prefetch_count = 0;
}
@ -1129,26 +1191,39 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
ASSERT_OK(options.statistics->Reset());
// After caching, blocks will be read from cache (Sequential blocks)
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
iter->Seek(BuildKey(0));
iter->Seek(
BuildKey(0)); // In cache so it will decrease the readahead_size.
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1000));
ASSERT_TRUE(iter->Valid());
iter->Seek(BuildKey(1004)); // Prefetch data (not in cache).
expected_current_readahead_size = std::max(
decrease_readahead_size,
(expected_current_readahead_size >= decrease_readahead_size
? (expected_current_readahead_size - decrease_readahead_size)
: 0));
iter->Seek(BuildKey(1000)); // Prefetch the block.
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
expected_current_readahead_size *= 2;
// Missed one sequential block but 1011 is already in buffer so
// readahead will not be reset.
iter->Seek(BuildKey(1004)); // Prefetch the block.
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
expected_current_readahead_size *= 2;
// 1011 is already in the cache, so readahead_size will not be reset.
iter->Seek(BuildKey(1011));
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
// Eligible to prefetch data (not in buffer) but the block is in the cache, so
// no prefetch will happen, resulting in a decrease in readahead_size.
// readahead_size will be 8 * 1024.
iter->Seek(BuildKey(1015));
ASSERT_TRUE(iter->Valid());
expected_current_readahead_size -= decrease_readahead_size;
expected_current_readahead_size = std::max(
decrease_readahead_size,
(expected_current_readahead_size >= decrease_readahead_size
? (expected_current_readahead_size - decrease_readahead_size)
: 0));
// 1016 is the same block as 1015. So no change in readahead_size.
iter->Seek(BuildKey(1016));
@ -1169,7 +1244,7 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
iter->Seek(BuildKey(1022));
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
ASSERT_EQ(buff_prefetch_count, 2);
ASSERT_EQ(buff_prefetch_count, 3);
// Check stats to make sure async prefetch is done.
{
@ -1179,6 +1254,7 @@ TEST_P(PrefetchTest2, DecreaseReadAheadIfInCache) {
ASSERT_EQ(async_read_bytes.count, 0);
} else {
ASSERT_GT(async_read_bytes.count, 0);
ASSERT_GT(get_perf_context()->number_async_seek, 0);
}
}
@ -1193,6 +1269,33 @@ class PrefetchTestWithPosix : public DBTestBase,
public ::testing::WithParamInterface<bool> {
public:
PrefetchTestWithPosix() : DBTestBase("prefetch_test_with_posix", true) {}
#ifndef ROCKSDB_LITE
#ifdef GFLAGS
const int kMaxArgCount = 100;
const size_t kArgBufferSize = 100000;
void RunIOTracerParserTool(std::string trace_file) {
std::vector<std::string> params = {"./io_tracer_parser",
"-io_trace_file=" + trace_file};
char arg_buffer[kArgBufferSize];
char* argv[kMaxArgCount];
int argc = 0;
int cursor = 0;
for (const auto& arg : params) {
ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
ASSERT_LE(argc + 1, kMaxArgCount);
snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
argv[argc++] = arg_buffer + cursor;
cursor += static_cast<int>(arg.size()) + 1;
}
ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
}
#endif // GFLAGS
#endif // ROCKSDB_LITE
};
INSTANTIATE_TEST_CASE_P(PrefetchTestWithPosix, PrefetchTestWithPosix,
@ -1264,7 +1367,7 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
}
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::PrefetchAsync:Start",
"FilePrefetchBuffer::PrefetchAsyncInternal:Start",
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack(
@ -1275,12 +1378,15 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
// Read the keys.
{
ASSERT_OK(options.statistics->Reset());
get_perf_context()->Reset();
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
int num_keys = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(iter->status());
num_keys++;
}
ASSERT_EQ(num_keys, total_keys);
ASSERT_GT(buff_prefetch_count, 0);
@ -1301,6 +1407,55 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
}
ASSERT_GT(prefetched_bytes_discarded.count, 0);
}
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
}
{
// Read the keys using seek.
{
ASSERT_OK(options.statistics->Reset());
get_perf_context()->Reset();
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
int num_keys = 0;
iter->Seek(BuildKey(450));
while (iter->Valid()) {
ASSERT_OK(iter->status());
num_keys++;
iter->Next();
}
ASSERT_OK(iter->status());
iter->Seek(BuildKey(450));
while (iter->Valid()) {
ASSERT_OK(iter->status());
num_keys++;
iter->Prev();
}
ASSERT_EQ(num_keys, total_keys + 1);
ASSERT_GT(buff_prefetch_count, 0);
// Check stats to make sure async prefetch is done.
{
HistogramData async_read_bytes;
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
HistogramData prefetched_bytes_discarded;
options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
&prefetched_bytes_discarded);
// Not all platforms support iouring. In that case, ReadAsync in posix
// won't submit async requests.
if (read_async_called) {
ASSERT_GT(async_read_bytes.count, 0);
ASSERT_GT(get_perf_context()->number_async_seek, 0);
} else {
ASSERT_EQ(async_read_bytes.count, 0);
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
}
ASSERT_GT(prefetched_bytes_discarded.count, 0);
}
}
}
SyncPoint::GetInstance()->DisableProcessing();
@ -1308,6 +1463,133 @@ TEST_P(PrefetchTestWithPosix, ReadAsyncWithPosixFS) {
Close();
}
#ifndef ROCKSDB_LITE
#ifdef GFLAGS
TEST_P(PrefetchTestWithPosix, TraceReadAsyncWithCallbackWrapper) {
if (mem_env_ || encrypted_env_) {
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
return;
}
const int kNumKeys = 1000;
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
FileSystem::Default(), /*support_prefetch=*/false);
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
bool use_direct_io = false;
Options options = CurrentOptions();
options.write_buffer_size = 1024;
options.create_if_missing = true;
options.compression = kNoCompression;
options.env = env.get();
options.statistics = CreateDBStatistics();
if (use_direct_io) {
options.use_direct_reads = true;
options.use_direct_io_for_flush_and_compaction = true;
}
BlockBasedTableOptions table_options;
table_options.no_block_cache = true;
table_options.cache_index_and_filter_blocks = false;
table_options.metadata_block_size = 1024;
table_options.index_type =
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Status s = TryReopen(options);
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
// If direct IO is not supported, skip the test
return;
} else {
ASSERT_OK(s);
}
int total_keys = 0;
// Write the keys.
{
WriteBatch batch;
Random rnd(309);
for (int j = 0; j < 5; j++) {
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
total_keys++;
}
ASSERT_OK(db_->Write(WriteOptions(), &batch));
ASSERT_OK(Flush());
}
MoveFilesToLevel(2);
}
int buff_prefetch_count = 0;
bool read_async_called = false;
ReadOptions ro;
ro.adaptive_readahead = true;
ro.async_io = true;
if (GetParam()) {
ro.readahead_size = 16 * 1024;
}
SyncPoint::GetInstance()->SetCallBack(
"FilePrefetchBuffer::PrefetchAsyncInternal:Start",
[&](void*) { buff_prefetch_count++; });
SyncPoint::GetInstance()->SetCallBack(
"UpdateResults::io_uring_result",
[&](void* /*arg*/) { read_async_called = true; });
SyncPoint::GetInstance()->EnableProcessing();
// Read the keys.
{
// Start io_tracing.
WriteOptions write_opt;
TraceOptions trace_opt;
std::unique_ptr<TraceWriter> trace_writer;
std::string trace_file_path = dbname_ + "/io_trace_file";
ASSERT_OK(
NewFileTraceWriter(env_, EnvOptions(), trace_file_path, &trace_writer));
ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
ASSERT_OK(options.statistics->Reset());
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
int num_keys = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_OK(iter->status());
num_keys++;
}
// End the tracing.
ASSERT_OK(db_->EndIOTrace());
ASSERT_OK(env_->FileExists(trace_file_path));
ASSERT_EQ(num_keys, total_keys);
ASSERT_GT(buff_prefetch_count, 0);
// Check stats to make sure async prefetch is done.
{
HistogramData async_read_bytes;
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
// Not all platforms support iouring. In that case, ReadAsync in posix
// won't submit async requests.
if (read_async_called) {
ASSERT_GT(async_read_bytes.count, 0);
} else {
ASSERT_EQ(async_read_bytes.count, 0);
}
}
// Check the file to see if ReadAsync is logged.
RunIOTracerParserTool(trace_file_path);
}
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
Close();
}
#endif // GFLAGS
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {


@ -457,9 +457,16 @@ IOStatus RandomAccessFileReader::ReadAsync(
IOStatus s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
io_handle, del_fn, nullptr /*dbg*/);
// Suppress false positive clang analyzer warnings.
// Memory is not released if file_->ReadAsync returns !s.ok(), because
// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
// called then ReadAsync should always return IOStatus::OK().
#ifndef __clang_analyzer__
if (!s.ok()) {
delete read_async_info;
}
#endif // __clang_analyzer__
return s;
}
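The ownership rule the comment above describes is worth stating on its own: a successful ReadAsync() transfers the heap-allocated context to the callback, which frees it, so only the failure path frees it at the call site (hidden from the clang analyzer, which cannot see that contract). A generic sketch of this hand-off pattern follows; all names are illustrative, not RocksDB API.
```
#include <functional>

// Stand-in async API: invokes cb(arg) exactly once iff it returns true.
// A real implementation would complete on another thread.
bool SubmitAsync(std::function<void(void*)> cb, void* arg) {
  cb(arg);
  return true;
}

struct Ctx {
  // ... per-request bookkeeping (timing, user callback, etc.) ...
};

void Completion(void* arg) {
  Ctx* ctx = static_cast<Ctx*>(arg);
  // ... consume results ...
  delete ctx;  // success path: the callback owns the context
}

bool StartRead() {
  Ctx* ctx = new Ctx;
  if (!SubmitAsync(Completion, ctx)) {
    delete ctx;  // failure path: Completion will never run
    return false;
  }
  return true;  // ctx is now owned by Completion
}
```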


@ -172,8 +172,11 @@ class RandomAccessFileReader {
size_t num_reqs, AlignedBuf* aligned_buf,
Env::IOPriority rate_limiter_priority) const;
IOStatus Prefetch(uint64_t offset, size_t n) const {
return file_->Prefetch(offset, n, IOOptions(), nullptr);
IOStatus Prefetch(uint64_t offset, size_t n,
const Env::IOPriority rate_limiter_priority) const {
IOOptions opts;
opts.rate_limiter_priority = rate_limiter_priority;
return file_->Prefetch(offset, n, opts, nullptr);
}
FSRandomAccessFile* file() { return file_.get(); }


@ -54,10 +54,14 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum,
UpdateFileChecksum(data);
{
IOOptions io_options;
io_options.rate_limiter_priority =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
IOSTATS_TIMER_GUARD(prepare_write_nanos);
TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left,
IOOptions(), nullptr);
io_options, nullptr);
}
// See whether we need to enlarge the buffer to avoid the flush
@ -159,7 +163,8 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum,
TEST_KILL_RANDOM("WritableFileWriter::Append:1");
if (s.ok()) {
filesize_ += data.size();
uint64_t cur_size = filesize_.load(std::memory_order_acquire);
filesize_.store(cur_size + data.size(), std::memory_order_release);
}
return s;
}
@ -187,7 +192,8 @@ IOStatus WritableFileWriter::Pad(const size_t pad_bytes,
cap = buf_.Capacity() - buf_.CurrentSize();
}
pending_sync_ = true;
filesize_ += pad_bytes;
uint64_t cur_size = filesize_.load(std::memory_order_acquire);
filesize_.store(cur_size + pad_bytes, std::memory_order_release);
if (perform_data_verification_) {
buffered_data_crc32c_checksum_ =
crc32c::Extend(buffered_data_crc32c_checksum_,
@ -211,6 +217,8 @@ IOStatus WritableFileWriter::Close() {
s = Flush(); // flush cache to OS
IOStatus interim;
IOOptions io_options;
io_options.rate_limiter_priority = writable_file_->GetIOPriority();
// In direct I/O mode we write whole pages so
// we need to let the file know where data ends.
if (use_direct_io()) {
@ -221,14 +229,15 @@ IOStatus WritableFileWriter::Close() {
start_ts = FileOperationInfo::StartNow();
}
#endif
interim = writable_file_->Truncate(filesize_, IOOptions(), nullptr);
uint64_t filesz = filesize_.load(std::memory_order_acquire);
interim = writable_file_->Truncate(filesz, io_options, nullptr);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileTruncateFinish(start_ts, finish_ts, s);
if (!interim.ok()) {
NotifyOnIOError(interim, FileOperationType::kTruncate, file_name(),
filesize_);
filesz);
}
}
#endif
@ -241,7 +250,7 @@ IOStatus WritableFileWriter::Close() {
start_ts = FileOperationInfo::StartNow();
}
#endif
interim = writable_file_->Fsync(IOOptions(), nullptr);
interim = writable_file_->Fsync(io_options, nullptr);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
@ -267,7 +276,7 @@ IOStatus WritableFileWriter::Close() {
start_ts = FileOperationInfo::StartNow();
}
#endif
interim = writable_file_->Close(IOOptions(), nullptr);
interim = writable_file_->Close(io_options, nullptr);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
@ -331,7 +340,11 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
start_ts = FileOperationInfo::StartNow();
}
#endif
s = writable_file_->Flush(IOOptions(), nullptr);
IOOptions io_options;
io_options.rate_limiter_priority =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
s = writable_file_->Flush(io_options, nullptr);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = std::chrono::steady_clock::now();
@ -362,8 +375,9 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
const uint64_t kBytesNotSyncRange =
1024 * 1024; // recent 1MB is not synced.
const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
if (filesize_ > kBytesNotSyncRange) {
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
uint64_t cur_size = filesize_.load(std::memory_order_acquire);
if (cur_size > kBytesNotSyncRange) {
uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
assert(offset_sync_to >= last_sync_size_);
if (offset_sync_to > 0 &&
@ -428,17 +442,22 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) {
IOSTATS_TIMER_GUARD(fsync_nanos);
TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
auto prev_perf_level = GetPerfLevel();
IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_);
#ifndef ROCKSDB_LITE
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
}
#endif
IOOptions io_options;
io_options.rate_limiter_priority = writable_file_->GetIOPriority();
if (use_fsync) {
s = writable_file_->Fsync(IOOptions(), nullptr);
s = writable_file_->Fsync(io_options, nullptr);
} else {
s = writable_file_->Sync(IOOptions(), nullptr);
s = writable_file_->Sync(io_options, nullptr);
}
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
@ -466,7 +485,9 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
start_ts = FileOperationInfo::StartNow();
}
#endif
IOStatus s = writable_file_->RangeSync(offset, nbytes, IOOptions(), nullptr);
IOOptions io_options;
io_options.rate_limiter_priority = writable_file_->GetIOPriority();
IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
auto finish_ts = std::chrono::steady_clock::now();
@ -490,19 +511,19 @@ IOStatus WritableFileWriter::WriteBuffered(
size_t left = size;
DataVerificationInfo v_info;
char checksum_buf[sizeof(uint32_t)];
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
IOOptions io_options;
io_options.rate_limiter_priority = rate_limiter_priority_used;
while (left > 0) {
size_t allowed;
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
size_t allowed = left;
if (rate_limiter_ != nullptr &&
rate_limiter_priority_used != Env::IO_TOTAL) {
allowed = rate_limiter_->RequestToken(left, 0 /* alignment */,
rate_limiter_priority_used, stats_,
RateLimiter::OpType::kWrite);
} else {
allowed = left;
}
{
@ -511,7 +532,7 @@ IOStatus WritableFileWriter::WriteBuffered(
#ifndef ROCKSDB_LITE
FileOperationInfo::StartTimePoint start_ts;
uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
old_size = next_write_offset_;
@ -524,10 +545,10 @@ IOStatus WritableFileWriter::WriteBuffered(
if (perform_data_verification_) {
Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf);
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info,
s = writable_file_->Append(Slice(src, allowed), io_options, v_info,
nullptr);
} else {
s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr);
s = writable_file_->Append(Slice(src, allowed), io_options, nullptr);
}
if (!s.ok()) {
// If writable_file_->Append() failed, then the data may or may not
@ -579,15 +600,16 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(
size_t left = size;
DataVerificationInfo v_info;
char checksum_buf[sizeof(uint32_t)];
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
IOOptions io_options;
io_options.rate_limiter_priority = rate_limiter_priority_used;
// Check how much is allowed. Here, we loop until the rate limiter allows us
// to write the entire buffer.
// TODO: need to be improved since it sort of defeats the purpose of the rate
// limiter
size_t data_size = left;
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
while (data_size > 0) {
size_t tmp_size;
@ -604,7 +626,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(
#ifndef ROCKSDB_LITE
FileOperationInfo::StartTimePoint start_ts;
uint64_t old_size = writable_file_->GetFileSize(IOOptions(), nullptr);
uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr);
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
old_size = next_write_offset_;
@ -617,8 +639,7 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(
EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
s = writable_file_->Append(Slice(src, left), IOOptions(), v_info,
nullptr);
s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr);
SetPerfLevel(prev_perf_level);
}
#ifndef ROCKSDB_LITE
@ -709,20 +730,20 @@ IOStatus WritableFileWriter::WriteDirect(
size_t left = buf_.CurrentSize();
DataVerificationInfo v_info;
char checksum_buf[sizeof(uint32_t)];
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
IOOptions io_options;
io_options.rate_limiter_priority = rate_limiter_priority_used;
while (left > 0) {
// Check how much is allowed
size_t size;
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
size_t size = left;
if (rate_limiter_ != nullptr &&
rate_limiter_priority_used != Env::IO_TOTAL) {
size = rate_limiter_->RequestToken(left, buf_.Alignment(),
writable_file_->GetIOPriority(),
stats_, RateLimiter::OpType::kWrite);
} else {
size = left;
rate_limiter_priority_used, stats_,
RateLimiter::OpType::kWrite);
}
{
@ -737,10 +758,10 @@ IOStatus WritableFileWriter::WriteDirect(
Crc32cHandoffChecksumCalculation(src, size, checksum_buf);
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
IOOptions(), v_info, nullptr);
io_options, v_info, nullptr);
} else {
s = writable_file_->PositionedAppend(Slice(src, size), write_offset,
IOOptions(), nullptr);
io_options, nullptr);
}
if (ShouldNotifyListeners()) {
@ -810,20 +831,22 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum(
DataVerificationInfo v_info;
char checksum_buf[sizeof(uint32_t)];
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
IOOptions io_options;
io_options.rate_limiter_priority = rate_limiter_priority_used;
// Check how much is allowed. Here, we loop until the rate limiter allows us
// to write the entire buffer.
// TODO: need to be improved since it sort of defeats the purpose of the rate
// limiter
size_t data_size = left;
Env::IOPriority rate_limiter_priority_used =
WritableFileWriter::DecideRateLimiterPriority(
writable_file_->GetIOPriority(), op_rate_limiter_priority);
if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
while (data_size > 0) {
size_t size;
size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
writable_file_->GetIOPriority(),
stats_, RateLimiter::OpType::kWrite);
rate_limiter_priority_used, stats_,
RateLimiter::OpType::kWrite);
data_size -= size;
}
}
@ -839,7 +862,7 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum(
EncodeFixed32(checksum_buf, buffered_data_crc32c_checksum_);
v_info.checksum = Slice(checksum_buf, sizeof(uint32_t));
s = writable_file_->PositionedAppend(Slice(src, left), write_offset,
IOOptions(), v_info, nullptr);
io_options, v_info, nullptr);
if (ShouldNotifyListeners()) {
auto finish_ts = std::chrono::steady_clock::now();
@ -894,4 +917,5 @@ Env::IOPriority WritableFileWriter::DecideRateLimiterPriority(
return op_rate_limiter_priority;
}
}
} // namespace ROCKSDB_NAMESPACE
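The recurring pattern in this file is: compute the effective priority once via DecideRateLimiterPriority(), stamp it into an IOOptions, and pass that to every FSWritableFile call so the filesystem layer can rate-limit as well. Below is a plausible reconstruction of the decision rule, inferred from the function tail visible above; the exact precedence in RocksDB may differ.
```
#include "rocksdb/env.h"

using ROCKSDB_NAMESPACE::Env;

// An explicit per-operation priority wins; otherwise fall back to the
// file's own priority (IO_TOTAL means "no override requested").
Env::IOPriority DecidePriority(Env::IOPriority file_prio,
                               Env::IOPriority op_prio) {
  return (op_prio == Env::IO_TOTAL) ? file_prio : op_prio;
}
```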

View File

@ -142,7 +142,7 @@ class WritableFileWriter {
size_t max_buffer_size_;
// Actually written data size; can be used for truncate (does not count
// padding data).
uint64_t filesize_;
std::atomic<uint64_t> filesize_;
#ifndef ROCKSDB_LITE
// This is necessary when we use unbuffered access
// and writes must happen on aligned offsets
@ -255,7 +255,9 @@ class WritableFileWriter {
// returns NotSupported status.
IOStatus SyncWithoutFlush(bool use_fsync);
uint64_t GetFileSize() const { return filesize_; }
uint64_t GetFileSize() const {
return filesize_.load(std::memory_order_acquire);
}
IOStatus InvalidateCache(size_t offset, size_t length) {
return writable_file_->InvalidateCache(offset, length);
@ -277,6 +279,7 @@ class WritableFileWriter {
const char* GetFileChecksumFuncName() const;
private:
// Decide the Rate Limiter priority.
static Env::IOPriority DecideRateLimiterPriority(
Env::IOPriority writable_file_io_priority,
Env::IOPriority op_rate_limiter_priority);


@ -100,8 +100,9 @@ struct CompressionOptions {
//
// The dictionary is created by sampling the SST file data. If
// `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's
// dictionary generator. Otherwise, the random samples are used directly as
// the dictionary.
// dictionary generator (see comments for option `use_zstd_dict_trainer` for
// details on the dictionary generator). If `zstd_max_train_bytes` is zero,
// the random samples are used directly as the dictionary.
//
// When compression dictionary is disabled, we compress and write each block
// before buffering data for the next one. When compression dictionary is
@ -173,6 +174,20 @@ struct CompressionOptions {
// Default: 0 (unlimited)
uint64_t max_dict_buffer_bytes;
// Use zstd trainer to generate dictionaries. When this option is set to true,
// zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes
// buffered data will be passed to zstd dictionary trainer to generate a
// dictionary of size max_dict_bytes.
//
// When this option is false, zstd's API ZDICT_finalizeDictionary() will be
// called to generate dictionaries. zstd_max_train_bytes of sampled training
// data will be passed to this API. Using this API should save CPU time on
// dictionary training, but the compression ratio may not be as good as using
// a dictionary trainer.
//
// Default: true
bool use_zstd_dict_trainer;
CompressionOptions()
: window_bits(-14),
level(kDefaultCompressionLevel),
@ -181,11 +196,13 @@ struct CompressionOptions {
zstd_max_train_bytes(0),
parallel_threads(1),
enabled(false),
max_dict_buffer_bytes(0) {}
max_dict_buffer_bytes(0),
use_zstd_dict_trainer(true) {}
CompressionOptions(int wbits, int _lev, int _strategy,
uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes,
uint32_t _parallel_threads, bool _enabled,
uint64_t _max_dict_buffer_bytes)
uint64_t _max_dict_buffer_bytes,
bool _use_zstd_dict_trainer)
: window_bits(wbits),
level(_lev),
strategy(_strategy),
@ -193,7 +210,8 @@ struct CompressionOptions {
zstd_max_train_bytes(_zstd_max_train_bytes),
parallel_threads(_parallel_threads),
enabled(_enabled),
max_dict_buffer_bytes(_max_dict_buffer_bytes) {}
max_dict_buffer_bytes(_max_dict_buffer_bytes),
use_zstd_dict_trainer(_use_zstd_dict_trainer) {}
};
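For reference, here is a sketch of configuring the new flag alongside the existing dictionary options; the sizes are illustrative, not recommendations from this diff.
```
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

Options MakeZstdDictOptions() {
  Options options;
  options.compression = kZSTD;
  // Illustrative sizes: buffer up to 64MB of samples, feed 1MB of them to
  // the dictionary builder, and cap the dictionary at 16KB.
  options.compression_opts.max_dict_bytes = 16 * 1024;
  options.compression_opts.zstd_max_train_bytes = 1024 * 1024;
  options.compression_opts.max_dict_buffer_bytes = 64 * 1024 * 1024;
  // false selects ZDICT_finalizeDictionary(): cheaper to build, but the
  // ratio may trail the full trainer (true, the default).
  options.compression_opts.use_zstd_dict_trainer = false;
  return options;
}
```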
// Temperature of a file. Used to pass to FileSystem for a different


@ -1041,6 +1041,12 @@ extern ROCKSDB_LIBRARY_API int
rocksdb_options_get_compression_options_zstd_max_train_bytes(
rocksdb_options_t* opt);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_compression_options_use_zstd_dict_trainer(
rocksdb_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API unsigned char
rocksdb_options_get_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*,
int);
extern ROCKSDB_LIBRARY_API int
@ -1059,6 +1065,12 @@ extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
rocksdb_options_t*, int, unsigned char);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t*, unsigned char, unsigned char);
extern ROCKSDB_LIBRARY_API unsigned char
rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
rocksdb_options_t* opt);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
rocksdb_options_t*, uint64_t, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor(
@ -1541,7 +1553,8 @@ enum {
rocksdb_env_lock_file_nanos,
rocksdb_env_unlock_file_nanos,
rocksdb_env_new_logger_nanos,
rocksdb_total_metric_count = 68
rocksdb_number_async_seek,
rocksdb_total_metric_count = 69
};
extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);
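The new C bindings mirror the C++ option. A minimal sketch using the setter/getter pair declared above (unsigned char serves as the boolean, per the usual c.h convention):
```
#include "rocksdb/c.h"

void ToggleZstdTrainer() {
  rocksdb_options_t* opt = rocksdb_options_create();
  // 0 selects ZDICT_finalizeDictionary(); nonzero selects the full zstd
  // trainer (the default).
  rocksdb_options_set_compression_options_use_zstd_dict_trainer(opt, 0);
  unsigned char uses_trainer =
      rocksdb_options_get_compression_options_use_zstd_dict_trainer(opt);
  (void)uses_trainer;  // expected: 0
  rocksdb_options_destroy(opt);
}
```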


@ -559,15 +559,15 @@ enum class CacheEntryRole {
kIndexBlock,
// Other kinds of block-based table block
kOtherBlock,
// WriteBufferManager reservations to account for memtable usage
// WriteBufferManager's charge to account for its memtable usage
kWriteBuffer,
// BlockBasedTableBuilder reservations to account for
// compression dictionary building buffer's memory usage
// Compression dictionary building buffer's charge to account for
// its memory usage
kCompressionDictionaryBuildingBuffer,
// Filter reservations to account for
// Filter's charge to account for
// (new) bloom and ribbon filter construction's memory usage
kFilterConstruction,
// BlockBasedTableReader reservations to account for
// BlockBasedTableReader's charge to account for
// its memory usage
kBlockBasedTableReader,
// Default bucket, for miscellaneous cache entries. Do not use for


@ -53,6 +53,7 @@ struct ConfigOptions;
using AccessPattern = RandomAccessFile::AccessPattern;
using FileAttributes = Env::FileAttributes;
// DEPRECATED
// Priority of an IO request. This is a hint and does not guarantee any
// particular QoS.
// IO_LOW - Typically background reads/writes such as compaction/flush
@ -86,6 +87,7 @@ struct IOOptions {
// Timeout for the operation in microseconds
std::chrono::microseconds timeout;
// DEPRECATED
// Priority - high or low
IOPriority prio;


@ -81,10 +81,8 @@ struct IOStatsContext {
// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global,
// non-thread-local IOStatsContext object will be returned. Attempts to update
// this object will be ignored, and reading from it will also be no-op.
// Otherwise,
// a) if thread-local is supported on the platform, then a pointer to
// a thread-local IOStatsContext object will be returned.
// b) if thread-local is NOT supported, then compilation will fail.
// Otherwise, a pointer to a thread-local IOStatsContext object will be
// returned.
//
// This function never returns nullptr.
IOStatsContext* get_iostats_context();
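Since the simplified contract above guarantees a non-null pointer on every build, usage reduces to reset-work-read. A small sketch, assuming the standard field names from IOStatsContext; whether timer-based fields populate also depends on the perf level.
```
#include <cstdint>

#include "rocksdb/iostats_context.h"
#include "rocksdb/perf_level.h"

using namespace ROCKSDB_NAMESPACE;

void MeasureIO() {
  // Timer-based fields additionally require an elevated perf level.
  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
  IOStatsContext* ioc = get_iostats_context();  // never nullptr, per above
  ioc->Reset();
  // ... perform DB reads/writes here ...
  uint64_t bytes = ioc->bytes_read;  // stays 0 under -DNIOSTATS_CONTEXT
  (void)bytes;
}
```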


@ -492,6 +492,23 @@ struct DBOptions {
// Default: false
bool track_and_verify_wals_in_manifest = false;
// EXPERIMENTAL: This API/behavior is subject to change
// If true, during DB open it verifies the SST unique id between MANIFEST
// and SST properties, to make sure the SST is not overwritten or
// misplaced. A corruption error will be reported if a mismatch is detected,
// but only when the MANIFEST tracks the unique id, which starts from
// version 7.3. The unique id is internal and subject to change.
//
// Note:
// 1. if enabled, it opens every SST file during DB open to read the unique
// id from the SST properties, so it's recommended to set `max_open_files=-1`
// to pre-open the SST files before the verification.
// 2. existing SST files won't have their unique_id tracked in the MANIFEST,
// in which case verification will be skipped.
//
// Default: false
bool verify_sst_unique_id_in_manifest = false;
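A minimal sketch of opening a DB with the new verification enabled, following the `max_open_files=-1` recommendation above (the helper name and path argument are illustrative):

```cpp
#include <string>
#include "rocksdb/db.h"

rocksdb::Status OpenWithUniqueIdCheck(const std::string& path,
                                      rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Check MANIFEST-tracked unique ids against SST properties at open;
  // verification is skipped for SSTs whose id is not yet tracked.
  options.verify_sst_unique_id_in_manifest = true;
  // Pre-open all SST files so the property reads happen once, up front.
  options.max_open_files = -1;
  return rocksdb::DB::Open(options, path, db);
}
```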
// Use the specified object to interact with the environment,
// e.g. to read/write files, schedule background work, etc. In the near
// future, support for doing storage operations such as read/write files
@@ -552,7 +569,7 @@ struct DBOptions {
// compaction. For universal-style compaction, you can usually set it to -1.
//
// A high value or -1 for this option can cause high memory usage.
// See BlockBasedTableOptions::reserve_table_reader_memory to constrain
// See BlockBasedTableOptions::cache_usage_options to constrain
// memory usage in case of block based table format.
//
// Default: -1
@@ -1150,8 +1167,7 @@ struct DBOptions {
#endif // ROCKSDB_LITE
// If true, then DB::Open / CreateColumnFamily / DropColumnFamily
// / SetOptions will fail if options file is not detected or properly
// persisted.
// SetOptions will fail if options file is not properly persisted.
//
// DEFAULT: false
bool fail_if_options_file_error = false;
@@ -1973,6 +1989,11 @@ struct CompactionServiceOptionsOverride {
// returned to CompactionService primary host, to collect that, the user needs
// to set it here.
std::shared_ptr<Statistics> statistics = nullptr;
// Only compaction-generated SST files use this user-defined table properties
// collector.
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
table_properties_collector_factories;
};
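A sketch of wiring the new field on the remote-compaction side, assuming a user-supplied `TablePropertiesCollectorFactory` (the helper and its arguments are illustrative):

```cpp
#include <memory>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table_properties.h"

void ConfigureOverride(
    rocksdb::CompactionServiceOptionsOverride& opts_override,
    std::shared_ptr<rocksdb::TablePropertiesCollectorFactory> factory) {
  // Statistics gathered on the remote worker, to be collected by the
  // CompactionService primary host.
  opts_override.statistics = rocksdb::CreateDBStatistics();
  // New field: collectors applied only to compaction-generated SST files.
  opts_override.table_properties_collector_factories.push_back(
      std::move(factory));
}
```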
struct OpenAndCompactOptions {


@@ -229,6 +229,8 @@ struct PerfContext {
// Time spent in decrypting data. Populated when EncryptedEnv is used.
uint64_t decrypt_data_nanos;
uint64_t number_async_seek;
std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
bool per_level_perf_context_enabled = false;
};
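A sketch of surfacing the new counter around a batch of asynchronous reads; perf tracking is otherwise assumed to be at its defaults:

```cpp
#include <iostream>
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void ReportAsyncSeeks() {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  rocksdb::get_perf_context()->Reset();
  // ... issue iterator reads with ReadOptions::async_io = true here ...
  std::cout << "async seeks: "
            << rocksdb::get_perf_context()->number_async_seek << "\n";
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}
```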


@@ -543,6 +543,9 @@ enum Histograms : uint32_t {
// Number of prefetched bytes discarded by RocksDB.
PREFETCHED_BYTES_DISCARDED,
// Number of IOs issued in parallel in a MultiGet batch
MULTIGET_IO_BATCH_SIZE,
HISTOGRAM_ENUM_MAX,
};
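A sketch of reading the new histogram from a DB's statistics object, assuming `Options::statistics` was set via `CreateDBStatistics()`:

```cpp
#include <iostream>
#include <memory>
#include "rocksdb/statistics.h"

void ReportMultiGetBatching(
    const std::shared_ptr<rocksdb::Statistics>& stats) {
  rocksdb::HistogramData data;
  stats->histogramData(rocksdb::MULTIGET_IO_BATCH_SIZE, &data);
  std::cout << "IOs per MultiGet batch: avg=" << data.average
            << " p99=" << data.percentile99 << "\n";
}
```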


@@ -22,6 +22,7 @@
#include <string>
#include <unordered_map>
#include "rocksdb/cache.h"
#include "rocksdb/customizable.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
@@ -104,6 +105,23 @@ struct MetadataCacheOptions {
PinningTier unpartitioned_pinning = PinningTier::kFallback;
};
struct CacheEntryRoleOptions {
enum class Decision {
kEnabled,
kDisabled,
kFallback,
};
Decision charged = Decision::kFallback;
bool operator==(const CacheEntryRoleOptions& other) const {
return charged == other.charged;
}
};
struct CacheUsageOptions {
CacheEntryRoleOptions options;
std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
};
// For advanced users only
struct BlockBasedTableOptions {
static const char* kName() { return "BlockTableOptions"; };
@@ -287,47 +305,80 @@ struct BlockBasedTableOptions {
// separately
uint64_t metadata_block_size = 4096;
// If true, a dynamically updating charge to block cache, loosely based
// on the actual memory usage of table building, will occur to account
// the memory, if block cache available.
// `cache_usage_options` allows users to specify the default
// options (`cache_usage_options.options`) and the overriding
// options (`cache_usage_options.options_overrides`)
// for different `CacheEntryRole` under various features related to cache
// usage.
//
// Charged memory usage includes:
// 1. Bloom Filter (format_version >= 5) and Ribbon Filter construction
// 2. More to come...
// For a certain `CacheEntryRole role` and a certain feature `f` of
// `CacheEntryRoleOptions`:
// 1. If `options_overrides` has an entry for `role` and
// `options_overrides[role].f != kFallback`, we use
// `options_overrides[role].f`
// 2. Otherwise, if `options[role].f != kFallback`, we use `options[role].f`
// 3. Otherwise, we follow the compatible existing behavior for `f` (see
// each feature's comment for more)
//
// Note:
// 1. Bloom Filter (format_version >= 5) and Ribbon Filter construction
// `cache_usage_options` currently supports specifying options for the
// following features:
//
// If additional temporary memory of Ribbon Filter uses up too much memory
// relative to the available space left in the block cache
// 1. Memory charging to block cache (`CacheEntryRoleOptions::charged`)
// Memory charging is a feature that accounts the memory usage of a specific
// area (represented by `CacheEntryRole`) toward usage in the block cache (if
// available), by updating a dynamic charge to the block cache loosely based
// on the actual memory usage of that area.
//
// (a) CacheEntryRole::kCompressionDictionaryBuildingBuffer
// (i) If kEnabled:
// Charge memory usage of the buffered data used as training samples for
// dictionary compression.
// If such memory usage exceeds the available space left in the block cache
// at some point (i.e., causing a cache full under
// LRUCacheOptions::strict_capacity_limit = true), construction will fall back
// to Bloom Filter.
// `LRUCacheOptions::strict_capacity_limit` = true), the data will then be
// unbuffered.
// (ii) If kDisabled:
// Does not charge the memory usage mentioned above.
// (iii) Compatible existing behavior:
// Same as kEnabled.
//
// Default: false
bool reserve_table_builder_memory = false;
// If true, a dynamically updating charge to block cache, loosely based
// on the actual memory usage of table reader, will occur to account
// the memory, if block cache available.
// (b) CacheEntryRole::kFilterConstruction
// (i) If kEnabled:
// Charge memory usage of Bloom Filter
// (format_version >= 5) and Ribbon Filter construction.
// If additional temporary memory of Ribbon Filter exceeds the available
// space left in the block cache at some point (i.e., causing a cache full
// under `LRUCacheOptions::strict_capacity_limit` = true),
// construction will fall back to Bloom Filter.
// (ii) If kDisabled:
// Does not charge the memory usage mentioned above.
// (iii) Compatible existing behavior:
// Same as kDisabled.
//
// Charged memory usage includes:
// 1. Table properties
// 2. Index block/Filter block/Uncompression dictionary if stored in table
// reader (i.e, BlockBasedTableOptions::cache_index_and_filter_blocks ==
// false)
// 3. Some internal data structures
// 4. More to come...
// (c) CacheEntryRole::kBlockBasedTableReader
// (i) If kEnabled:
// Charge memory usage of table properties +
// index block/filter block/uncompression dictionary (when stored in table
// reader i.e, BlockBasedTableOptions::cache_index_and_filter_blocks ==
// false) + some internal data structures during table reader creation.
// If such a table reader exceeds
// the available space left in the block cache at some point (i.e., causing
// a cache full under `LRUCacheOptions::strict_capacity_limit` = true),
// creation will fail with Status::MemoryLimit().
// (ii) If kDisabled:
// Does not charge the memory usage mentioned above.
// (iii) Compatible existing behavior:
// Same as kDisabled.
//
// Note:
// If creation of a table reader uses up too much memory
// relative to the available space left in the block cache
// at some point (i.e., causing a cache full under
// LRUCacheOptions::strict_capacity_limit = true), such creation will fail
// with Status::MemoryLimit().
// (d) Other CacheEntryRole
// Not supported.
// `Status::kNotSupported` will be returned if
// `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}.
//
// Default: false
bool reserve_table_reader_memory = false;
//
// 2. More to come ...
//
CacheUsageOptions cache_usage_options;
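Putting the resolution rules above together, a sketch of charging only filter construction to the block cache while leaving every other role on its compatible existing behavior (the cache sizing is illustrative):

```cpp
#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/table.h"

rocksdb::BlockBasedTableOptions MakeChargedTableOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  // Charging requires a block cache; strict_capacity_limit determines
  // what happens when a charge would overfill it.
  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = 1 << 30;  // 1 GiB, illustrative
  cache_opts.strict_capacity_limit = true;
  table_options.block_cache = rocksdb::NewLRUCache(cache_opts);
  // Default for all roles stays kFallback (rule 3: existing behavior).
  // Rule 1: override a single role to kEnabled.
  table_options.cache_usage_options.options_overrides.insert(
      {rocksdb::CacheEntryRole::kFilterConstruction,
       {/*.charged = */ rocksdb::CacheEntryRoleOptions::Decision::kEnabled}});
  return table_options;
}
```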
// Note: currently this option requires kTwoLevelIndexSearch to be set as
// well.


@@ -22,8 +22,7 @@
#include "rocksdb/rocksdb_namespace.h"
#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \
defined(ROCKSDB_SUPPORT_THREAD_LOCAL)
#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS)
#define ROCKSDB_USING_THREAD_STATUS
#endif

Some files were not shown because too many files have changed in this diff.