Compare commits

...

36 Commits

Author SHA1 Message Date
Adam Retter
b9a4a10659 Fix build on MSVC 2020-03-07 17:58:42 +01:00
Adam Retter
1c63b82b9a Snappy must be built with position independent code for static linking 2020-03-07 13:56:13 +01:00
Adam Retter
64dbc7813e RocksJava must compile on JDK7 (#4768)
Summary:
Fixes some RocksJava regressions recently introduced, whereby RocksJava would not build on JDK 7.
These should have been visible on Travis-CI!
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4768

Differential Revision: D13418173

Pulled By: sagar0

fbshipit-source-id: 57bf223188887f84d9e072031af2e0d2c8a69c30
2020-03-07 13:31:52 +01:00
Adam Retter
466af89356 Update to latest Snappy to fix compilation issue on latest MacOS XCode 2020-03-07 13:08:19 +01:00
Peter Dillinger
614c6d453e Don't download from (unreliable) maven.org (#6348)
Summary:
I set up a mirror of our Java deps on github so we can download
them through github URLs rather than maven.org, which is proving
terribly unreliable from Travis builds.

Also sanitized calls to curl, so they are easier to read and
appropriately fail on download failure.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6348

Test Plan: CI

Differential Revision: D19633621

Pulled By: pdillinger

fbshipit-source-id: 7eb3f730953db2ead758dc94039c040f406790f3
2020-03-06 22:29:05 +01:00
Adam Retter
03492d61d4 Update 3rd-party libraries used by RocksJava (#6084)
Summary:
* LZ4 1.8.3 -> 1.9.2
* ZSTD 1.4.0 -> 1.4.4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6084

Differential Revision: D18710224

fbshipit-source-id: a461ef19a473d3480acdc027f627ec3048730692
2020-03-06 22:27:09 +01:00
Yun Tang
49f97dc93e Download bzip2 packages from sourceforge (#5995)
Summary:
bzip2's official [download page](http://www.bzip.org/downloads.html) points to SourceForge, so we can download it from there. This source should be more reliable than the previous web-archive URL.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5995

Differential Revision: D18377662

fbshipit-source-id: e8353f83d5d6ea6067f78208b7bfb7f0d5b49c05
2020-03-06 22:25:09 +01:00
sdong
fc38b0d5fe Rename InternalDBStatsType enum names (#5779)
Summary:
When building with clang 9, a warning is reported because the InternalDBStatsType enum names shadow those used for statistics. Rename them.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5779

Test Plan: Build with clang 9 and see it passes.

Differential Revision: D17239378

fbshipit-source-id: af28fb42066c738cd1b841f9fe21ab4671dafd18
2020-03-06 22:23:24 +01:00
Harry Wong
3598e1a9a9 Removed const fields in copyable classes (#5095)
Summary:
This fixed the compile error in Clang-8:
```
error: explicitly defaulted copy assignment operator is implicitly deleted [-Werror,-Wdefaulted-function-deleted]
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5095

Differential Revision: D14811961

Pulled By: riversand963

fbshipit-source-id: d935d1f85a4e8694dca10033fb5af92d8777eca0
2020-03-06 22:22:59 +01:00
Yi Wu
d07167c9a3 Fix build failures due to missing JEMALLOC_CXX_THROW macro (#5053)
Summary:
JEMALLOC_CXX_THROW is not defined for earlier versions of jemalloc (e.g. 3.6), causing builds to fail on some platforms. Fixing it. Closes #4869
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5053

Differential Revision: D14390034

Pulled By: sagar0

fbshipit-source-id: b2b7a03cd377201ef385eb521f65bae85c558055
2020-03-06 22:22:57 +01:00
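The patch itself is not reproduced here, but a minimal sketch of this kind of compatibility fix, assuming the issue is simply that older jemalloc headers (e.g. 3.6) never define the macro, is to define JEMALLOC_CXX_THROW as empty before it is used. The declaration below is hypothetical.

```
// Sketch only: guard against older jemalloc headers (e.g. 3.6) that do not
// define JEMALLOC_CXX_THROW, so declarations using the macro still compile.
#include <cstddef>

#ifndef JEMALLOC_CXX_THROW
#define JEMALLOC_CXX_THROW  // expand to nothing on older jemalloc
#endif

// Hypothetical declaration written against newer jemalloc headers; with the
// shim above it also compiles when the macro was missing.
extern "C" void* je_malloc_stub(std::size_t size) JEMALLOC_CXX_THROW;
```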
Adam Retter
931e6704af Update the version of the dependencies used by the RocksJava static build (#4761)
Summary:
Note that Snappy now requires CMake to build it, so I added a note about RocksJava to the README.md file.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4761

Differential Revision: D13403811

Pulled By: ajkr

fbshipit-source-id: 8fcd7e3dc7f7152080364a374d3065472f417eff
2020-03-06 22:21:26 +01:00
Peter Dillinger
e8e81bc39f Update version.h and HISTORY.md for 5.18.4 2020-03-03 08:21:13 -08:00
Adam Retter
689c15523f ARM64 commits to 5.18.3 to create 5.18.4 (#6250)
* RocksDB CRC32c optimization with ARMv8 Intrinsic (#5221)

Summary:
1. Add an Arm linear CRC32c implementation for RocksDB.
2. Add an Arm runtime check for CRC32 support.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5221

Differential Revision: D15013685

Pulled By: siying

fbshipit-source-id: 2c2983743d26656d93f212dc7c1a3cf66a1acf12

* Support rocksdbjava aarch64 build and test (#5258)

Summary:
Verified with an Ampere Computing eMAG aarch64 system.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5258

Differential Revision: D15807309

Pulled By: maysamyabandeh

fbshipit-source-id: ab85d2fd3fe40e6094430ab0eba557b1e979510d

* Cleanup the Arm64 CRC32 unused warning (#5565)

Summary:
When 'HAVE_ARM64_CRC' is set, the methods below:

- bool rocksdb::crc32c::isSSE42()
- bool rocksdb::crc32c::isPCLMULQDQ()

are defined but not used, so an unused-function warning is raised
when building RocksDB.

This patch cleans up these warnings by adding an ifndef: when building
with HAVE_ARM64_CRC, `isSSE42` and `isPCLMULQDQ` are not defined.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5565

Differential Revision: D16233654

fbshipit-source-id: c32a9dda7465dbf65f9ccafef159124db92cdffd

* Fixes for building RocksJava releases on arm64v8

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/5674

Differential Revision: D16870338

fbshipit-source-id: c8dac644b1479fa734b491f3a8d50151772290f7

* Remove invalid comparison of va_list and nullptr (#5836)

Summary:
The comparison of va_list and nullptr is always false on any architecture, and it raises an invalid-operands error in an aarch64 environment (`error: invalid operands of types ‘va_list {aka __va_list}’ and ‘std::nullptr_t’ to binary ‘operator!=’`).

This patch removes this invalid assert.

Closes: https://github.com/facebook/rocksdb/issues/4277
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5836

Differential Revision: D17532470

fbshipit-source-id: ca98078ecbc6a9416c69de3bd6ffcfa33a0f0185

* Fix naming of library on PPC64LE (#6080)

Summary:
**NOTE**: This also needs to be back-ported to 6.4.6

Fix a regression introduced in f2bf0b2 by https://github.com/facebook/rocksdb/pull/5674 whereby the compiled library would get the wrong name on PPC64LE platforms.

On PPC64LE, the regression caused the library to be named `librocksdbjni-linux64.so` instead of `librocksdbjni-linux-ppc64le.so`.

This PR corrects the name back to `librocksdbjni-linux-ppc64le.so` and also corrects the ordering of conditional arguments in the Makefile to match the expected order as defined in the documentation for Make.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6080

Differential Revision: D18710351

fbshipit-source-id: d4db87ef378263b57de7f9edce1b7d15644cf9de

Co-authored-by: Yuqi <yuqi.gu@arm.com>
Co-authored-by: Patrick Zhang <cnqpzhang@163.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2020-01-06 10:39:50 -08:00
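For the first item in the squashed commit above (an ARMv8 CRC32c implementation plus a runtime check), a rough sketch of the general technique, assuming a Linux aarch64 target compiled with `-march=armv8-a+crc`, might look like the following. It is an illustration of the idea, not the code from pull request 5221.

```
// Sketch: an ARMv8 CRC32C path guarded by a Linux runtime capability check.
#include <cstddef>
#include <cstdint>
#include <cstring>

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>   // __crc32cb / __crc32cd intrinsics
#include <sys/auxv.h>   // getauxval
#include <asm/hwcap.h>  // HWCAP_CRC32

// Runtime check: the kernel reports CRC32 support via the hwcap auxv entry.
static bool HasHardwareCrc32c() { return getauxval(AT_HWCAP) & HWCAP_CRC32; }

// Linear CRC32C: 8 bytes at a time, then byte-at-a-time for the tail.
static uint32_t Crc32cArm(uint32_t crc, const char* data, size_t len) {
  crc = ~crc;
  while (len >= 8) {
    uint64_t v;
    std::memcpy(&v, data, 8);
    crc = __crc32cd(crc, v);
    data += 8;
    len -= 8;
  }
  while (len--) {
    crc = __crc32cb(crc, static_cast<uint8_t>(*data++));
  }
  return ~crc;
}
#endif  // __aarch64__ && __ARM_FEATURE_CRC32
```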
Vijay Nadimpalli
1369736b18 Making platform 007 (gcc 7) default in build_detect_platform.sh (#5947)
Summary:
Making platform 007 (gcc 7) default in build_detect_platform.sh.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5947

Differential Revision: D18038837

Pulled By: vjnadimpalli

fbshipit-source-id: 9ac2ddaa93bf328a416faec028970e039886378e
2019-10-30 10:16:03 -07:00
Andrew Kryczka
95dc6cd6ed Add latest toolchain (gcc-8, etc.) build support for fbcode users (#4923)
Summary:
- When building with internal dependencies, specify this toolchain by setting `ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1`
- It is not enabled by default. However, it is enabled for TSAN builds in CI since there is a known problem with TSAN in gcc-5: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090
- I did not add support for Lua since (1) we agreed to deprecate it, and (2) we only have an internal build for v5.3 with this toolchain while that has breaking changes compared to our current version (v5.2).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4923

Differential Revision: D13827226

Pulled By: ajkr

fbshipit-source-id: 9aa3388ed3679777cfb15ef8cbcb83c07f62f947
2019-10-30 10:15:48 -07:00
Andrew Kryczka
641fae60f6 update history and bump version 2019-02-11 14:02:52 -08:00
yangzhijia
b7434c29d2 Properly set upper bound of subcompaction output (#4879) (#4898)
Summary:
Fix the output overlap bug when using subcompactions: the upper bound of the output
file was extended incorrectly.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4898

Differential Revision: D13736107

Pulled By: ajkr

fbshipit-source-id: 21dca09f81d5f07bf2766bf566f9b50dcab7d8e3
2019-02-11 14:01:38 -08:00
Sagar Vemuri
a1774dde9a Bump version to 5.18.2 2019-01-31 15:49:35 -08:00
Yanqin Jin
65b2298510 Use correct FileMeta for atomic flush result install (#4932)
Summary:
1. this commit fixes our handling of a combination of two separate edge
cases. If a flush job does not pick any memtable to flush (because another
flush job has already picked the same memtables), and the column family
assigned to the flush job is dropped right before RocksDB calls
rocksdb::InstallMemtableAtomicFlushResults, our original code passes
a FileMetaData object whose file number is 0, failing the assertion in
rocksdb::InstallMemtableAtomicFlushResults (assert(m->GetFileNumber() > 0)).
2. Also piggyback a small change: since we already create a local copy of column family's mutable CF options to eliminate potential race condition with `SetOptions` call, we might as well use the local copy in other function calls in the same scope.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4932

Differential Revision: D13901322

Pulled By: riversand963

fbshipit-source-id: b936580af7c127ea0c6c19ea10cd5fcede9fb0f9
2019-01-31 15:06:26 -08:00
Yanqin Jin
acba14b3d9 Make a copy of MutableCFOptions to avoid race condition (#4876)
Summary:
If we do not do this, then reading MutableCFOptions may have a race condition
with SetOptions which modifies MutableCFOptions.

Also reserve space in advance for vectors to avoid reallocation changing the
address of its elements.

Test plan
```
$make clean && make -j32 all check
$make clean && COMPILE_WITH_TSAN=1 make -j32 all check
$make clean && COMPILE_WITH_ASAN=1 make -j32 all check
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4876

Differential Revision: D13644500

Pulled By: riversand963

fbshipit-source-id: 4b8112c5c819d5a2922bb61ad1521b3d2fb2fd47
2019-01-31 15:06:08 -08:00
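A hedged sketch of the pattern described above, with simplified stand-in types rather than RocksDB's MutableCFOptions and ColumnFamilyData: copy the mutable options while holding the mutex that guards SetOptions-style updates, read only the copy afterwards, and reserve vector capacity up front so growth cannot move existing elements.

```
// Illustration only: snapshot mutable options under the same mutex that
// guards SetOptions-style updates, then work from the snapshot lock-free.
#include <cstdint>
#include <mutex>
#include <vector>

struct MutableOptions {          // stand-in for MutableCFOptions
  uint64_t write_buffer_size = 64 << 20;
  int max_write_buffer_number = 2;
};

class ColumnFamily {             // stand-in for ColumnFamilyData
 public:
  void SetOptions(const MutableOptions& opts) {
    std::lock_guard<std::mutex> lock(mu_);
    mutable_opts_ = opts;        // concurrent writer
  }

  void Flush(size_t expected_outputs) {
    MutableOptions snapshot;
    {
      std::lock_guard<std::mutex> lock(mu_);
      snapshot = mutable_opts_;  // copy once; no further racy reads
    }
    // Reserve in advance so later push_back calls cannot reallocate and
    // invalidate pointers/references into the vector.
    std::vector<uint64_t> file_numbers;
    file_numbers.reserve(expected_outputs);
    (void)snapshot;
  }

 private:
  std::mutex mu_;
  MutableOptions mutable_opts_;
};
```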
anand76
53f760b8a8 Always delete Blob DB files in the background (#4928)
Summary:
Blob DB files are not tracked by the SFM, so they currently don't get
deleted in the background. Force them to be deleted in the background so
that rate limiting can be applied.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4928

Differential Revision: D13854649

Pulled By: anand1976

fbshipit-source-id: 8031ce66842ff0af440c715d886b377983dad7d8
2019-01-31 14:19:04 -08:00
Siying Dong
35c05bca0f Deleting Blob files also goes through SstFileManager (#4904)
Summary:
Right now, deleting blob files is not rate limited, even if an SstFileManager is specified;
rate-limited blob deletion is simply not supported. With this change, blob file
deletion goes through SstFileManager too.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4904

Differential Revision: D13772545

Pulled By: siying

fbshipit-source-id: bd1b1d0beb26d5167385e00b7ecb8b94b879de84
2019-01-31 14:18:21 -08:00
Yanqin Jin
9ae0528dc4 Use chrono::time_point instead of time_t (#4868)
Summary:
By convention, time_t almost always stores the integral number of seconds since
00:00 hours, Jan 1, 1970 UTC, according to http://www.cplusplus.com/reference/ctime/time_t/.
We surely want more precision than seconds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4868

Differential Revision: D13633046

Pulled By: riversand963

fbshipit-source-id: 4e01e23a22e8838023c51a91247a286dbf3a5396
2019-01-23 11:11:13 -08:00
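As a quick illustration of the precision point above (not the RocksDB change itself): time_t holds whole seconds, while a std::chrono::time_point keeps sub-second resolution that can be extracted with a duration cast.

```
#include <chrono>
#include <ctime>
#include <iostream>

int main() {
  using Clock = std::chrono::system_clock;
  const Clock::time_point now = Clock::now();

  // time_t: whole seconds since the epoch; sub-second information is lost.
  const std::time_t secs = Clock::to_time_t(now);

  // time_point keeps sub-second precision; cast to nanoseconds to show it.
  const auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(
                         now.time_since_epoch())
                         .count();

  std::cout << "seconds:     " << secs << "\n"
            << "nanoseconds: " << nanos << "\n";
  return 0;
}
```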
Yanqin Jin
4eeb1bf0a6 Bump version to 5.18.1 2019-01-09 16:15:59 -08:00
Yanqin Jin
3bcc31295a Initialize two members in PerfContext (#4859)
Summary:
as titled.
Currently it's possible to create a local object of type PerfContext since it's
part of the public API, so it's safer to initialize the two members to 0.
If PerfContext is created as a thread-local object, all members are
zero-initialized according to the C++ standard.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4859

Differential Revision: D13614504

Pulled By: riversand963

fbshipit-source-id: 406ff548e105a074f379ad1054d56fece5f524a0
2019-01-09 16:15:31 -08:00
Andrew Kryczka
e78f5cfba7 Fix point lookup on range tombstone sentinel endpoint (#4829)
Summary:
Previously for point lookup we decided which file to look into based on user key overlap only. We also did not truncate range tombstones in the point lookup code path. These two ideas did not interact well in cases like this:

- L1 has range tombstone [a, c)#1 and point key b#2. The data is split between file1 with range [a#1,1, b#72057594037927935,15], and file2 with range [b#2, c#1].
- L1's file2 gets compacted to L2.
- User issues `Get()` for b#3.
- L1's file1 is opened and the range tombstone [a, c)#1 is found for b, while no point-key for b is found in L1.
- `Get()` assumes that the range tombstone must cover all data in that range in lower levels, so short circuits and returns `NotFound`.

The solution to this problem is to not look into files that only overlap with the point lookup at a range tombstone sentinel endpoint. In the above example, this would mean not opening L1's file1 or its tombstones during the `Get()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4829

Differential Revision: D13561355

Pulled By: ajkr

fbshipit-source-id: a13c21c816870a2f5d32a48af6dbd719a7d9d19f
2019-01-08 17:50:02 -08:00
Yanqin Jin
97773d0967 Update HISTORY.md 2019-01-07 10:18:58 -08:00
Yanqin Jin
35c950a94e Refactor atomic flush result installation to MANIFEST (#4791)
Summary:
as titled.
Since different bg flush threads can flush different sets of column families
(due to column family creation and drop), we decide not to let one thread
perform atomic flush result installation for other threads. Bg flush threads
will install their atomic flush results sequentially to MANIFEST, using
a condition variable, atomic_flush_install_cv_, to coordinate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4791

Differential Revision: D13498930

Pulled By: riversand963

fbshipit-source-id: dd7482fc41f4bd22dad1e1ef7d4764ef424688d7
2019-01-07 10:12:51 -08:00
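A small sketch of the coordination pattern described above, assuming each background flush thread knows its position in the install order; names such as install_cv and manifest are stand-ins for illustration, not RocksDB's members.

```
// Illustration: serialize "install" steps from multiple flush threads using a
// condition variable, so results are appended one thread at a time, in order.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

std::mutex mu;
std::condition_variable install_cv;  // stand-in for atomic_flush_install_cv_
int next_turn = 0;
std::vector<int> manifest;           // stand-in for the MANIFEST

void InstallFlushResult(int my_turn) {
  std::unique_lock<std::mutex> lock(mu);
  install_cv.wait(lock, [my_turn] { return next_turn == my_turn; });
  manifest.push_back(my_turn);       // "install" this thread's flush result
  ++next_turn;
  install_cv.notify_all();           // wake the next thread in line
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) threads.emplace_back(InstallFlushResult, i);
  for (auto& t : threads) t.join();
  for (int v : manifest) std::cout << v << ' ';   // prints 0 1 2 3
  std::cout << '\n';
}
```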
Yanqin Jin
e265e08a02 Avoid switching empty memtable in certain cases (#4792)
Summary:
In certain cases, we do not perform memtable switching if the active
memtable of the column family is empty. Two exceptions:
1. In manual flush, if cached_recoverable_state_empty_ is false, then we need
   to switch memtable due to requirement of transaction.
2. In switch WAL, we need to switch memtable anyway because we have to seal the
   memtable if the WAL on which it depends will be closed.

This change can potentially delay the occurrence of write stalls because the number
of memtables increases more slowly.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4792

Differential Revision: D13499501

Pulled By: riversand963

fbshipit-source-id: 91c9b17ae753578578039f3851667d93610005e1
2019-01-07 10:06:49 -08:00
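The rule above can be restated as a small predicate. The sketch below is a paraphrase with illustrative flag names, not the actual DBImpl logic.

```
// Paraphrase of the rule: skip switching an empty memtable unless one of the
// two listed exceptions forces a switch anyway. Flag names are illustrative.
struct SwitchContext {
  bool active_memtable_empty;
  bool manual_flush;
  bool cached_recoverable_state_empty;  // stand-in for the DBImpl member
  bool switching_wal;
};

inline bool ShouldSwitchMemtable(const SwitchContext& ctx) {
  if (!ctx.active_memtable_empty) return true;   // non-empty: always switch
  if (ctx.manual_flush && !ctx.cached_recoverable_state_empty) {
    return true;  // exception 1: transactions require the switch
  }
  if (ctx.switching_wal) {
    return true;  // exception 2: memtable must be sealed with its WAL
  }
  return false;   // empty memtable and no exception: skip the switch
}
```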
Yanqin Jin
663d24f467 Improve flushing multiple column families (#4708)
Summary:
If one column family is dropped, we should simply skip it and continue to flush
other active ones.
Currently we use Status::ShutdownInProgress to notify caller of column families
being dropped. In the future, we should consider using a different Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4708

Differential Revision: D13378954

Pulled By: riversand963

fbshipit-source-id: 42f248cdf2d32d4c0f677cd39012694b8f1328ca
2019-01-07 10:02:59 -08:00
Yanqin Jin
ec43385bf3 Enable checkpoint of read-only db (#4681)
Summary:
1. DBImplReadOnly::GetLiveFiles should not return NotSupported. Instead, it
   should call DBImpl::GetLiveFiles(flush_memtable=false).
2. In DBImpl::Recover, we should also recover the OPTIONS file name and/or
   number so that an immediate subsequent GetLiveFiles will get the correct
   OPTIONS name.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4681

Differential Revision: D13069205

Pulled By: riversand963

fbshipit-source-id: 3e6a0174307d06db5a01feb099b306cea1f7f88a
2019-01-07 09:54:42 -08:00
Yi Wu
8a643b70fd Detect if Jemalloc is linked with the binary (#4844)
Summary:
Declare Jemalloc non-standard APIs as weak symbols, so that if Jemalloc is linked with the binary, these symbols will be replaced by Jemalloc's; otherwise they will be nullptr. This is similar to how folly detects jemalloc, but we assume the main program uses jemalloc as long as jemalloc is linked: https://github.com/facebook/folly/blob/master/folly/memory/Malloc.h#L147
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4844

Differential Revision: D13574934

Pulled By: yiwu-arbug

fbshipit-source-id: 7ea871beb1be7d5a1259cc38f9b78078793db2db
2019-01-04 11:08:52 -08:00
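A minimal sketch of the weak-symbol technique described above, using jemalloc's non-standard mallocx API; this illustrates the idea and is not RocksDB's actual header.

```
// Sketch: detect jemalloc at runtime via a weak declaration of one of its
// non-standard APIs. If jemalloc is not linked, the symbol's address is null.
#include <cstddef>
#include <cstdio>

extern "C" void* mallocx(std::size_t size, int flags) __attribute__((__weak__));

static bool HasJemalloc() { return mallocx != nullptr; }

int main() {
  std::printf("jemalloc linked: %s\n", HasJemalloc() ? "yes" : "no");
  return 0;
}
```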
Abhishek Madan
de0891ec01 Fix unused member compile error
Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/4793

Differential Revision: D13509363

Pulled By: abhimadan

fbshipit-source-id: 530b4765e3335d6ecd016bfaa89645f8aa98c61f
2018-12-18 15:23:20 -08:00
Abhishek Madan
33564d2c10 Remove v1 RangeDelAggregator (#4778)
Summary:
Now that v2 is fully functional, the v1 aggregator is removed.
The v2 aggregator has been renamed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4778

Differential Revision: D13495930

Pulled By: abhimadan

fbshipit-source-id: 9d69500a60a283e79b6c4fa938fc68a8aa4d40d6
2018-12-18 15:23:20 -08:00
Abhishek Madan
96de211f4c Add compaction logic to RangeDelAggregatorV2 (#4758)
Summary:
RangeDelAggregatorV2 now supports ShouldDelete calls on
snapshot stripes and creation of range tombstone compaction iterators.
RangeDelAggregator is no longer used on any non-test code path, and will
be removed in a future commit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4758

Differential Revision: D13439254

Pulled By: abhimadan

fbshipit-source-id: fe105bcf8e3d4a2df37a622d5510843cd71b0401
2018-12-17 15:33:11 -08:00
Abhishek Madan
8522d9c74d Prepare FragmentedRangeTombstoneIterator for use in compaction (#4740)
Summary:
To support the flush/compaction use cases of RangeDelAggregator
in v2, FragmentedRangeTombstoneIterator now supports dropping tombstones
that cannot be read in the compaction output file. Furthermore,
FragmentedRangeTombstoneIterator supports the "snapshot striping" use
case by allowing an iterator to be split by a list of snapshots.
RangeDelAggregatorV2 will use these changes in a follow-up change.

In the process of making these changes, other miscellaneous cleanups
were also done in these files.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4740

Differential Revision: D13287382

Pulled By: abhimadan

fbshipit-source-id: f5aeb03e1b3058049b80c02a558ee48f723fa48c
2018-12-17 15:33:11 -08:00
103 changed files with 3597 additions and 3411 deletions


@ -71,7 +71,10 @@ install:
CC=gcc-8 && CXX=g++-8;
fi
- if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then
mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH;
mkdir cmake-dist && curl --silent --fail --show-error --location https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH;
fi
- if [[ "${JOB_NAME}" == java_test ]]; then
java -version && echo "JAVA_HOME=${JAVA_HOME}";
fi
before_script:


@ -504,7 +504,6 @@ set(SOURCES
db/merge_helper.cc
db/merge_operator.cc
db/range_del_aggregator.cc
db/range_del_aggregator_v2.cc
db/range_tombstone_fragmenter.cc
db/repair.cc
db/snapshot_impl.cc
@ -907,7 +906,6 @@ if(WITH_TESTS)
db/plain_table_db_test.cc
db/prefix_test.cc
db/range_del_aggregator_test.cc
db/range_del_aggregator_v2_test.cc
db/range_tombstone_fragmenter_test.cc
db/repair_test.cc
db/table_properties_collector_test.cc


@ -1,11 +1,17 @@
# Rocksdb Change Log
## Unreleased
### New Features
## 5.18.4 (3/3/2020)
* Various fixes for ARM64 support (#6250)
* Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms.
## 5.18.3 (2/11/2019)
### Bug Fixes
* Fix possible LSM corruption when both range deletions and subcompactions are used. The symptom of this corruption is L1+ files overlapping in the user key space.
## 5.18.2 (01/31/2019)
### Public API Change
### Bug Fixes
* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls.
* Change time resolution in FileOperationInfo.
* Deleting Blob files also goes through SstFileManager.
## 5.18.0 (11/30/2018)
### New Features
@ -18,6 +24,7 @@
* Add xxhash64 checksum support
* Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table.
* Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental.
* Enabled checkpoint on readonly db (DBImplReadOnly).
### Public API Change
* `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs.
@ -34,6 +41,8 @@
* Fixed Get correctness bug in the presence of range tombstones where merge operands covered by a range tombstone always result in NotFound.
* Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously.
* The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files.
* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls.
* Make DB ignore dropped column families while committing results of atomic flush.
## 5.17.0 (10/05/2018)
### Public API Change


@ -43,6 +43,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
command line flags processing. You can compile rocksdb library even
if you don't have gflags installed.
* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
## Supported platforms
* **Linux - Ubuntu**


@ -137,6 +137,12 @@ CFLAGS += -DHAVE_POWER8
HAVE_POWER8=1
endif
ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc -xc /dev/null 2>&1))
CXXFLAGS += -march=armv8-a+crc
CFLAGS += -march=armv8-a+crc
ARMCRC_SOURCE=1
endif
# if we're compiling for release, compile without debug code (-DNDEBUG)
ifeq ($(DEBUG_LEVEL),0)
OPT += -DNDEBUG
@ -543,7 +549,6 @@ TESTS = \
persistent_cache_test \
statistics_test \
lua_test \
range_del_aggregator_test \
lru_cache_test \
object_registry_test \
repair_test \
@ -554,7 +559,7 @@ TESTS = \
trace_analyzer_test \
repeatable_thread_test \
range_tombstone_fragmenter_test \
range_del_aggregator_v2_test \
range_del_aggregator_test \
sst_file_reader_test \
PARALLEL_TEST = \
@ -1588,9 +1593,6 @@ repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNES
range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
range_del_aggregator_v2_test: db/range_del_aggregator_v2_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
@ -1639,7 +1641,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
ifeq ($(PLATFORM), OS_SOLARIS)
ARCH := $(shell isainfo -b)
else ifeq ($(PLATFORM), OS_OPENBSD)
ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64))
ifneq (,$(filter amd64 ppc64 ppc64le arm64 aarch64 sparc64, $(MACHINE)))
ARCH := 64
else
ARCH := 32
@ -1648,10 +1650,10 @@ else
ARCH := $(shell getconf LONG_BIT)
endif
ifeq (,$(findstring ppc,$(MACHINE)))
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so
ifneq (,$(filter ppc% arm64 aarch64 sparc64, $(MACHINE)))
ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so
else
ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so
endif
ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar
ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
@ -1664,15 +1666,15 @@ ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1
ZLIB_DOWNLOAD_BASE ?= http://zlib.net
BZIP2_VER ?= 1.0.6
BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd
BZIP2_DOWNLOAD_BASE ?= https://web.archive.org/web/20180624184835/http://www.bzip.org
SNAPPY_VER ?= 1.1.4
SNAPPY_SHA256 ?= 134bfe122fd25599bb807bb8130e7ba6d9bdb851e0b16efcb83ac4f5d0b70057
SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/releases/download
LZ4_VER ?= 1.8.0
LZ4_SHA256 ?= 2ca482ea7a9bb103603108b5a7510b7592b90158c151ff50a28f1ca8389fccf6
BZIP2_DOWNLOAD_BASE ?= https://downloads.sourceforge.net/project/bzip2
SNAPPY_VER ?= 1.1.8
SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f
SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive
LZ4_VER ?= 1.9.2
LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc
LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive
ZSTD_VER ?= 1.3.3
ZSTD_SHA256 ?= a77c47153ee7de02626c5b2a097005786b71688be61e9fb81806a011f90b297b
ZSTD_VER ?= 1.4.4
ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8
ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive
CURL_SSL_OPTS ?= --tlsv1
@ -1711,7 +1713,9 @@ endif
libz.a:
-rm -rf zlib-$(ZLIB_VER)
curl -O -L ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz
ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz))
curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz
endif
ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \
if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \
echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \
@ -1723,7 +1727,9 @@ libz.a:
libbz2.a:
-rm -rf bzip2-$(BZIP2_VER)
curl -O -L ${BZIP2_DOWNLOAD_BASE}/$(BZIP2_VER)/bzip2-$(BZIP2_VER).tar.gz
ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz))
curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz
endif
BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \
if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \
echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \
@ -1735,21 +1741,24 @@ libbz2.a:
libsnappy.a:
-rm -rf snappy-$(SNAPPY_VER)
curl -O -L ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER)/snappy-$(SNAPPY_VER).tar.gz
ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz))
curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz
endif
SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \
if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \
echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \
exit 1; \
fi
tar xvzf snappy-$(SNAPPY_VER).tar.gz
cd snappy-$(SNAPPY_VER) && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --with-pic --enable-static --disable-shared
cd snappy-$(SNAPPY_VER) && $(MAKE) ${SNAPPY_MAKE_TARGET}
cp snappy-$(SNAPPY_VER)/.libs/libsnappy.a .
mkdir snappy-$(SNAPPY_VER)/build
cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
cp snappy-$(SNAPPY_VER)/build/libsnappy.a .
liblz4.a:
-rm -rf lz4-$(LZ4_VER)
curl -O -L ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz
mv v$(LZ4_VER).tar.gz lz4-$(LZ4_VER).tar.gz
ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz))
curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz
endif
LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \
if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \
echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \
@ -1761,8 +1770,9 @@ liblz4.a:
libzstd.a:
-rm -rf zstd-$(ZSTD_VER)
curl -O -L ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz
mv v$(ZSTD_VER).tar.gz zstd-$(ZSTD_VER).tar.gz
ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz))
curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz
endif
ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \
if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \
echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \
@ -1853,6 +1863,14 @@ rocksdbjavastaticdockerppc64le:
fi
docker start -a rocksdb_linux_ppc64le-be
rocksdbjavastaticdockerarm64v8:
mkdir -p java/target
DOCKER_LINUX_ARM64V8_CONTAINER=`docker ps -aqf name=rocksdb_linux_arm64v8-be`; \
if [ -z "$$DOCKER_LINUX_ARM64V8_CONTAINER" ]; then \
docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_arm64v8-be evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \
fi
docker start -a rocksdb_linux_arm64v8-be
rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral
rocksdbjavastaticpublishdocker: rocksdbjavastaticreleasedocker rocksdbjavastaticpublishcentral

TARGETS

@ -67,13 +67,11 @@ is_opt_mode = build_mode.startswith("opt")
if is_opt_mode:
rocksdb_compiler_flags.append("-DNDEBUG")
default_allocator = read_config("fbcode", "default_allocator")
sanitizer = read_config("fbcode", "sanitizer")
# Let RocksDB aware of jemalloc existence.
# Do not enable it if sanitizer presents.
if is_opt_mode and default_allocator.startswith("jemalloc") and sanitizer == "":
# Do not enable jemalloc if sanitizer presents. RocksDB will further detect
# whether the binary is linked with jemalloc at runtime.
if sanitizer == "":
rocksdb_compiler_flags.append("-DROCKSDB_JEMALLOC")
rocksdb_external_deps.append(("jemalloc", None, "headers"))
@ -124,7 +122,6 @@ cpp_library(
"db/merge_helper.cc",
"db/merge_operator.cc",
"db/range_del_aggregator.cc",
"db/range_del_aggregator_v2.cc",
"db/range_tombstone_fragmenter.cc",
"db/repair.cc",
"db/snapshot_impl.cc",
@ -935,11 +932,6 @@ ROCKS_TESTS = [
"db/range_del_aggregator_test.cc",
"serial",
],
[
"range_del_aggregator_v2_test",
"db/range_del_aggregator_v2_test.cc",
"serial",
],
[
"range_tombstone_fragmenter_test",
"db/range_tombstone_fragmenter_test.cc",


@ -1,15 +1,75 @@
version: 1.0.{build}
image: Visual Studio 2017
environment:
JAVA_HOME: C:\Program Files\Java\jdk1.8.0
THIRDPARTY_HOME: $(APPVEYOR_BUILD_FOLDER)\thirdparty
SNAPPY_HOME: $(THIRDPARTY_HOME)\snappy-1.1.7
SNAPPY_INCLUDE: $(SNAPPY_HOME);$(SNAPPY_HOME)\build
SNAPPY_LIB_DEBUG: $(SNAPPY_HOME)\build\Debug\snappy.lib
SNAPPY_LIB_RELEASE: $(SNAPPY_HOME)\build\Release\snappy.lib
LZ4_HOME: $(THIRDPARTY_HOME)\lz4-1.8.3
LZ4_INCLUDE: $(LZ4_HOME)\lib
LZ4_LIB_DEBUG: $(LZ4_HOME)\visual\VS2010\bin\x64_Debug\liblz4_static.lib
LZ4_LIB_RELEASE: $(LZ4_HOME)\visual\VS2010\bin\x64_Release\liblz4_static.lib
ZSTD_HOME: $(THIRDPARTY_HOME)\zstd-1.4.0
ZSTD_INCLUDE: $(ZSTD_HOME)\lib;$(ZSTD_HOME)\lib\dictBuilder
ZSTD_LIB_DEBUG: $(ZSTD_HOME)\build\VS2010\bin\x64_Debug\libzstd_static.lib
ZSTD_LIB_RELEASE: $(ZSTD_HOME)\build\VS2010\bin\x64_Release\libzstd_static.lib
matrix:
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
CMAKE_GENERATOR: Visual Studio 14 Win64
DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio 14.0\Common7\IDE\devenv.com
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017
CMAKE_GENERATOR: Visual Studio 15 Win64
DEV_ENV: C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com
install:
- md %THIRDPARTY_HOME%
- echo "Building Snappy dependency..."
- cd %THIRDPARTY_HOME%
- curl --fail --silent --show-error --output snappy-1.1.7.zip --location https://github.com/google/snappy/archive/1.1.7.zip
- unzip snappy-1.1.7.zip
- cd snappy-1.1.7
- mkdir build
- cd build
- cmake -G "%CMAKE_GENERATOR%" ..
- msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64
- msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64
- echo "Building LZ4 dependency..."
- cd %THIRDPARTY_HOME%
- curl --fail --silent --show-error --output lz4-1.8.3.zip --location https://github.com/lz4/lz4/archive/v1.8.3.zip
- unzip lz4-1.8.3.zip
- cd lz4-1.8.3\visual\VS2010
- ps: $CMD="$Env:DEV_ENV"; & $CMD lz4.sln /upgrade
- msbuild lz4.sln /p:Configuration=Debug /p:Platform=x64
- msbuild lz4.sln /p:Configuration=Release /p:Platform=x64
- echo "Building ZStd dependency..."
- cd %THIRDPARTY_HOME%
- curl --fail --silent --show-error --output zstd-1.4.0.zip --location https://github.com/facebook/zstd/archive/v1.4.0.zip
- unzip zstd-1.4.0.zip
- cd zstd-1.4.0\build\VS2010
- ps: $CMD="$Env:DEV_ENV"; & $CMD zstd.sln /upgrade
- msbuild zstd.sln /p:Configuration=Debug /p:Platform=x64
- msbuild zstd.sln /p:Configuration=Release /p:Platform=x64
before_build:
- md %APPVEYOR_BUILD_FOLDER%\build
- cd %APPVEYOR_BUILD_FOLDER%\build
- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 ..
- cd ..
- md %APPVEYOR_BUILD_FOLDER%\build
- cd %APPVEYOR_BUILD_FOLDER%\build
- cmake -G "%CMAKE_GENERATOR%" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 ..
- cd ..
build:
project: build\rocksdb.sln
parallel: true
verbosity: normal
test:
test_script:
- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8
test:
test_script:
- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8
on_failure:
- cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip


@ -71,13 +71,11 @@ is_opt_mode = build_mode.startswith("opt")
if is_opt_mode:
rocksdb_compiler_flags.append("-DNDEBUG")
default_allocator = read_config("fbcode", "default_allocator")
sanitizer = read_config("fbcode", "sanitizer")
# Let RocksDB aware of jemalloc existence.
# Do not enable it if sanitizer presents.
if is_opt_mode and default_allocator.startswith("jemalloc") and sanitizer == "":
# Do not enable jemalloc if sanitizer presents. RocksDB will further detect
# whether the binary is linked with jemalloc at runtime.
if sanitizer == "":
rocksdb_compiler_flags.append("-DROCKSDB_JEMALLOC")
rocksdb_external_deps.append(("jemalloc", None, "headers"))
"""


@ -53,11 +53,13 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
FBCODE_BUILD="true"
# If we're compiling with TSAN we need pic build
PIC_BUILD=$COMPILE_WITH_TSAN
if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
source "$PWD/build_tools/fbcode_config.sh"
else
if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
# we need this to build with MySQL. Don't use for other purposes.
source "$PWD/build_tools/fbcode_config4.8.1.sh"
elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then
source "$PWD/build_tools/fbcode_config.sh"
else
source "$PWD/build_tools/fbcode_config_platform007.sh"
fi
fi
@ -525,6 +527,8 @@ if test -z "$PORTABLE"; then
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then
# TODO: Handle this with approprite options.
COMMON_FLAGS="$COMMON_FLAGS"
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then
COMMON_FLAGS="$COMMON_FLAGS"
elif [ "$TARGET_OS" == "IOS" ]; then
COMMON_FLAGS="$COMMON_FLAGS"
elif [ "$TARGET_OS" != "AIX" ] && [ "$TARGET_OS" != "SunOS" ]; then


@ -0,0 +1,18 @@
GCC_BASE=/mnt/gvfs/third-party2/gcc/6e8e715624fd15256a7970073387793dfcf79b46/7.x/centos7-native/b2ef2b6
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/ef37e1faa1c29782abfac1ae65a291b9b7966f6d/stable/centos7-native/c9f9104
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c67031f0f739ac61575a061518d6ef5038f99f90/7.x/platform007/5620abc
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/60d6f124a78798b73944f5ba87c2306ae3460153/2.26/platform007/f259413
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/platform007/ca4da3d
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d
LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/platform007/ca4da3d
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/3ee276cbacfad3074e3f07bf826ac47f06970f4e/1.3.5/platform007/15a3614
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/9c910d36d6235cc40e8ff559358f1833452300ca/master/platform007/5b0f53e
NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/platform007/ca4da3d
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/platform007/6f3e0a9
TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/platform007/ca4da3d
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/fb/platform007/da39a3e
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/92ff90349e2f43ea0a8246d8b1cf17b6869013e3/2.29.1/centos7-native/da39a3e
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/platform007/ca4da3d
LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832


@ -0,0 +1,157 @@
#!/bin/sh
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
BASEDIR=`dirname $BASH_SOURCE`
source "$BASEDIR/dependencies_platform007.sh"
CFLAGS=""
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/7.3.0"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
# glibc
GLIBC_INCLUDE="$GLIBC_BASE/include"
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
# snappy
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
if test -z $PIC_BUILD; then
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"
else
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a"
fi
CFLAGS+=" -DSNAPPY"
if test -z $PIC_BUILD; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
CFLAGS+=" -DZLIB"
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a"
CFLAGS+=" -DBZIP2"
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
CFLAGS+=" -DLZ4"
fi
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
if test -z $PIC_BUILD; then
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
else
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a"
fi
CFLAGS+=" -DZSTD"
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
if test -z $PIC_BUILD; then
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"
else
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
fi
CFLAGS+=" -DGFLAGS=gflags"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a"
if test -z $PIC_BUILD; then
# location of numa
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
NUMA_LIB=" $NUMA_BASE/lib/libnuma.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a"
fi
# location of TBB
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
if test -z $PIC_BUILD; then
TBB_LIBS="$TBB_BASE/lib/libtbb.a"
else
TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a"
fi
CFLAGS+=" -DTBB"
# use Intel SSE support for checksum calculations
export USE_SSE=1
export PORTABLE=1
BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BIN="$CLANG_BASE/bin"
CLANG_LIB="$CLANG_BASE/lib"
CLANG_SRC="$CLANG_BASE/../../src"
CLANG_ANALYZER="$CLANG_BIN/clang++"
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
CFLAGS+=" -B$BINUTILS/gold"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
JEMALLOC=1
else
# clang
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
CC="$CLANG_BIN/clang"
CXX="$CLANG_BIN/clang++"
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" -Wno-expansion-to-defined "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
EXEC_LDFLAGS+=" -B$BINUTILS/gold"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform007/lib/ld.so"
EXEC_LDFLAGS+=" $LIBUNWIND"
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform007/lib"
# required by libtbb
EXEC_LDFLAGS+=" -ldl"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS"
VALGRIND_VER="$VALGRIND_BASE/bin/"
# lua not supported because it's on track for deprecation, I think
LUA_PATH=
LUA_LIB=
export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB


@ -377,7 +377,7 @@ function send_to_ods {
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
--connect-timeout 60
}


@ -85,7 +85,9 @@ NON_SHM="TMPD=/tmp/rocksdb_test_tmp"
GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1"
ASAN="COMPILE_WITH_ASAN=1"
CLANG="USE_CLANG=1"
TSAN="COMPILE_WITH_TSAN=1"
# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090.
# using platform007 gives us gcc-8 or higher which has that bug fixed.
TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1"
UBSAN="COMPILE_WITH_UBSAN=1"
TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"'
NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd"
@ -644,7 +646,7 @@ run_regression()
# parameters: $1 -- key, $2 -- value
function send_size_to_ods {
curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=rocksdb.build_size.$1&value=$2" \
curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=rocksdb.build_size.$1&value=$2" \
--connect-timeout 60
}


@ -53,6 +53,45 @@ function get_lib_base()
log_variable $__res_var
}
###########################################################
# platform007 dependencies #
###########################################################
OUTPUT="$BASEDIR/dependencies_platform007.sh"
rm -f "$OUTPUT"
touch "$OUTPUT"
echo "Writing dependencies to $OUTPUT"
# Compilers locations
GCC_BASE=`readlink -f $TP2_LATEST/gcc/7.x/centos7-native/*/`
CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/`
log_variable GCC_BASE
log_variable CLANG_BASE
# Libraries locations
get_lib_base libgcc 7.x platform007
get_lib_base glibc 2.26 platform007
get_lib_base snappy LATEST platform007
get_lib_base zlib LATEST platform007
get_lib_base bzip2 LATEST platform007
get_lib_base lz4 LATEST platform007
get_lib_base zstd LATEST platform007
get_lib_base gflags LATEST platform007
get_lib_base jemalloc LATEST platform007
get_lib_base numa LATEST platform007
get_lib_base libunwind LATEST platform007
get_lib_base tbb LATEST platform007
get_lib_base kernel-headers fb platform007
get_lib_base binutils LATEST centos7-native
get_lib_base valgrind LATEST platform007
get_lib_base lua 5.3.4 platform007
git diff $OUTPUT
###########################################################
# 5.x dependencies #
###########################################################


@ -18,6 +18,7 @@
#include "db/event_helpers.h"
#include "db/internal_stats.h"
#include "db/merge_helper.h"
#include "db/range_del_aggregator.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "monitoring/iostats_context_imp.h"
@ -65,8 +66,9 @@ Status BuildTable(
const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions,
const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options,
TableCache* table_cache, InternalIterator* iter,
std::unique_ptr<InternalIterator> range_del_iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
uint32_t column_family_id, const std::string& column_family_name,
@ -86,12 +88,10 @@ Status BuildTable(
Status s;
meta->fd.file_size = 0;
iter->SeekToFirst();
std::unique_ptr<RangeDelAggregator> range_del_agg(
new RangeDelAggregator(internal_comparator, snapshots));
s = range_del_agg->AddTombstones(std::move(range_del_iter));
if (!s.ok()) {
// may be non-ok if a range tombstone key is unparsable
return s;
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
new CompactionRangeDelAggregator(&internal_comparator, snapshots));
for (auto& range_del_iter : range_del_iters) {
range_del_agg->AddTombstones(std::move(range_del_iter));
}
std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
@ -158,8 +158,10 @@ Status BuildTable(
}
}
for (auto it = range_del_agg->NewIterator(); it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
auto range_del_it = range_del_agg->NewIterator();
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
auto kv = tombstone.Serialize();
builder->Add(kv.first.Encode(), kv.second);
meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),


@ -9,6 +9,7 @@
#include <string>
#include <utility>
#include <vector>
#include "db/range_tombstone_fragmenter.h"
#include "db/table_properties_collector.h"
#include "options/cf_options.h"
#include "rocksdb/comparator.h"
@ -65,8 +66,9 @@ extern Status BuildTable(
const std::string& dbname, Env* env, const ImmutableCFOptions& options,
const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options,
TableCache* table_cache, InternalIterator* iter,
std::unique_ptr<InternalIterator> range_del_iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
uint32_t column_family_id, const std::string& column_family_name,


@ -25,7 +25,7 @@
#include "db/db_impl.h"
#include "db/internal_stats.h"
#include "db/job_context.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "db/table_properties_collector.h"
#include "db/version_set.h"
#include "db/write_controller.h"
@ -945,7 +945,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
auto read_seq = super_version->current->version_set()->LastSequence();
RangeDelAggregatorV2 range_del_agg(&internal_comparator_, read_seq);
ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
auto* active_range_del_iter =
super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq);
range_del_agg.AddTombstones(


@ -67,10 +67,11 @@ class CompactedDBImpl : public DBImpl {
virtual Status EnableFileDeletions(bool /*force*/) override {
return Status::NotSupported("Not supported in compacted db mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* /*manifest_file_size*/,
bool /*flush_memtable*/ = true) override {
return Status::NotSupported("Not supported in compacted db mode.");
virtual Status GetLiveFiles(std::vector<std::string>& ret,
uint64_t* manifest_file_size,
bool /*flush_memtable*/) override {
return DBImpl::GetLiveFiles(ret, manifest_file_size,
false /* flush_memtable */);
}
using DBImpl::Flush;
virtual Status Flush(const FlushOptions& /*options*/,


@ -18,7 +18,7 @@ CompactionIterator::CompactionIterator(
SequenceNumber earliest_write_conflict_snapshot,
const SnapshotChecker* snapshot_checker, Env* env,
bool report_detailed_time, bool expect_valid_internal_key,
RangeDelAggregator* range_del_agg, const Compaction* compaction,
CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,
const SequenceNumber preserve_deletes_seqnum)
@ -36,7 +36,7 @@ CompactionIterator::CompactionIterator(
SequenceNumber earliest_write_conflict_snapshot,
const SnapshotChecker* snapshot_checker, Env* env,
bool report_detailed_time, bool expect_valid_internal_key,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
std::unique_ptr<CompactionProxy> compaction,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,


@ -64,7 +64,7 @@ class CompactionIterator {
SequenceNumber earliest_write_conflict_snapshot,
const SnapshotChecker* snapshot_checker, Env* env,
bool report_detailed_time, bool expect_valid_internal_key,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
const Compaction* compaction = nullptr,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
@ -77,7 +77,7 @@ class CompactionIterator {
SequenceNumber earliest_write_conflict_snapshot,
const SnapshotChecker* snapshot_checker, Env* env,
bool report_detailed_time, bool expect_valid_internal_key,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
std::unique_ptr<CompactionProxy> compaction,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
@ -141,7 +141,7 @@ class CompactionIterator {
Env* env_;
bool report_detailed_time_;
bool expect_valid_internal_key_;
RangeDelAggregator* range_del_agg_;
CompactionRangeDelAggregator* range_del_agg_;
std::unique_ptr<CompactionProxy> compaction_;
const CompactionFilter* compaction_filter_;
const std::atomic<bool>* shutting_down_;


@ -221,10 +221,15 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
bool bottommost_level = false,
SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) {
std::unique_ptr<InternalIterator> range_del_iter(
std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
new test::VectorIterator(range_del_ks, range_del_vs));
range_del_agg_.reset(new RangeDelAggregator(icmp_, snapshots_));
ASSERT_OK(range_del_agg_->AddTombstones(std::move(range_del_iter)));
auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
std::move(unfragmented_range_del_iter), icmp_);
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
kMaxSequenceNumber));
range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
range_del_agg_->AddTombstones(std::move(range_del_iter));
std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
if (filter || bottommost_level) {
@ -292,7 +297,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
std::unique_ptr<MergeHelper> merge_helper_;
std::unique_ptr<LoggingForwardVectorIterator> iter_;
std::unique_ptr<CompactionIterator> c_iter_;
std::unique_ptr<RangeDelAggregator> range_del_agg_;
std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
std::unique_ptr<SnapshotChecker> snapshot_checker_;
std::atomic<bool> shutting_down_{false};
FakeCompaction* compaction_proxy_;


@ -36,7 +36,7 @@
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/merge_helper.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "db/version_set.h"
#include "monitoring/iostats_context_imp.h"
#include "monitoring/perf_context_imp.h"
@ -805,15 +805,13 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
assert(sub_compact != nullptr);
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
RangeDelAggregatorV2 range_del_agg_v2(&cfd->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
auto* range_del_agg =
range_del_agg_v2.DelegateToRangeDelAggregator(existing_snapshots_);
CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(),
existing_snapshots_);
// Although the v2 aggregator is what the level iterator(s) know about,
// the AddTombstones calls will be propagated down to the v1 aggregator.
std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator(
sub_compact->compaction, &range_del_agg_v2, env_optiosn_for_read_));
sub_compact->compaction, &range_del_agg, env_optiosn_for_read_));
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
@ -902,8 +900,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
&existing_snapshots_, earliest_write_conflict_snapshot_,
snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false,
range_del_agg, sub_compact->compaction, compaction_filter, shutting_down_,
preserve_deletes_seqnum_));
&range_del_agg, sub_compact->compaction, compaction_filter,
shutting_down_, preserve_deletes_seqnum_));
auto c_iter = sub_compact->c_iter.get();
c_iter->SeekToFirst();
if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) {
@ -1041,7 +1039,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
CompactionIterationStats range_del_out_stats;
status =
FinishCompactionOutputFile(input_status, sub_compact, range_del_agg,
FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg,
&range_del_out_stats, next_key);
RecordDroppedKeys(range_del_out_stats,
&sub_compact->compaction_job_stats);
@ -1092,8 +1090,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
if (status.ok() && sub_compact->builder == nullptr &&
sub_compact->outputs.size() == 0 &&
!range_del_agg->IsEmpty()) {
sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) {
// handle subcompaction containing only range deletions
status = OpenCompactionOutputFile(sub_compact);
}
@ -1102,7 +1099,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
// close the output file.
if (sub_compact->builder != nullptr) {
CompactionIterationStats range_del_out_stats;
Status s = FinishCompactionOutputFile(status, sub_compact, range_del_agg,
Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg,
&range_del_out_stats);
if (status.ok()) {
status = s;
@ -1168,7 +1165,7 @@ void CompactionJob::RecordDroppedKeys(
Status CompactionJob::FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
CompactionIterationStats* range_del_out_stats,
const Slice* next_table_min_key /* = nullptr */) {
AutoThreadOperationStageUpdater stage_updater(
@ -1207,10 +1204,19 @@ Status CompactionJob::FinishCompactionOutputFile(
lower_bound = nullptr;
}
if (next_table_min_key != nullptr) {
// This isn't the last file in the subcompaction, so extend until the next
// file starts.
// This may be the last file in the subcompaction in some cases, so we
// need to compare the end key of the subcompaction with the start key of
// the next file. When the end key is chosen by the subcompaction, we know
// that it must be the biggest key in the output file. Therefore, it is
// safe to use the smaller of the two keys as the upper bound of the output
// file, to ensure that there is no overlap between different output files.
upper_bound_guard = ExtractUserKey(*next_table_min_key);
upper_bound = &upper_bound_guard;
if (sub_compact->end != nullptr &&
ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) {
upper_bound = sub_compact->end;
} else {
upper_bound = &upper_bound_guard;
}
} else {
// This is the last file in the subcompaction, so extend until the
// subcompaction ends.
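As a reading aid, here is a minimal standalone sketch of the bound selection shown above, using plain std::string keys and lexicographic comparison instead of the internal user comparator; the helper name and signature are illustrative only and not part of the patch.

#include <string>

// Returns the upper bound to use for the current output file: the smaller of
// the next file's start key and the subcompaction end key, so that output
// files produced by different subcompactions never overlap. Either input may
// be null; a null result means "unbounded".
const std::string* PickOutputFileUpperBound(
    const std::string* next_table_min_user_key,
    const std::string* subcompaction_end,
    std::string* upper_bound_guard /* storage for a copied key */) {
  if (next_table_min_user_key != nullptr) {
    *upper_bound_guard = *next_table_min_user_key;
    if (subcompaction_end != nullptr &&
        *upper_bound_guard >= *subcompaction_end) {
      // The subcompaction end is the tighter (or equal) bound.
      return subcompaction_end;
    }
    return upper_bound_guard;
  }
  // Last file in the subcompaction: extend to the subcompaction end.
  return subcompaction_end;
}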
@ -1220,11 +1226,6 @@ Status CompactionJob::FinishCompactionOutputFile(
if (existing_snapshots_.size() > 0) {
earliest_snapshot = existing_snapshots_[0];
}
auto it = range_del_agg->NewIterator();
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
}
bool has_overlapping_endpoints;
if (upper_bound != nullptr && meta->largest.size() > 0) {
has_overlapping_endpoints =
@ -1232,6 +1233,24 @@ Status CompactionJob::FinishCompactionOutputFile(
} else {
has_overlapping_endpoints = false;
}
// The end key of the subcompaction must be greater than or equal to the
// upper bound. If the subcompaction end or the upper bound is null, this
// file is the last file in the compaction, so there will be no overlap
// between this file and others.
assert(sub_compact->end == nullptr ||
upper_bound == nullptr ||
ucmp->Compare(*upper_bound, *sub_compact->end) <= 0);
auto it = range_del_agg->NewIterator(lower_bound, upper_bound,
has_overlapping_endpoints);
// Position the range tombstone output iterator. There may be tombstone
// fragments that are entirely out of range, so make sure that we do not
// include those.
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
} else {
it->SeekToFirst();
}
for (; it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
if (upper_bound != nullptr) {
@ -1257,6 +1276,8 @@ Status CompactionJob::FinishCompactionOutputFile(
}
auto kv = tombstone.Serialize();
assert(lower_bound == nullptr ||
ucmp->Compare(*lower_bound, kv.second) < 0);
sub_compact->builder->Add(kv.first.Encode(), kv.second);
InternalKey smallest_candidate = std::move(kv.first);
if (lower_bound != nullptr &&

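For orientation, a minimal sketch of the windowing described by the comments above, with plain std::string keys and half-open [start, end) tombstones rather than the RocksDB iterator API; the struct and function names are hypothetical.

#include <string>
#include <vector>

struct RangeTombstone {
  std::string start_key;  // inclusive
  std::string end_key;    // exclusive
};

// Keep only the tombstone fragments that overlap the output file's window
// [lower_bound, upper_bound); either bound may be null, meaning unbounded.
std::vector<RangeTombstone> TombstonesForOutputFile(
    const std::vector<RangeTombstone>& sorted_fragments,
    const std::string* lower_bound, const std::string* upper_bound) {
  std::vector<RangeTombstone> kept;
  for (const auto& t : sorted_fragments) {
    if (lower_bound != nullptr && t.end_key <= *lower_bound) {
      continue;  // fragment ends before the window starts
    }
    if (upper_bound != nullptr && t.start_key >= *upper_bound) {
      break;  // fragment starts at or after the window ends; so do the rest
    }
    kept.push_back(t);
  }
  return kept;
}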
View File

@ -29,8 +29,8 @@
#include "db/version_edit.h"
#include "db/write_controller.h"
#include "db/write_thread.h"
#include "options/db_options.h"
#include "options/cf_options.h"
#include "options/db_options.h"
#include "port/port.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/compaction_job_stats.h"
@ -104,7 +104,7 @@ class CompactionJob {
Status FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
CompactionIterationStats* range_del_out_stats,
const Slice* next_table_min_key = nullptr);
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);

View File

@ -340,8 +340,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) {
Arena arena;
{
InternalKeyComparator icmp(options.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ScopedArenaIterator iter(dbfull()->NewInternalIterator(
&arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
iter->SeekToFirst();
@ -430,8 +430,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) {
count = 0;
{
InternalKeyComparator icmp(options.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ScopedArenaIterator iter(dbfull()->NewInternalIterator(
&arena, &range_del_agg, kMaxSequenceNumber, handles_[1]));
iter->SeekToFirst();
@ -648,8 +648,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
int total = 0;
Arena arena;
InternalKeyComparator icmp(options.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* snapshots */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* snapshots */);
ScopedArenaIterator iter(dbfull()->NewInternalIterator(
&arena, &range_del_agg, kMaxSequenceNumber));
iter->SeekToFirst();

View File

@ -407,6 +407,87 @@ TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
Destroy(options);
}
TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
bool atomic_flush = GetParam();
if (!atomic_flush) {
return;
}
Options options = CurrentOptions();
options.create_if_missing = true;
options.atomic_flush = atomic_flush;
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->EnableProcessing();
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
size_t num_cfs = handles_.size();
ASSERT_EQ(3, num_cfs);
WriteOptions wopts;
wopts.disableWAL = true;
std::vector<int> cf_ids;
for (size_t i = 0; i != num_cfs; ++i) {
int cf_id = static_cast<int>(i);
ASSERT_OK(Put(cf_id, "key", "value", wopts));
cf_ids.push_back(cf_id);
}
ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress());
Destroy(options);
}
TEST_P(DBAtomicFlushTest,
FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
bool atomic_flush = GetParam();
if (!atomic_flush) {
return;
}
Options options = CurrentOptions();
options.create_if_missing = true;
options.atomic_flush = atomic_flush;
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->LoadDependency(
{{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
"DBAtomicFlushTest::BeforeDropCF"},
{"DBAtomicFlushTest::AfterDropCF",
"DBImpl::BackgroundCallFlush:start"}});
SyncPoint::GetInstance()->EnableProcessing();
size_t num_cfs = handles_.size();
ASSERT_EQ(3, num_cfs);
WriteOptions wopts;
wopts.disableWAL = true;
for (size_t i = 0; i != num_cfs; ++i) {
int cf_id = static_cast<int>(i);
ASSERT_OK(Put(cf_id, "key", "value", wopts));
}
port::Thread user_thread([&]() {
TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
});
FlushOptions flush_opts;
flush_opts.wait = true;
ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
user_thread.join();
for (size_t i = 0; i != num_cfs; ++i) {
int cf_id = static_cast<int>(i);
ASSERT_EQ("value", Get(cf_id, "key"));
}
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
num_cfs = handles_.size();
ASSERT_EQ(2, num_cfs);
for (size_t i = 0; i != num_cfs; ++i) {
int cf_id = static_cast<int>(i);
ASSERT_EQ("value", Get(cf_id, "key"));
}
Destroy(options);
}
INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
testing::Bool());

View File

@ -45,7 +45,6 @@
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/merge_helper.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/table_cache.h"
#include "db/table_properties_collector.h"
@ -221,7 +220,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
preserve_deletes_(options.preserve_deletes),
closed_(false),
error_handler_(this, immutable_db_options_, &mutex_),
atomic_flush_commit_in_progress_(false) {
atomic_flush_install_cv_(&mutex_) {
// !batch_per_trx_ implies seq_per_batch_ because it is only unset for
// WriteUnprepared, which should use seq_per_batch_.
assert(batch_per_txn_ || seq_per_batch_);
@ -1033,7 +1032,7 @@ bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
}
InternalIterator* DBImpl::NewInternalIterator(
Arena* arena, RangeDelAggregatorV2* range_del_agg, SequenceNumber sequence,
Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd;
if (column_family == nullptr) {
@ -1150,10 +1149,12 @@ static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
}
} // namespace
InternalIterator* DBImpl::NewInternalIterator(
const ReadOptions& read_options, ColumnFamilyData* cfd,
SuperVersion* super_version, Arena* arena,
RangeDelAggregatorV2* range_del_agg, SequenceNumber sequence) {
InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
ColumnFamilyData* cfd,
SuperVersion* super_version,
Arena* arena,
RangeDelAggregator* range_del_agg,
SequenceNumber sequence) {
InternalIterator* internal_iter;
assert(arena != nullptr);
assert(range_del_agg != nullptr);

View File

@ -31,7 +31,7 @@
#include "db/log_writer.h"
#include "db/logs_with_prep_tracker.h"
#include "db/pre_release_callback.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "db/read_callback.h"
#include "db/snapshot_checker.h"
#include "db/snapshot_impl.h"
@ -374,8 +374,8 @@ class DBImpl : public DB {
// The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed.
InternalIterator* NewInternalIterator(
Arena* arena, RangeDelAggregatorV2* range_del_agg,
SequenceNumber sequence, ColumnFamilyHandle* column_family = nullptr);
Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence,
ColumnFamilyHandle* column_family = nullptr);
LogsWithPrepTracker* logs_with_prep_tracker() {
return &logs_with_prep_tracker_;
@ -578,12 +578,9 @@ class DBImpl : public DB {
const WriteController& write_controller() { return write_controller_; }
InternalIterator* NewInternalIterator(const ReadOptions&,
ColumnFamilyData* cfd,
SuperVersion* super_version,
Arena* arena,
RangeDelAggregatorV2* range_del_agg,
SequenceNumber sequence);
InternalIterator* NewInternalIterator(
const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version,
Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence);
// hollow transactions shell used for recovery.
// these will then be passed to TransactionDB so that
@ -1613,15 +1610,16 @@ class DBImpl : public DB {
ErrorHandler error_handler_;
// True if the DB is committing atomic flush.
// TODO (yanqin) the current impl assumes that the entire DB belongs to
// a single atomic flush group. In the future we need to add a new class
// (struct) similar to the following to make it more general.
// struct AtomicFlushGroup {
// bool commit_in_progress_;
// std::vector<MemTableList*> imm_lists;
// };
bool atomic_flush_commit_in_progress_;
// Condition variable to coordinate installation of atomic flush results.
// With atomic flush, each bg thread installs the result of flushing multiple
// column families, and different threads can flush different column
// families. It's difficult to rely on one thread to perform batch
// installation for all threads. This is different from the non-atomic flush
// case.
// atomic_flush_install_cv_ makes sure that threads install atomic flush
// results sequentially. Flush results of memtables with lower IDs get
// installed to MANIFEST first.
InstrumentedCondVar atomic_flush_install_cv_;
};
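To make the coordination above concrete, here is a self-contained sketch using standard-library primitives instead of InstrumentedCondVar, with hypothetical names: each background thread waits until every flush result with a smaller memtable ID has been installed, installs its own, and then wakes the others.

#include <condition_variable>
#include <cstdint>
#include <mutex>

struct InstallCoordinator {
  std::mutex mu;
  std::condition_variable cv;
  uint64_t next_id_to_install = 0;  // smallest memtable ID not yet installed

  // Each flusher owns a contiguous range [first_id, last_id] of memtable IDs.
  void InstallInOrder(uint64_t first_id, uint64_t last_id) {
    std::unique_lock<std::mutex> lock(mu);
    // Wait until every result with a smaller memtable ID has been installed.
    cv.wait(lock, [&] { return next_id_to_install == first_id; });
    // ... record the flush result (e.g. write it to the MANIFEST) here ...
    next_id_to_install = last_id + 1;
    cv.notify_all();  // wake the thread holding the next range
  }
};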
extern Options SanitizeOptions(const std::string& db,

View File

@ -219,20 +219,24 @@ Status DBImpl::FlushMemTablesToOutputFiles(
return AtomicFlushMemTablesToOutputFiles(bg_flush_args, made_progress,
job_context, log_buffer);
}
Status s;
Status status;
for (auto& arg : bg_flush_args) {
ColumnFamilyData* cfd = arg.cfd_;
const MutableCFOptions& mutable_cf_options =
*cfd->GetLatestMutableCFOptions();
MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
SuperVersionContext* superversion_context = arg.superversion_context_;
s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
job_context, superversion_context,
log_buffer);
Status s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
job_context, superversion_context,
log_buffer);
if (!s.ok()) {
break;
status = s;
if (!s.IsShutdownInProgress()) {
// At this point, the DB is not shutting down, nor is cfd dropped.
// Something is wrong, thus we break out of the loop.
break;
}
}
}
return s;
return status;
}
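A minimal sketch of the status-aggregation policy in the loop above, using a toy Result type and std::function stand-ins rather than the real FlushMemTableToOutputFile call: a shutdown-in-progress result is remembered but does not stop the remaining flushes, while any other failure is recorded and ends the loop.

#include <functional>
#include <vector>

struct Result {
  bool ok = true;
  bool shutdown_in_progress = false;
};

Result FlushAll(const std::vector<std::function<Result()>>& flush_jobs) {
  Result final_result;  // stays OK unless some job fails
  for (const auto& job : flush_jobs) {
    Result r = job();
    if (!r.ok) {
      final_result = r;  // remember the most recent failure
      if (!r.shutdown_in_progress) {
        break;  // a real error: stop flushing further column families
      }
      // Shutdown in progress: continue so later jobs can still run.
    }
  }
  return final_result;
}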
/*
@ -271,7 +275,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
}
autovector<Directory*> distinct_output_dirs;
std::vector<FlushJob> jobs;
std::vector<MutableCFOptions> all_mutable_cf_options;
int num_cfs = static_cast<int>(cfds.size());
all_mutable_cf_options.reserve(num_cfs);
for (int i = 0; i < num_cfs; ++i) {
auto cfd = cfds[i];
Directory* data_dir = GetDataDir(cfd, 0U);
@ -290,8 +296,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
distinct_output_dirs.emplace_back(data_dir);
}
const MutableCFOptions& mutable_cf_options =
*cfd->GetLatestMutableCFOptions();
all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_);
jobs.emplace_back(
dbname_, cfds[i], immutable_db_options_, mutable_cf_options,
@ -304,21 +310,18 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
jobs.back().PickMemTable();
}
autovector<FileMetaData> file_meta;
std::vector<FileMetaData> file_meta(num_cfs);
Status s;
assert(num_cfs == static_cast<int>(jobs.size()));
for (int i = 0; i != num_cfs; ++i) {
file_meta.emplace_back();
#ifndef ROCKSDB_LITE
const MutableCFOptions& mutable_cf_options =
*cfds[i]->GetLatestMutableCFOptions();
for (int i = 0; i != num_cfs; ++i) {
const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
// may temporarily unlock and lock the mutex.
NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
job_context->job_id, jobs[i].GetTableProperties());
#endif /* !ROCKSDB_LITE */
}
#endif /* !ROCKSDB_LITE */
if (logfile_number_ > 0) {
// TODO (yanqin) investigate whether we should sync the closed logs for
@ -331,8 +334,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
autovector<std::pair<bool, Status>> exec_status;
for (int i = 0; i != num_cfs; ++i) {
// Initially all jobs are not executed, with status OK.
std::pair<bool, Status> elem(false, Status::OK());
exec_status.emplace_back(elem);
exec_status.emplace_back(false, Status::OK());
}
if (s.ok()) {
@ -341,10 +343,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
exec_status[i].second =
jobs[i].Run(&logs_with_prep_tracker_, &file_meta[i]);
exec_status[i].first = true;
if (!exec_status[i].second.ok()) {
s = exec_status[i].second;
break;
}
}
if (num_cfs > 1) {
TEST_SYNC_POINT(
@ -352,17 +350,27 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
TEST_SYNC_POINT(
"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
}
if (s.ok()) {
exec_status[0].second =
jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]);
exec_status[0].first = true;
if (!exec_status[0].second.ok()) {
s = exec_status[0].second;
exec_status[0].second =
jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]);
exec_status[0].first = true;
Status error_status;
for (const auto& e : exec_status) {
if (!e.second.ok()) {
s = e.second;
if (!e.second.IsShutdownInProgress()) {
// If a flush job did not return OK, the CF is not dropped, and the DB
// is not shutting down, then we have to return this result to the
// caller later.
error_status = e.second;
}
}
}
s = error_status.ok() ? s : error_status;
}
if (s.ok()) {
if (s.ok() || s.IsShutdownInProgress()) {
// Sync on all distinct output directories.
for (auto dir : distinct_output_dirs) {
if (dir != nullptr) {
@ -372,37 +380,78 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
}
}
}
if (s.ok()) {
autovector<const autovector<MemTable*>*> mems_list;
for (int i = 0; i != num_cfs; ++i) {
const auto& mems = jobs[i].GetMemTables();
mems_list.emplace_back(&mems);
}
autovector<ColumnFamilyData*> all_cfds;
autovector<MemTableList*> imm_lists;
autovector<const MutableCFOptions*> mutable_cf_options_list;
for (auto cfd : *versions_->GetColumnFamilySet()) {
all_cfds.emplace_back(cfd);
imm_lists.emplace_back(cfd->imm());
mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
}
s = MemTableList::TryInstallMemtableFlushResults(
imm_lists, all_cfds, mutable_cf_options_list, mems_list,
&atomic_flush_commit_in_progress_, &logs_with_prep_tracker_,
versions_.get(), &mutex_, file_meta, &job_context->memtables_to_free,
directories_.GetDbDir(), log_buffer);
}
}
if (s.ok()) {
auto wait_to_install_func = [&]() {
bool ready = true;
for (size_t i = 0; i != cfds.size(); ++i) {
const auto& mems = jobs[i].GetMemTables();
if (cfds[i]->IsDropped()) {
// If the column family is dropped, then do not wait.
continue;
} else if (!mems.empty() &&
cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
// If a flush job needs to install the flush result for mems and
// mems[0] is not the earliest memtable, another thread must be
// installing flush results for the same column family, so the
// current thread needs to wait.
ready = false;
break;
} else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
bg_flush_args[i].max_memtable_id_) {
// If a flush job does not need to install flush results, then it has
// to wait until all memtables up to max_memtable_id_ (inclusive) are
// installed.
ready = false;
break;
}
}
return ready;
};
bool resuming_from_bg_err = error_handler_.IsDBStopped();
while ((!error_handler_.IsDBStopped() ||
error_handler_.GetRecoveryError().ok()) &&
!wait_to_install_func()) {
atomic_flush_install_cv_.Wait();
}
s = resuming_from_bg_err ? error_handler_.GetRecoveryError()
: error_handler_.GetBGError();
}
if (s.ok()) {
autovector<ColumnFamilyData*> tmp_cfds;
autovector<const autovector<MemTable*>*> mems_list;
autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<FileMetaData*> tmp_file_meta;
for (int i = 0; i != num_cfs; ++i) {
const auto& mems = jobs[i].GetMemTables();
if (!cfds[i]->IsDropped() && !mems.empty()) {
tmp_cfds.emplace_back(cfds[i]);
mems_list.emplace_back(&mems);
mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
tmp_file_meta.emplace_back(&file_meta[i]);
}
}
s = InstallMemtableAtomicFlushResults(
nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
versions_.get(), &mutex_, tmp_file_meta,
&job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
}
if (s.ok() || s.IsShutdownInProgress()) {
assert(num_cfs ==
static_cast<int>(job_context->superversion_contexts.size()));
for (int i = 0; i != num_cfs; ++i) {
if (cfds[i]->IsDropped()) {
continue;
}
InstallSuperVersionAndScheduleWork(cfds[i],
&job_context->superversion_contexts[i],
*cfds[i]->GetLatestMutableCFOptions());
all_mutable_cf_options[i]);
VersionStorageInfo::LevelSummaryStorage tmp;
ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
cfds[i]->GetName().c_str(),
@ -415,8 +464,10 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
auto sfm = static_cast<SstFileManagerImpl*>(
immutable_db_options_.sst_file_manager.get());
for (int i = 0; i != num_cfs; ++i) {
NotifyOnFlushCompleted(cfds[i], &file_meta[i],
*cfds[i]->GetLatestMutableCFOptions(),
if (cfds[i]->IsDropped()) {
continue;
}
NotifyOnFlushCompleted(cfds[i], &file_meta[i], all_mutable_cf_options[i],
job_context->job_id, jobs[i].GetTableProperties());
if (sfm) {
std::string file_path = MakeTableFileName(
@ -434,7 +485,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
#endif // ROCKSDB_LITE
}
if (!s.ok()) {
// Need to undo atomic flush if something went wrong, i.e. s is not OK and
// it is not because of CF drop.
if (!s.ok() && !s.IsShutdownInProgress()) {
// Have to cancel the flush jobs that have NOT executed because we need to
// unref the versions.
for (int i = 0; i != num_cfs; ++i) {
@ -442,17 +495,15 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
jobs[i].Cancel();
}
}
if (!s.IsShutdownInProgress()) {
for (int i = 0; i != num_cfs; ++i) {
if (exec_status[i].first && exec_status[i].second.ok()) {
auto& mems = jobs[i].GetMemTables();
cfds[i]->imm()->RollbackMemtableFlush(mems,
file_meta[i].fd.GetNumber());
}
for (int i = 0; i != num_cfs; ++i) {
if (exec_status[i].first && exec_status[i].second.ok()) {
auto& mems = jobs[i].GetMemTables();
cfds[i]->imm()->RollbackMemtableFlush(mems,
file_meta[i].fd.GetNumber());
}
Status new_bg_error = s;
error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
}
Status new_bg_error = s;
error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
}
return s;
@ -1407,6 +1458,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
FlushRequest* req) {
assert(req != nullptr);
req->reserve(cfds.size());
for (const auto cfd : cfds) {
if (nullptr == cfd) {
// cfd may be null, see DBImpl::ScheduleFlushes
@ -1440,11 +1492,16 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
write_thread_.EnterUnbatched(&w, &mutex_);
}
if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
!cached_recoverable_state_empty_.load()) {
if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
s = SwitchMemtable(cfd, &context);
flush_memtable_id = cfd->imm()->GetLatestMemTableID();
flush_req.emplace_back(cfd, flush_memtable_id);
}
if (s.ok()) {
if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
!cached_recoverable_state_empty_.load()) {
flush_memtable_id = cfd->imm()->GetLatestMemTableID();
flush_req.emplace_back(cfd, flush_memtable_id);
}
}
if (s.ok() && !flush_req.empty()) {
@ -1518,6 +1575,9 @@ Status DBImpl::AtomicFlushMemTables(
}
}
for (auto cfd : cfds) {
if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
continue;
}
cfd->Ref();
s = SwitchMemtable(cfd, &context);
cfd->Unref();
@ -1539,6 +1599,7 @@ Status DBImpl::AtomicFlushMemTables(
write_thread_.ExitUnbatched(&w);
}
}
TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
if (s.ok() && flush_options.wait) {
autovector<const uint64_t*> flush_memtable_ids;
@ -2046,6 +2107,7 @@ void DBImpl::BackgroundCallFlush() {
bg_flush_scheduled_--;
// See if there's more work to be done
MaybeScheduleFlushOrCompaction();
atomic_flush_install_cv_.SignalAll();
bg_cv_.SignalAll();
// IMPORTANT: there should be no code after calling SignalAll. This call may
// signal the DB destructor that it's OK to proceed with destruction. In

View File

@ -23,8 +23,7 @@
#include "util/sync_point.h"
namespace rocksdb {
Options SanitizeOptions(const std::string& dbname,
const Options& src) {
Options SanitizeOptions(const std::string& dbname, const Options& src) {
auto db_options = SanitizeOptions(dbname, DBOptions(src));
ImmutableDBOptions immutable_db_options(db_options);
auto cf_options =
@ -56,10 +55,9 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
result.write_buffer_manager.reset(
new WriteBufferManager(result.db_write_buffer_size));
}
auto bg_job_limits = DBImpl::GetBGJobLimits(result.max_background_flushes,
result.max_background_compactions,
result.max_background_jobs,
true /* parallelize_compactions */);
auto bg_job_limits = DBImpl::GetBGJobLimits(
result.max_background_flushes, result.max_background_compactions,
result.max_background_jobs, true /* parallelize_compactions */);
result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
Env::Priority::LOW);
result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
@ -107,14 +105,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
}
if (result.use_direct_reads &&
result.compaction_readahead_size == 0) {
if (result.use_direct_reads && result.compaction_readahead_size == 0) {
TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
result.compaction_readahead_size = 1024 * 1024 * 2;
}
if (result.compaction_readahead_size > 0 ||
result.use_direct_reads) {
if (result.compaction_readahead_size > 0 || result.use_direct_reads) {
result.new_table_reader_for_compaction_inputs = true;
}
@ -218,7 +214,7 @@ static Status ValidateOptions(
return Status::OK();
}
} // namespace
} // namespace
Status DBImpl::NewDB() {
VersionEdit new_db;
new_db.SetLogNumber(0);
@ -258,9 +254,8 @@ Status DBImpl::NewDB() {
return s;
}
Status DBImpl::CreateAndNewDirectory(
Env* env, const std::string& dirname,
std::unique_ptr<Directory>* directory) {
Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname,
std::unique_ptr<Directory>* directory) {
// We call CreateDirIfMissing() as the directory may already exist (if we
// are reopening a DB), when this happens we don't want creating the
// directory to cause an error. However, we need to check if creating the
@ -341,8 +336,8 @@ Status DBImpl::Recover(
}
} else if (s.ok()) {
if (immutable_db_options_.error_if_exists) {
return Status::InvalidArgument(
dbname_, "exists (error_if_exists is true)");
return Status::InvalidArgument(dbname_,
"exists (error_if_exists is true)");
}
} else {
// Unexpected error reading file
@ -479,6 +474,28 @@ Status DBImpl::Recover(
}
}
if (read_only) {
// If we are opening as read-only, we need to update options_file_number_
// to reflect the most recent OPTIONS file. It does not matter for a regular
// read-write DB instance because options_file_number_ will later be
// updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
std::vector<std::string> file_names;
if (s.ok()) {
s = env_->GetChildren(GetName(), &file_names);
}
if (s.ok()) {
uint64_t number = 0;
uint64_t options_file_number = 0;
FileType type;
for (const auto& fname : file_names) {
if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
options_file_number = std::max(number, options_file_number);
}
}
versions_->options_file_number_ = options_file_number;
}
}
return s;
}
@ -527,10 +544,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
std::map<std::string, uint32_t> cf_name_id_map;
std::map<uint32_t, uint64_t> cf_lognumber_map;
for (auto cfd : *versions_->GetColumnFamilySet()) {
cf_name_id_map.insert(
std::make_pair(cfd->GetName(), cfd->GetID()));
cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
cf_lognumber_map.insert(
std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
}
immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map,
@ -880,8 +896,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
// VersionSet::next_file_number_ always to be strictly greater than any
// log number
versions_->MarkFileNumberUsed(max_log_number + 1);
status = versions_->LogAndApply(
cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
edit, &mutex_);
if (!status.ok()) {
// Recovery failed
break;
@ -994,12 +1010,17 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
if (use_custom_gc_ && snapshot_checker == nullptr) {
snapshot_checker = DisableGCSnapshotChecker::Instance();
}
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
auto range_del_iter =
mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
s = BuildTable(
dbname_, env_, *cfd->ioptions(), mutable_cf_options,
env_options_for_compaction_, cfd->table_cache(), iter.get(),
std::unique_ptr<InternalIterator>(
mem->NewRangeTombstoneIterator(ro, versions_->LastSequence())),
&meta, cfd->internal_comparator(),
std::move(range_del_iters), &meta, cfd->internal_comparator(),
cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
@ -1033,8 +1054,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
stats.bytes_written = meta.fd.GetFileSize();
stats.num_output_files = 1;
cfd->internal_stats()->AddCompactionStats(level, stats);
cfd->internal_stats()->AddCFStats(
InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
meta.fd.GetFileSize());
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
return s;
}
@ -1227,7 +1248,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
!cfd->mem()->IsMergeOperatorSupported()) {
s = Status::InvalidArgument(
"The memtable of column family %s does not support merge operator "
"its options.merge_operator is non-null", cfd->GetName().c_str());
"its options.merge_operator is non-null",
cfd->GetName().c_str());
}
if (!s.ok()) {
break;

View File

@ -9,7 +9,6 @@
#include "db/db_impl.h"
#include "db/db_iter.h"
#include "db/merge_context.h"
#include "db/range_del_aggregator.h"
#include "monitoring/perf_context_imp.h"
namespace rocksdb {

View File

@ -89,10 +89,11 @@ class DBImplReadOnly : public DBImpl {
virtual Status EnableFileDeletions(bool /*force*/) override {
return Status::NotSupported("Not supported operation in read only mode.");
}
virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* /*manifest_file_size*/,
bool /*flush_memtable*/ = true) override {
return Status::NotSupported("Not supported operation in read only mode.");
virtual Status GetLiveFiles(std::vector<std::string>& ret,
uint64_t* manifest_file_size,
bool /*flush_memtable*/) override {
return DBImpl::GetLiveFiles(ret, manifest_file_size,
false /* flush_memtable */);
}
using DBImpl::Flush;

View File

@ -265,18 +265,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
// We're optimistic, updating the stats before we successfully
// commit. That lets us release our leader status early.
auto stats = default_cf_internal_stats_;
stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count,
stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
concurrent_update);
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size,
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
concurrent_update);
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update);
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
concurrent_update);
RecordTick(stats_, WRITE_DONE_BY_SELF);
auto write_done_by_other = write_group.size - 1;
if (write_done_by_other > 0) {
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other,
concurrent_update);
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
write_done_by_other, concurrent_update);
RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
}
MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size);
@ -467,9 +468,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
}
auto stats = default_cf_internal_stats_;
stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count);
stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size);
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size);
@ -477,10 +478,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
if (w.ShouldWriteToWAL()) {
PERF_TIMER_GUARD(write_wal_time);
stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
if (wal_write_group.size > 1) {
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
wal_write_group.size - 1);
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
}
@ -591,15 +592,16 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
// We're optimistic, updating the stats before we successfully
// commit. That lets us release our leader status early.
auto stats = default_cf_internal_stats_;
stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size,
stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
concurrent_update);
RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update);
stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
concurrent_update);
RecordTick(stats_, WRITE_DONE_BY_SELF);
auto write_done_by_other = write_group.size - 1;
if (write_done_by_other > 0) {
stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other,
concurrent_update);
stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
write_done_by_other, concurrent_update);
RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
}
MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size);
@ -908,12 +910,12 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
if (status.ok()) {
auto stats = default_cf_internal_stats_;
if (need_log_sync) {
stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1);
stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
RecordTick(stats_, WAL_FILE_SYNCED);
}
stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size);
stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
RecordTick(stats_, WAL_FILE_BYTES, log_size);
stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal);
stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
}
return status;
@ -959,9 +961,10 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
if (status.ok()) {
const bool concurrent = true;
auto stats = default_cf_internal_stats_;
stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size, concurrent);
stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
concurrent);
RecordTick(stats_, WAL_FILE_BYTES, log_size);
stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal,
stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
concurrent);
RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
}
@ -1255,8 +1258,8 @@ Status DBImpl::DelayWrite(uint64_t num_bytes,
}
assert(!delayed || !write_options.no_slowdown);
if (delayed) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS,
time_delayed);
default_cf_internal_stats_->AddDBStats(
InternalStats::kIntStatsWriteStallMicros, time_delayed);
RecordTick(stats_, STALL_MICROS, time_delayed);
}

View File

@ -171,7 +171,7 @@ class DBIter final: public Iterator {
iter_ = iter;
iter_->SetPinnedItersMgr(&pinned_iters_mgr_);
}
virtual RangeDelAggregatorV2* GetRangeDelAggregator() {
virtual ReadRangeDelAggregator* GetRangeDelAggregator() {
return &range_del_agg_;
}
@ -341,7 +341,7 @@ class DBIter final: public Iterator {
const bool total_order_seek_;
// List of operands for merge operator.
MergeContext merge_context_;
RangeDelAggregatorV2 range_del_agg_;
ReadRangeDelAggregator range_del_agg_;
LocalStatistics local_stats_;
PinnedIteratorsManager pinned_iters_mgr_;
ReadCallback* read_callback_;
@ -1479,7 +1479,7 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
RangeDelAggregatorV2* ArenaWrappedDBIter::GetRangeDelAggregator() {
ReadRangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() {
return db_iter_->GetRangeDelAggregator();
}

View File

@ -12,7 +12,7 @@
#include <string>
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "options/cf_options.h"
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
@ -48,7 +48,7 @@ class ArenaWrappedDBIter : public Iterator {
// Get the arena to be used to allocate memory for DBIter to be wrapped,
// as well as child iterators in it.
virtual Arena* GetArena() { return &arena_; }
virtual RangeDelAggregatorV2* GetRangeDelAggregator();
virtual ReadRangeDelAggregator* GetRangeDelAggregator();
// Set the internal iterator wrapped inside the DB Iterator. Usually it is
// a merging iterator.

View File

@ -8,6 +8,7 @@
#include "db/db_test_util.h"
#include "db/memtable.h"
#include "db/range_del_aggregator.h"
#include "port/stack_trace.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/slice_transform.h"
@ -135,7 +136,8 @@ TEST_F(DBMemTableTest, DuplicateSeq) {
MergeContext merge_context;
Options options;
InternalKeyComparator ikey_cmp(options.comparator);
RangeDelAggregator range_del_agg(ikey_cmp, {} /* snapshots */);
ReadRangeDelAggregator range_del_agg(&ikey_cmp,
kMaxSequenceNumber /* upper_bound */);
// Create a MemTable
InternalKeyComparator cmp(BytewiseComparator());

View File

@ -1041,11 +1041,16 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
// L2:
// [key000000#1,1, key000000#1,1]
// [key000002#6,1, key000004#72057594037927935,15]
//
// At the same time, verify the compaction does not cause the key at the
// endpoint (key000002#6,1) to disappear.
ASSERT_EQ(value, Get(Key(2)));
auto begin_str = Key(3);
const rocksdb::Slice begin = begin_str;
dbfull()->TEST_CompactRange(1, &begin, nullptr);
ASSERT_EQ(1, NumTableFilesAtLevel(1));
ASSERT_EQ(2, NumTableFilesAtLevel(2));
ASSERT_EQ(value, Get(Key(2)));
}
{

View File

@ -814,8 +814,8 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
Arena arena;
auto options = CurrentOptions();
InternalKeyComparator icmp(options.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ScopedArenaIterator iter;
if (cf == 0) {
iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
@ -1227,8 +1227,8 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
Arena arena;
auto options = CurrentOptions();
InternalKeyComparator icmp(options.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
// This should be defined after range_del_agg so that the assigned
// iterator is destructed before range_del_agg is destructed.
ScopedArenaIterator iter;
@ -1437,8 +1437,8 @@ void DBTestBase::VerifyDBInternal(
std::vector<std::pair<std::string, std::string>> true_data) {
Arena arena;
InternalKeyComparator icmp(last_options_.comparator);
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
auto iter =
dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber);
iter->SeekToFirst();

View File

@ -24,14 +24,15 @@
#include "db/event_helpers.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/memtable_list.h"
#include "db/merge_context.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_set.h"
#include "monitoring/iostats_context_imp.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/thread_status_util.h"
#include "port/port.h"
#include "db/memtable.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/statistics.h"
@ -295,7 +296,8 @@ Status FlushJob::WriteLevel0Table() {
// memtable and its associated range deletion memtable, respectively, at
// corresponding indexes.
std::vector<InternalIterator*> memtables;
std::vector<InternalIterator*> range_del_iters;
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
@ -308,9 +310,9 @@ Status FlushJob::WriteLevel0Table() {
cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
memtables.push_back(m->NewIterator(ro, &arena));
auto* range_del_iter =
m->NewRangeTombstoneIterator(ro, versions_->LastSequence());
m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.push_back(range_del_iter);
range_del_iters.emplace_back(range_del_iter);
}
total_num_entries += m->num_entries();
total_num_deletes += m->num_deletes();
@ -329,10 +331,6 @@ Status FlushJob::WriteLevel0Table() {
ScopedArenaIterator iter(
NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
static_cast<int>(memtables.size()), &arena));
std::unique_ptr<InternalIterator> range_del_iter(NewMergingIterator(
&cfd_->internal_comparator(),
range_del_iters.empty() ? nullptr : &range_del_iters[0],
static_cast<int>(range_del_iters.size())));
ROCKS_LOG_INFO(db_options_.info_log,
"[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
cfd_->GetName().c_str(), job_context_->job_id,
@ -358,7 +356,7 @@ Status FlushJob::WriteLevel0Table() {
s = BuildTable(
dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_,
env_options_, cfd_->table_cache(), iter.get(),
std::move(range_del_iter), &meta_, cfd_->internal_comparator(),
std::move(range_del_iters), &meta_, cfd_->internal_comparator(),
cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
cfd_->GetName(), existing_snapshots_,
earliest_write_conflict_snapshot_, snapshot_checker_,

View File

@ -279,7 +279,6 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
*cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
mem->SetID(i);
mem->Ref();
mem->TEST_AtomicFlushSequenceNumber() = 123;
for (size_t j = 0; j != num_keys_per_memtable; ++j) {
std::string key(ToString(j + i * num_keys_per_memtable));
@ -309,7 +308,9 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
k++;
}
HistogramData hist;
autovector<FileMetaData> file_metas;
std::vector<FileMetaData> file_metas;
// Call reserve to avoid auto-resizing
file_metas.reserve(flush_jobs.size());
mutex_.Lock();
for (auto& job : flush_jobs) {
job.PickMemTable();
@ -320,23 +321,23 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
ASSERT_OK(job.Run(nullptr /**/, &meta));
file_metas.emplace_back(meta);
}
autovector<FileMetaData*> file_meta_ptrs;
for (auto& meta : file_metas) {
file_meta_ptrs.push_back(&meta);
}
autovector<const autovector<MemTable*>*> mems_list;
for (size_t i = 0; i != all_cfds.size(); ++i) {
const auto& mems = flush_jobs[i].GetMemTables();
mems_list.push_back(&mems);
}
autovector<MemTableList*> imm_lists;
autovector<const MutableCFOptions*> mutable_cf_options_list;
for (auto cfd : all_cfds) {
imm_lists.push_back(cfd->imm());
mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
}
bool atomic_flush_commit_in_progress = false;
Status s = MemTableList::TryInstallMemtableFlushResults(
imm_lists, all_cfds, mutable_cf_options_list, mems_list,
&atomic_flush_commit_in_progress, nullptr /* logs_prep_tracker */,
versions_.get(), &mutex_, file_metas, &job_context.memtables_to_free,
Status s = InstallMemtableAtomicFlushResults(
nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free,
nullptr /* db_directory */, nullptr /* log_buffer */);
ASSERT_OK(s);

View File

@ -15,7 +15,7 @@
#include "db/db_iter.h"
#include "db/dbformat.h"
#include "db/job_context.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
@ -73,8 +73,8 @@ class ForwardLevelIterator : public InternalIterator {
delete file_iter_;
}
RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
file_iter_ = cfd_->table_cache()->NewIterator(
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
*files_[file_index_],
@ -610,8 +610,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
// New
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
}
RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
if (!read_options_.ignore_range_deletions) {
@ -669,8 +669,8 @@ void ForwardIterator::RenewIterators() {
mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
kMaxSequenceNumber /* upper_bound */);
if (!read_options_.ignore_range_deletions) {
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
svnew->mem->NewRangeTombstoneIterator(

View File

@ -949,14 +949,17 @@ void InternalStats::DumpDBStats(std::string* value) {
seconds_up, interval_seconds_up);
value->append(buf);
// Cumulative
uint64_t user_bytes_written = GetDBStats(InternalStats::BYTES_WRITTEN);
uint64_t num_keys_written = GetDBStats(InternalStats::NUMBER_KEYS_WRITTEN);
uint64_t write_other = GetDBStats(InternalStats::WRITE_DONE_BY_OTHER);
uint64_t write_self = GetDBStats(InternalStats::WRITE_DONE_BY_SELF);
uint64_t wal_bytes = GetDBStats(InternalStats::WAL_FILE_BYTES);
uint64_t wal_synced = GetDBStats(InternalStats::WAL_FILE_SYNCED);
uint64_t write_with_wal = GetDBStats(InternalStats::WRITE_WITH_WAL);
uint64_t write_stall_micros = GetDBStats(InternalStats::WRITE_STALL_MICROS);
uint64_t user_bytes_written =
GetDBStats(InternalStats::kIntStatsBytesWritten);
uint64_t num_keys_written =
GetDBStats(InternalStats::kIntStatsNumKeysWritten);
uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
uint64_t write_stall_micros =
GetDBStats(InternalStats::kIntStatsWriteStallMicros);
const int kHumanMicrosLen = 32;
char human_micros[kHumanMicrosLen];

View File

@ -108,15 +108,15 @@ class InternalStats {
};
enum InternalDBStatsType {
WAL_FILE_BYTES,
WAL_FILE_SYNCED,
BYTES_WRITTEN,
NUMBER_KEYS_WRITTEN,
WRITE_DONE_BY_OTHER,
WRITE_DONE_BY_SELF,
WRITE_WITH_WAL,
WRITE_STALL_MICROS,
INTERNAL_DB_STATS_ENUM_MAX,
kIntStatsWalFileBytes,
kIntStatsWalFileSynced,
kIntStatsBytesWritten,
kIntStatsNumKeysWritten,
kIntStatsWriteDoneByOther,
kIntStatsWriteDoneBySelf,
kIntStatsWriteWithWal,
kIntStatsWriteStallMicros,
kIntStatsNumMax,
};
InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd)
@ -292,7 +292,7 @@ class InternalStats {
};
void Clear() {
for (int i = 0; i < INTERNAL_DB_STATS_ENUM_MAX; i++) {
for (int i = 0; i < kIntStatsNumMax; i++) {
db_stats_[i].store(0);
}
for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
@ -382,7 +382,7 @@ class InternalStats {
bool HandleBlockCacheStat(Cache** block_cache);
// Per-DB stats
std::atomic<uint64_t> db_stats_[INTERNAL_DB_STATS_ENUM_MAX];
std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
// Per-ColumnFamily stats
uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
@ -580,15 +580,15 @@ class InternalStats {
};
enum InternalDBStatsType {
WAL_FILE_BYTES,
WAL_FILE_SYNCED,
BYTES_WRITTEN,
NUMBER_KEYS_WRITTEN,
WRITE_DONE_BY_OTHER,
WRITE_DONE_BY_SELF,
WRITE_WITH_WAL,
WRITE_STALL_MICROS,
INTERNAL_DB_STATS_ENUM_MAX,
kIntStatsWalFileBytes,
kIntStatsWalFileSynced,
kIntStatsBytesWritten,
kIntStatsNumKeysWritten,
kIntStatsWriteDoneByOther,
kIntStatsWriteDoneBySelf,
kIntStatsWriteWithWal,
kIntStatsWriteStallMicros,
kIntStatsNumMax,
};
InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {}

View File

@ -905,6 +905,7 @@ class TestFileOperationListener : public EventListener {
if (info.status.ok()) {
++file_reads_success_;
}
ReportDuration(info);
}
void OnFileWriteFinish(const FileOperationInfo& info) override {
@ -912,6 +913,7 @@ class TestFileOperationListener : public EventListener {
if (info.status.ok()) {
++file_writes_success_;
}
ReportDuration(info);
}
bool ShouldBeNotifiedOnFileIO() override { return true; }
@ -920,6 +922,13 @@ class TestFileOperationListener : public EventListener {
std::atomic<size_t> file_reads_success_;
std::atomic<size_t> file_writes_;
std::atomic<size_t> file_writes_success_;
private:
void ReportDuration(const FileOperationInfo& info) const {
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
info.finish_timestamp - info.start_timestamp);
ASSERT_GT(duration.count(), 0);
}
};
TEST_F(EventListenerTest, OnFileOperationTest) {

View File

@ -13,17 +13,16 @@
#include <memory>
#include <string.h>
#include "port/jemalloc_helper.h"
namespace rocksdb {
#ifdef ROCKSDB_JEMALLOC
#ifdef __FreeBSD__
#include <malloc_np.h>
#else
#include "jemalloc/jemalloc.h"
#ifdef JEMALLOC_NO_RENAME
#define malloc_stats_print je_malloc_stats_print
#endif
#endif
typedef struct {
char* cur;
@ -41,10 +40,10 @@ static void GetJemallocStatus(void* mstat_arg, const char* status) {
snprintf(mstat->cur, buf_size, "%s", status);
mstat->cur += status_len;
}
#endif // ROCKSDB_JEMALLOC
#ifdef ROCKSDB_JEMALLOC
void DumpMallocStats(std::string* stats) {
if (!HasJemalloc()) {
return;
}
MallocStatus mstat;
const unsigned int kMallocStatusLen = 1000000;
std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
@ -56,5 +55,5 @@ void DumpMallocStats(std::string* stats) {
#else
void DumpMallocStats(std::string*) {}
#endif // ROCKSDB_JEMALLOC
}
} // namespace rocksdb
#endif // !ROCKSDB_LITE

View File

@ -428,7 +428,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
comparator_.comparator);
auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
fragmented_tombstone_list, read_seq, comparator_.comparator);
fragmented_tombstone_list, comparator_.comparator, read_seq);
return fragmented_iter;
}

View File

@ -386,14 +386,16 @@ class MemTable {
uint64_t GetID() const { return id_; }
SequenceNumber& TEST_AtomicFlushSequenceNumber() {
return atomic_flush_seqno_;
void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
uint64_t GetFileNumber() const { return file_number_; }
void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
void SetFlushInProgress(bool in_progress) {
flush_in_progress_ = in_progress;
}
void TEST_SetFlushCompleted(bool completed) { flush_completed_ = completed; }
void TEST_SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
private:
enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

View File

@ -159,7 +159,7 @@ bool MemTableListVersion::GetFromList(
Status MemTableListVersion::AddRangeTombstoneIterators(
const ReadOptions& read_opts, Arena* /*arena*/,
RangeDelAggregatorV2* range_del_agg) {
RangeDelAggregator* range_del_agg) {
assert(range_del_agg != nullptr);
for (auto& m : memlist_) {
// Using kMaxSequenceNumber is OK because these are immutable memtables.
@ -260,228 +260,6 @@ void MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete) {
}
}
// Try to record multiple successful flushes to the MANIFEST as an atomic unit.
// This function may just return Status::OK if there has already been
// a concurrent thread performing actual recording.
Status MemTableList::TryInstallMemtableFlushResults(
autovector<MemTableList*>& imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list,
bool* atomic_flush_commit_in_progress, LogsWithPrepTracker* prep_tracker,
VersionSet* vset, InstrumentedMutex* mu,
const autovector<FileMetaData>& file_metas,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
mu->AssertHeld();
for (size_t k = 0; k != mems_list.size(); ++k) {
for (size_t i = 0; i != mems_list[k]->size(); ++i) {
assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
(*mems_list[k])[i]->flush_completed_ = true;
(*mems_list[k])[i]->file_number_ = file_metas[k].fd.GetNumber();
}
}
assert(atomic_flush_commit_in_progress != nullptr);
Status s;
if (*atomic_flush_commit_in_progress) {
// If the function reaches here, there must be a concurrent thread that
// has already started recording to MANIFEST. Therefore we should just
// return Status::OK and let the other thread finish writing to MANIFEST on
// our behalf.
return s;
}
// If the function reaches here, the current thread will start writing to
// MANIFEST. It may record to MANIFEST the flush results of other flushes.
*atomic_flush_commit_in_progress = true;
auto comp = [&imm_lists](size_t lh, size_t rh) {
const auto& memlist1 = imm_lists[lh]->current_->memlist_;
const auto& memlist2 = imm_lists[rh]->current_->memlist_;
auto it1 = memlist1.rbegin();
auto it2 = memlist2.rbegin();
return (*it1)->atomic_flush_seqno_ > (*it2)->atomic_flush_seqno_;
};
// The top of the heap is the memtable with smallest atomic_flush_seqno_.
std::priority_queue<size_t, std::vector<size_t>, decltype(comp)> heap(comp);
// Sequence number of the oldest unfinished atomic flush.
SequenceNumber min_unfinished_seqno = kMaxSequenceNumber;
// Populate the heap with first element of each imm iff. it has been
// flushed to storage, i.e. flush_completed_ is true.
size_t num = imm_lists.size();
assert(num == cfds.size());
for (size_t i = 0; i != num; ++i) {
std::list<MemTable*>& memlist = imm_lists[i]->current_->memlist_;
if (memlist.empty()) {
continue;
}
auto it = memlist.rbegin();
if ((*it)->flush_completed_) {
heap.emplace(i);
} else if (min_unfinished_seqno > (*it)->atomic_flush_seqno_) {
min_unfinished_seqno = (*it)->atomic_flush_seqno_;
}
}
while (s.ok() && !heap.empty()) {
autovector<size_t> batch;
SequenceNumber seqno = kMaxSequenceNumber;
// Pop from the heap the memtables that belong to the same atomic flush,
// namely their atomic_flush_seqno_ are equal.
do {
size_t pos = heap.top();
const auto& memlist = imm_lists[pos]->current_->memlist_;
MemTable* mem = *(memlist.rbegin());
if (seqno == kMaxSequenceNumber) {
// First mem in this batch.
seqno = mem->atomic_flush_seqno_;
batch.emplace_back(pos);
heap.pop();
} else if (mem->atomic_flush_seqno_ == seqno) {
// mem has the same atomic_flush_seqno_, thus in the same atomic flush.
batch.emplace_back(pos);
heap.pop();
} else if (mem->atomic_flush_seqno_ > seqno) {
// mem belongs to another atomic flush with higher seqno, break the
// loop.
break;
}
} while (!heap.empty());
if (seqno >= min_unfinished_seqno) {
// If there is an older, unfinished atomic flush, then we should not
// proceed.
TEST_SYNC_POINT_CALLBACK(
"MemTableList::TryInstallMemtableFlushResults:"
"HasOlderUnfinishedAtomicFlush:0",
nullptr);
break;
}
// Found the earliest, complete atomic flush. No earlier atomic flush is
// pending. Therefore ready to record it to the MANIFEST.
uint32_t num_entries = 0;
autovector<ColumnFamilyData*> tmp_cfds;
autovector<const MutableCFOptions*> tmp_mutable_cf_options_list;
std::vector<autovector<MemTable*>> memtables_to_flush;
autovector<autovector<VersionEdit*>> edit_lists;
for (auto pos : batch) {
tmp_cfds.emplace_back(cfds[pos]);
tmp_mutable_cf_options_list.emplace_back(mutable_cf_options_list[pos]);
const auto& memlist = imm_lists[pos]->current_->memlist_;
uint64_t batch_file_number = 0;
autovector<MemTable*> tmp_mems;
autovector<VersionEdit*> edits;
for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
MemTable* m = *it;
if (!m->flush_completed_ ||
(it != memlist.rbegin() && m->file_number_ != batch_file_number)) {
break;
}
if (it == memlist.rbegin()) {
batch_file_number = m->file_number_;
edits.push_back(m->GetEdits());
++num_entries;
}
tmp_mems.push_back(m);
}
edit_lists.push_back(edits);
memtables_to_flush.push_back(tmp_mems);
}
TEST_SYNC_POINT_CALLBACK(
"MemTableList::TryInstallMemtableFlushResults:FoundBatchToCommit:0",
&num_entries);
// Mark the version edits as an atomic group
uint32_t remaining = num_entries;
for (auto& edit_list : edit_lists) {
assert(edit_list.size() == 1);
edit_list[0]->MarkAtomicGroup(--remaining);
}
assert(remaining == 0);
size_t batch_sz = batch.size();
assert(batch_sz > 0);
assert(batch_sz == memtables_to_flush.size());
assert(batch_sz == tmp_cfds.size());
assert(batch_sz == edit_lists.size());
if (vset->db_options()->allow_2pc) {
for (size_t i = 0; i != batch_sz; ++i) {
auto& edit_list = edit_lists[i];
assert(!edit_list.empty());
edit_list.back()->SetMinLogNumberToKeep(
PrecomputeMinLogNumberToKeep(vset, *tmp_cfds[i], edit_list,
memtables_to_flush[i], prep_tracker));
}
}
// this can release and reacquire the mutex.
s = vset->LogAndApply(tmp_cfds, tmp_mutable_cf_options_list, edit_lists, mu,
db_directory);
for (const auto pos : batch) {
imm_lists[pos]->InstallNewVersion();
}
if (s.ok()) {
for (size_t i = 0; i != batch_sz; ++i) {
if (tmp_cfds[i]->IsDropped()) {
continue;
}
size_t pos = batch[i];
for (auto m : memtables_to_flush[i]) {
assert(m->file_number_ > 0);
uint64_t mem_id = m->GetID();
ROCKS_LOG_BUFFER(log_buffer,
"[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " done",
tmp_cfds[i]->GetName().c_str(), m->file_number_,
mem_id);
imm_lists[pos]->current_->Remove(m, to_delete);
}
}
} else {
for (size_t i = 0; i != batch_sz; ++i) {
size_t pos = batch[i];
for (auto m : memtables_to_flush[i]) {
uint64_t mem_id = m->GetID();
ROCKS_LOG_BUFFER(log_buffer,
"[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " failed",
tmp_cfds[i]->GetName().c_str(), m->file_number_,
mem_id);
m->flush_completed_ = false;
m->flush_in_progress_ = false;
m->edit_.Clear();
m->file_number_ = 0;
imm_lists[pos]->num_flush_not_started_++;
}
imm_lists[pos]->imm_flush_needed.store(true, std::memory_order_release);
}
}
// Adjust the heap AFTER installing new MemTableListVersions because the
// compare function 'comp' needs to capture the most up-to-date state of
// imm_lists.
for (auto pos : batch) {
const auto& memlist = imm_lists[pos]->current_->memlist_;
if (!memlist.empty()) {
MemTable* mem = *(memlist.rbegin());
if (mem->flush_completed_) {
heap.emplace(pos);
} else if (min_unfinished_seqno > mem->atomic_flush_seqno_) {
min_unfinished_seqno = mem->atomic_flush_seqno_;
}
}
}
}
*atomic_flush_commit_in_progress = false;
return s;
}
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool MemTableList::IsFlushPending() const {
@ -749,4 +527,106 @@ uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
return min_log;
}
// Commit a successful atomic flush in the manifest file.
Status InstallMemtableAtomicFlushResults(
const autovector<MemTableList*>* imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
InstrumentedMutex* mu, const autovector<FileMetaData*>& file_metas,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
mu->AssertHeld();
size_t num = mems_list.size();
assert(cfds.size() == num);
if (imm_lists != nullptr) {
assert(imm_lists->size() == num);
}
for (size_t k = 0; k != num; ++k) {
#ifndef NDEBUG
const auto* imm =
(imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
if (!mems_list[k]->empty()) {
assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
}
#endif
assert(nullptr != file_metas[k]);
for (size_t i = 0; i != mems_list[k]->size(); ++i) {
assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
(*mems_list[k])[i]->SetFlushCompleted(true);
(*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
}
}
Status s;
autovector<autovector<VersionEdit*>> edit_lists;
uint32_t num_entries = 0;
for (const auto mems : mems_list) {
assert(mems != nullptr);
autovector<VersionEdit*> edits;
assert(!mems->empty());
edits.emplace_back((*mems)[0]->GetEdits());
++num_entries;
edit_lists.emplace_back(edits);
}
// Mark the version edits as an atomic group
for (auto& edits : edit_lists) {
assert(edits.size() == 1);
edits[0]->MarkAtomicGroup(--num_entries);
}
assert(0 == num_entries);
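// Illustrative example (not part of the original change): with three column
// families in the batch, num_entries starts at 3 and the loop above marks the
// three edits with remaining counts 2, 1 and 0, so whoever replays the
// MANIFEST can tell the atomic group ends with the edit marked 0.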
// this can release and reacquire the mutex.
s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
db_directory);
for (size_t k = 0; k != cfds.size(); ++k) {
auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
imm->InstallNewVersion();
}
if (s.ok() || s.IsShutdownInProgress()) {
for (size_t i = 0; i != cfds.size(); ++i) {
if (cfds[i]->IsDropped()) {
continue;
}
auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
for (auto m : *mems_list[i]) {
assert(m->GetFileNumber() > 0);
uint64_t mem_id = m->GetID();
ROCKS_LOG_BUFFER(log_buffer,
"[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " done",
cfds[i]->GetName().c_str(), m->GetFileNumber(),
mem_id);
imm->current_->Remove(m, to_delete);
}
}
} else {
for (size_t i = 0; i != cfds.size(); ++i) {
auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
for (auto m : *mems_list[i]) {
uint64_t mem_id = m->GetID();
ROCKS_LOG_BUFFER(log_buffer,
"[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " failed",
cfds[i]->GetName().c_str(), m->GetFileNumber(),
mem_id);
m->SetFlushCompleted(false);
m->SetFlushInProgress(false);
m->GetEdits()->Clear();
m->SetFileNumber(0);
imm->num_flush_not_started_++;
}
imm->imm_flush_needed.store(true, std::memory_order_release);
}
}
return s;
}
} // namespace rocksdb
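A condensed sketch of a call site for the new InstallMemtableAtomicFlushResults(), modeled on the test helper shown later in this diff; versions, mu, file_meta_ptrs, to_delete, db_directory and log_buffer are assumed to be prepared by the caller, and all names here are placeholders rather than part of the change:
// cfds, mutable_cf_options_list, mems_list and file_meta_ptrs are parallel
// autovectors with one entry per column family in the atomic flush.
InstrumentedMutexLock lock(&mu);  // the DB mutex must be held across the call
Status s = InstallMemtableAtomicFlushResults(
    nullptr /* imm_lists: use each cfd's own immutable memtable list */, cfds,
    mutable_cf_options_list, mems_list, &versions, &mu, file_meta_ptrs,
    &to_delete, db_directory, &log_buffer);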


@ -15,7 +15,7 @@
#include "db/dbformat.h"
#include "db/logs_with_prep_tracker.h"
#include "db/memtable.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "monitoring/instrumented_mutex.h"
#include "rocksdb/db.h"
#include "rocksdb/iterator.h"
@ -31,6 +31,7 @@ class ColumnFamilyData;
class InternalKeyComparator;
class InstrumentedMutex;
class MergeIteratorBuilder;
class MemTableList;
// keeps a list of immutable memtables in a vector. The list is immutable
// if the refcount is bigger than one. It is used as a state for Get() and
@ -91,7 +92,7 @@ class MemTableListVersion {
}
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
RangeDelAggregatorV2* range_del_agg);
RangeDelAggregator* range_del_agg);
void AddIterators(const ReadOptions& options,
std::vector<InternalIterator*>* iterator_list,
@ -114,6 +115,18 @@ class MemTableListVersion {
SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
private:
friend class MemTableList;
friend Status InstallMemtableAtomicFlushResults(
const autovector<MemTableList*>* imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list,
VersionSet* vset, InstrumentedMutex* mu,
const autovector<FileMetaData*>& file_meta,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer);
// REQUIRE: m is an immutable memtable
void Add(MemTable* m, autovector<MemTable*>* to_delete);
// REQUIRE: m is an immutable memtable
@ -132,8 +145,6 @@ class MemTableListVersion {
void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
friend class MemTableList;
// Immutable MemTables that have not yet been flushed.
std::list<MemTable*> memlist_;
@ -163,18 +174,6 @@ class MemTableListVersion {
// write thread.)
class MemTableList {
public:
// Commit a successful atomic flush in the manifest file
static Status TryInstallMemtableFlushResults(
autovector<MemTableList*>& imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list,
bool* atomic_flush_commit_in_progress, LogsWithPrepTracker* prep_tracker,
VersionSet* vset, InstrumentedMutex* mu,
const autovector<FileMetaData>& file_meta,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer);
// A list of memtables.
explicit MemTableList(int min_write_buffer_number_to_merge,
int max_write_buffer_number_to_maintain)
@ -296,6 +295,16 @@ class MemTableList {
}
private:
friend Status InstallMemtableAtomicFlushResults(
const autovector<MemTableList*>* imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list,
VersionSet* vset, InstrumentedMutex* mu,
const autovector<FileMetaData*>& file_meta,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer);
// DB mutex held
void InstallNewVersion();
@ -317,4 +326,18 @@ class MemTableList {
size_t current_memory_usage_;
};
// Installs memtable atomic flush results.
// In most cases, imm_lists is nullptr, and the function simply uses the
// immutable memtable lists associated with the cfds. There are unit tests
// that install flush results for external immutable memtable lists other than
// the cfds' own immutable memtable lists, e.g. MemTableListTest. In this
// case, the imm_lists parameter is not nullptr.
extern Status InstallMemtableAtomicFlushResults(
const autovector<MemTableList*>* imm_lists,
const autovector<ColumnFamilyData*>& cfds,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
autovector<MemTable*>* to_delete, Directory* db_directory,
LogBuffer* log_buffer);
} // namespace rocksdb


@ -8,7 +8,6 @@
#include <string>
#include <vector>
#include "db/merge_context.h"
#include "db/range_del_aggregator.h"
#include "db/version_set.h"
#include "db/write_controller.h"
#include "rocksdb/db.h"
@ -86,17 +85,46 @@ class MemTableListTest : public testing::Test {
Status Mock_InstallMemtableFlushResults(
MemTableList* list, const MutableCFOptions& mutable_cf_options,
const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
autovector<MemTableList*> lists;
lists.emplace_back(list);
autovector<const autovector<MemTable*>*> mems_list;
mems_list.emplace_back(&m);
return Mock_InstallMemtableFlushResults(
lists, {0} /* cf_ids */, {&mutable_cf_options}, mems_list, to_delete);
// Create a mock Logger
test::NullLogger logger;
LogBuffer log_buffer(DEBUG_LEVEL, &logger);
CreateDB();
// Create a mock VersionSet
DBOptions db_options;
ImmutableDBOptions immutable_db_options(db_options);
EnvOptions env_options;
std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
WriteController write_controller(10000000u);
VersionSet versions(dbname, &immutable_db_options, env_options,
table_cache.get(), &write_buffer_manager,
&write_controller);
std::vector<ColumnFamilyDescriptor> cf_descs;
cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
cf_descs.emplace_back("one", ColumnFamilyOptions());
cf_descs.emplace_back("two", ColumnFamilyOptions());
EXPECT_OK(versions.Recover(cf_descs, false));
// Create mock default ColumnFamilyData
auto column_family_set = versions.GetColumnFamilySet();
LogsWithPrepTracker dummy_prep_tracker;
auto cfd = column_family_set->GetDefault();
EXPECT_TRUE(nullptr != cfd);
uint64_t file_num = file_number.fetch_add(1);
// Create dummy mutex.
InstrumentedMutex mutex;
InstrumentedMutexLock l(&mutex);
return list->TryInstallMemtableFlushResults(
cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
file_num, to_delete, nullptr, &log_buffer);
}
// Calls MemTableList::InstallMemtableFlushResults() and sets up all
// structures needed to call this function.
Status Mock_InstallMemtableFlushResults(
Status Mock_InstallMemtableAtomicFlushResults(
autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
const autovector<const MutableCFOptions*>& mutable_cf_options_list,
const autovector<const autovector<MemTable*>*>& mems_list,
@ -128,44 +156,28 @@ class MemTableListTest : public testing::Test {
auto column_family_set = versions.GetColumnFamilySet();
LogsWithPrepTracker dummy_prep_tracker;
if (1 == cf_ids.size()) {
auto cfd = column_family_set->GetColumnFamily(cf_ids[0]);
EXPECT_TRUE(nullptr != cfd);
EXPECT_EQ(1, lists.size());
MemTableList* list = lists[0];
EXPECT_EQ(1, mutable_cf_options_list.size());
const MutableCFOptions& mutable_cf_options =
*(mutable_cf_options_list.at(0));
const autovector<MemTable*>* mems = mems_list.at(0);
EXPECT_TRUE(nullptr != mems);
uint64_t file_num = file_number.fetch_add(1);
// Create dummy mutex.
InstrumentedMutex mutex;
InstrumentedMutexLock l(&mutex);
return list->TryInstallMemtableFlushResults(
cfd, mutable_cf_options, *mems, &dummy_prep_tracker, &versions,
&mutex, file_num, to_delete, nullptr, &log_buffer);
}
autovector<ColumnFamilyData*> cfds;
for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
EXPECT_NE(nullptr, cfds[i]);
}
autovector<FileMetaData> file_metas;
std::vector<FileMetaData> file_metas;
file_metas.reserve(cf_ids.size());
for (size_t i = 0; i != cf_ids.size(); ++i) {
FileMetaData meta;
uint64_t file_num = file_number.fetch_add(1);
meta.fd = FileDescriptor(file_num, 0, 0);
file_metas.emplace_back(meta);
}
bool atomic_flush_commit_in_progress = false;
autovector<FileMetaData*> file_meta_ptrs;
for (auto& meta : file_metas) {
file_meta_ptrs.push_back(&meta);
}
InstrumentedMutex mutex;
InstrumentedMutexLock l(&mutex);
return MemTableList::TryInstallMemtableFlushResults(
lists, cfds, mutable_cf_options_list, mems_list,
&atomic_flush_commit_in_progress, &dummy_prep_tracker, &versions,
&mutex, file_metas, to_delete, nullptr, &log_buffer);
return InstallMemtableAtomicFlushResults(
&lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex,
file_meta_ptrs, to_delete, nullptr, &log_buffer);
}
};
@ -731,18 +743,28 @@ TEST_F(MemTableListTest, FlushPendingTest) {
to_delete.clear();
}
TEST_F(MemTableListTest, FlushMultipleCFsTest) {
TEST_F(MemTableListTest, EmptyAtomicFlusTest) {
autovector<MemTableList*> lists;
autovector<uint32_t> cf_ids;
autovector<const MutableCFOptions*> options_list;
autovector<const autovector<MemTable*>*> to_flush;
autovector<MemTable*> to_delete;
Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
to_flush, &to_delete);
ASSERT_OK(s);
ASSERT_TRUE(to_delete.empty());
}
TEST_F(MemTableListTest, AtomicFlusTest) {
const int num_cfs = 3;
const int num_tables_per_cf = 5;
const int num_tables_per_cf = 2;
SequenceNumber seq = 1;
Status s;
auto factory = std::make_shared<SkipListFactory>();
options.memtable_factory = factory;
ImmutableCFOptions ioptions(options);
InternalKeyComparator cmp(BytewiseComparator());
WriteBufferManager wb(options.db_write_buffer_size);
autovector<MemTable*> to_delete;
// Create MemTableLists
int min_write_buffer_number_to_merge = 3;
@ -783,135 +805,72 @@ TEST_F(MemTableListTest, FlushMultipleCFsTest) {
std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
// Nothing to flush
for (int i = 0; i != num_cfs; ++i) {
auto list = lists[i];
for (auto i = 0; i != num_cfs; ++i) {
auto* list = lists[i];
ASSERT_FALSE(list->IsFlushPending());
ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]);
ASSERT_EQ(0, static_cast<int>(flush_candidates[i].size()));
ASSERT_EQ(0, flush_candidates[i].size());
}
// Request flush even though there is nothing to flush
for (int i = 0; i != num_cfs; ++i) {
auto list = lists[i];
for (auto i = 0; i != num_cfs; ++i) {
auto* list = lists[i];
list->FlushRequested();
ASSERT_FALSE(list->IsFlushPending());
ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
}
// Add tables to column families
for (int i = 0; i != num_cfs; ++i) {
for (int j = 0; j != num_tables_per_cf; ++j) {
autovector<MemTable*> to_delete;
// Add tables to the immutable memtable lists associated with column families
for (auto i = 0; i != num_cfs; ++i) {
for (auto j = 0; j != num_tables_per_cf; ++j) {
lists[i]->Add(tables[i][j], &to_delete);
}
ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
ASSERT_TRUE(lists[i]->IsFlushPending());
ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
}
std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
// +----+
// list[0]: |0 1|
// list[1]: |0 1|
// | +--+
// list[2]: |0| 1
// +-+
// Pick memtables to flush
for (auto i = 0; i != num_cfs; ++i) {
flush_candidates[i].clear();
lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i],
&flush_candidates[i]);
ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
static_cast<uint64_t>(flush_candidates[i].size()));
}
autovector<MemTableList*> tmp_lists;
autovector<uint32_t> tmp_cf_ids;
autovector<const MutableCFOptions*> tmp_options_list;
autovector<const autovector<MemTable*>*> to_flush;
std::vector<uint64_t> prev_memtable_ids;
// For each column family, determine the memtables to flush
for (int k = 0; k != 4; ++k) {
std::vector<uint64_t> flush_memtable_ids;
if (0 == k) {
// +----+
// list[0]: |0 1| 2 3 4
// list[1]: |0 1| 2 3 4
// | +--+
// list[2]: |0| 1 2 3 4
// +-+
flush_memtable_ids = {1, 1, 0};
} else if (1 == k) {
// +----+ +---+
// list[0]: |0 1| |2 3| 4
// list[1]: |0 1| |2 3| 4
// | +--+ +---+
// list[2]: |0| 1 2 3 4
// +-+
flush_memtable_ids = {3, 3, 0};
} else if (2 == k) {
// +-----+ +---+
// list[0]: |0 1| |2 3| 4
// list[1]: |0 1| |2 3| 4
// | +---+ +---+
// | | +-------+
// list[2]: |0| |1 2 3| 4
// +-+ +-------+
flush_memtable_ids = {3, 3, 3};
} else {
// +-----+ +---+ +-+
// list[0]: |0 1| |2 3| |4|
// list[1]: |0 1| |2 3| |4|
// | +---+ +---+ | |
// | | +-------+ | |
// list[2]: |0| |1 2 3| |4|
// +-+ +-------+ +-+
flush_memtable_ids = {4, 4, 4};
}
assert(num_cfs == static_cast<int>(flush_memtable_ids.size()));
// Pick memtables to flush
for (int i = 0; i != num_cfs; ++i) {
flush_candidates[i].clear();
lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i],
&flush_candidates[i]);
for (auto mem : flush_candidates[i]) {
mem->TEST_AtomicFlushSequenceNumber() = SequenceNumber(k);
}
if (prev_memtable_ids.empty()) {
ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, flush_candidates[i].size());
} else {
ASSERT_EQ(flush_memtable_ids[i] - prev_memtable_ids[i],
flush_candidates[i].size());
}
ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
ASSERT_FALSE(lists[i]->HasFlushRequested());
if (flush_memtable_ids[i] == num_tables_per_cf - 1) {
ASSERT_FALSE(
lists[i]->imm_flush_needed.load(std::memory_order_acquire));
} else {
ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
}
}
prev_memtable_ids = flush_memtable_ids;
if (k < 3) {
for (const auto& mems : flush_candidates) {
uint64_t file_num = file_number.fetch_add(1);
for (auto m : mems) {
m->TEST_SetFlushCompleted(true);
m->TEST_SetFileNumber(file_num);
}
}
}
if (k == 0) {
// Rollback first pick of tables
for (int i = 0; i != num_cfs; ++i) {
auto list = lists[i];
const auto& mems = flush_candidates[i];
for (auto m : mems) {
m->TEST_SetFileNumber(0);
}
list->RollbackMemtableFlush(flush_candidates[i], 0);
ASSERT_TRUE(list->IsFlushPending());
ASSERT_TRUE(list->imm_flush_needed.load(std::memory_order_acquire));
}
prev_memtable_ids.clear();
}
if (k == 3) {
for (int i = 0; i != num_cfs; ++i) {
to_flush.emplace_back(&flush_candidates[i]);
}
for (auto i = 0; i != num_cfs; ++i) {
if (!flush_candidates[i].empty()) {
to_flush.push_back(&flush_candidates[i]);
tmp_lists.push_back(lists[i]);
tmp_cf_ids.push_back(i);
tmp_options_list.push_back(mutable_cf_options_list[i]);
}
}
s = Mock_InstallMemtableFlushResults(lists, cf_ids, mutable_cf_options_list,
to_flush, &to_delete);
Status s = Mock_InstallMemtableAtomicFlushResults(
tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
ASSERT_OK(s);
for (auto i = 0; i != num_cfs; ++i) {
for (auto j = 0; j != num_tables_per_cf; ++j) {
if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
ASSERT_LT(0, tables[i][j]->GetFileNumber());
}
}
ASSERT_EQ(
static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
lists[i]->NumNotFlushed());
}
to_delete.clear();
for (auto list : lists) {
list->current()->Unref(&to_delete);
@ -933,126 +892,6 @@ TEST_F(MemTableListTest, FlushMultipleCFsTest) {
ASSERT_EQ(m, m->Unref());
delete m;
}
to_delete.clear();
}
TEST_F(MemTableListTest, HasOlderAtomicFlush) {
const size_t num_cfs = 3;
const size_t num_memtables_per_cf = 2;
SequenceNumber seq = 1;
Status s;
auto factory = std::make_shared<SkipListFactory>();
options.memtable_factory = factory;
ImmutableCFOptions ioptions(options);
InternalKeyComparator cmp(BytewiseComparator());
WriteBufferManager wb(options.db_write_buffer_size);
autovector<MemTable*> to_delete;
// Create MemTableLists
int min_write_buffer_number_to_merge = 3;
int max_write_buffer_number_to_maintain = 7;
autovector<MemTableList*> lists;
for (size_t i = 0; i != num_cfs; ++i) {
lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
max_write_buffer_number_to_maintain));
}
autovector<uint32_t> cf_ids;
std::vector<std::vector<MemTable*>> tables;
autovector<const MutableCFOptions*> mutable_cf_options_list;
uint32_t cf_id = 0;
for (size_t k = 0; k != num_cfs; ++k) {
std::vector<MemTable*> elem;
mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
uint64_t memtable_id = 0;
for (int i = 0; i != num_memtables_per_cf; ++i) {
MemTable* mem =
new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
kMaxSequenceNumber, cf_id);
mem->SetID(memtable_id++);
mem->Ref();
std::string value;
mem->Add(++seq, kTypeValue, "key1", ToString(i));
mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
elem.push_back(mem);
}
tables.emplace_back(elem);
cf_ids.push_back(cf_id++);
}
// Add tables to column families' immutable memtable lists
for (size_t i = 0; i != num_cfs; ++i) {
for (size_t j = 0; j != num_memtables_per_cf; ++j) {
lists[i]->Add(tables[i][j], &to_delete);
}
lists[i]->FlushRequested();
ASSERT_EQ(num_memtables_per_cf, lists[i]->NumNotFlushed());
ASSERT_TRUE(lists[i]->IsFlushPending());
ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
}
std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
for (size_t i = 0; i != num_cfs; ++i) {
lists[i]->PickMemtablesToFlush(nullptr, &flush_candidates[i]);
for (auto m : flush_candidates[i]) {
m->TEST_AtomicFlushSequenceNumber() = 123;
}
lists[i]->RollbackMemtableFlush(flush_candidates[i], 0);
}
uint64_t memtable_id = num_memtables_per_cf - 1;
autovector<MemTable*> other_flush_candidates;
lists[0]->PickMemtablesToFlush(&memtable_id, &other_flush_candidates);
for (auto m : other_flush_candidates) {
m->TEST_AtomicFlushSequenceNumber() = 124;
m->TEST_SetFlushCompleted(true);
m->TEST_SetFileNumber(1);
}
autovector<const autovector<MemTable*>*> to_flush;
to_flush.emplace_back(&other_flush_candidates);
bool has_older_unfinished_atomic_flush = false;
bool found_batch_to_commit = false;
SyncPoint::GetInstance()->SetCallBack(
"MemTableList::TryInstallMemtableFlushResults:"
"HasOlderUnfinishedAtomicFlush:0",
[&](void* /*arg*/) { has_older_unfinished_atomic_flush = true; });
SyncPoint::GetInstance()->SetCallBack(
"MemTableList::TryInstallMemtableFlushResults:FoundBatchToCommit:0",
[&](void* /*arg*/) { found_batch_to_commit = true; });
SyncPoint::GetInstance()->EnableProcessing();
s = Mock_InstallMemtableFlushResults(lists, cf_ids, mutable_cf_options_list,
to_flush, &to_delete);
ASSERT_OK(s);
ASSERT_TRUE(has_older_unfinished_atomic_flush);
ASSERT_FALSE(found_batch_to_commit);
SyncPoint::GetInstance()->ClearAllCallBacks();
ASSERT_TRUE(to_delete.empty());
for (auto list : lists) {
list->current()->Unref(&to_delete);
delete list;
}
lists.clear();
ASSERT_EQ(num_cfs * num_memtables_per_cf, to_delete.size());
for (auto m : to_delete) {
m->Ref();
ASSERT_EQ(m, m->Unref());
delete m;
}
to_delete.clear();
for (auto& opts : mutable_cf_options_list) {
delete opts;
opts = nullptr;
}
mutable_cf_options_list.clear();
}
} // namespace rocksdb


@ -79,7 +79,8 @@ class MergeContext {
return GetOperandsDirectionForward();
}
// Return all the operands in the order as they were merged (passed to FullMerge or FullMergeV2)
// Return all the operands in the order as they were merged (passed to
// FullMerge or FullMergeV2)
const std::vector<Slice>& GetOperandsDirectionForward() {
if (!operand_list_) {
return empty_operand_list;
@ -89,7 +90,8 @@ class MergeContext {
return *operand_list_;
}
// Return all the operands in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2)
// Return all the operands in the reversed order relative to how they were
// merged (passed to FullMerge or FullMergeV2)
const std::vector<Slice>& GetOperandsDirectionBackward() {
if (!operand_list_) {
return empty_operand_list;


@ -110,8 +110,11 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
// keys_ stores the list of keys encountered while merging.
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
//
// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
// and just pass the StripeRep corresponding to the stripe being merged.
Status MergeHelper::MergeUntil(InternalIterator* iter,
RangeDelAggregator* range_del_agg,
CompactionRangeDelAggregator* range_del_agg,
const SequenceNumber stop_before,
const bool at_bottom) {
// Get a copy of the internal key, before it's invalidated by iter->Next()


@ -78,7 +78,7 @@ class MergeHelper {
//
// REQUIRED: The first key in the input is not corrupted.
Status MergeUntil(InternalIterator* iter,
RangeDelAggregator* range_del_agg = nullptr,
CompactionRangeDelAggregator* range_del_agg = nullptr,
const SequenceNumber stop_before = 0,
const bool at_bottom = false);

File diff suppressed because it is too large.


@ -1,10 +1,12 @@
// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <algorithm>
#include <iterator>
#include <list>
#include <map>
#include <set>
@ -14,220 +16,416 @@
#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/heap.h"
#include "util/kv_map.h"
namespace rocksdb {
// RangeDelMaps maintain position across calls to ShouldDelete. The caller may
// wish to specify a mode to optimize positioning the iterator during the next
// call to ShouldDelete. The non-kFullScan modes are only available when
// deletion collapsing is enabled.
//
// For example, if we invoke Next() on an iterator, kForwardTraversal should be
// specified to advance one-by-one through deletions until one is found with its
// interval containing the key. This will typically be faster than doing a full
// binary search (kBinarySearch).
enum class RangeDelPositioningMode {
kFullScan, // used iff collapse_deletions_ == false
kForwardTraversal,
kBackwardTraversal,
kBinarySearch,
};
class TruncatedRangeDelIterator {
public:
TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
const InternalKeyComparator* icmp, const InternalKey* smallest,
const InternalKey* largest);
// TruncatedRangeTombstones are a slight generalization of regular
// RangeTombstones that can represent truncations caused by SST boundaries.
// Instead of using user keys to represent the start and end keys, they instead
// use internal keys, whose sequence number indicates the sequence number of
// the smallest/largest SST key (in the case where a tombstone is untruncated,
// the sequence numbers will be kMaxSequenceNumber for both start and end
// keys). Like RangeTombstones, TruncatedRangeTombstones are also
// end-key-exclusive.
struct TruncatedRangeTombstone {
TruncatedRangeTombstone(const ParsedInternalKey& sk,
const ParsedInternalKey& ek, SequenceNumber s)
: start_key_(sk), end_key_(ek), seq_(s) {}
bool Valid() const;
RangeTombstone Tombstone() const {
// The RangeTombstone returned here can cover less than the
// TruncatedRangeTombstone when its end key has a seqnum that is not
// kMaxSequenceNumber. Since this method is only used by RangeDelIterators
// (which in turn are only used during flush/compaction), we avoid this
// problem by using truncation boundaries spanning multiple SSTs, which
// are selected in a way that guarantees a clean break at the end key.
assert(end_key_.sequence == kMaxSequenceNumber);
return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_);
void Next();
void Prev();
void InternalNext();
// Seeks to the tombstone with the highest visible sequence number that covers
// target (a user key). If no such tombstone exists, the position will be at
// the earliest tombstone that ends after target.
void Seek(const Slice& target);
// Seeks to the tombstone with the highest visible sequence number that covers
// target (a user key). If no such tombstone exists, the position will be at
// the latest tombstone that starts before target.
void SeekForPrev(const Slice& target);
void SeekToFirst();
void SeekToLast();
ParsedInternalKey start_key() const {
return (smallest_ == nullptr ||
icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
? iter_->parsed_start_key()
: *smallest_;
}
ParsedInternalKey start_key_;
ParsedInternalKey end_key_;
SequenceNumber seq_;
};
// A RangeDelIterator iterates over range deletion tombstones.
class RangeDelIterator {
public:
virtual ~RangeDelIterator() = default;
virtual bool Valid() const = 0;
virtual void Next() = 0;
// NOTE: the Slice passed to this method must be a user key.
virtual void Seek(const Slice& target) = 0;
virtual void Seek(const ParsedInternalKey& target) = 0;
virtual RangeTombstone Tombstone() const = 0;
};
// A RangeDelMap keeps track of range deletion tombstones within a snapshot
// stripe.
//
// RangeDelMaps are used internally by RangeDelAggregator. They are not intended
// to be used directly.
class RangeDelMap {
public:
virtual ~RangeDelMap() = default;
virtual bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) = 0;
virtual bool IsRangeOverlapped(const ParsedInternalKey& start,
const ParsedInternalKey& end) = 0;
virtual void InvalidatePosition() = 0;
virtual size_t Size() const = 0;
bool IsEmpty() const { return Size() == 0; }
virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0;
virtual std::unique_ptr<RangeDelIterator> NewIterator() = 0;
};
// A RangeDelAggregator aggregates range deletion tombstones as they are
// encountered in memtables/SST files. It provides methods that check whether a
// key is covered by range tombstones or write the relevant tombstones to a new
// SST file.
class RangeDelAggregator {
public:
// @param snapshots These are used to organize the tombstones into snapshot
// stripes, each of which is the seqnum range between consecutive snapshots,
// including the higher snapshot and excluding the lower one. Currently,
// this is used by ShouldDelete() to prevent deletion of keys that are
// covered by range tombstones in other snapshot stripes. This constructor
// is used for writes (flush/compaction). All DB snapshots are provided
// such that no keys are removed that are uncovered according to any DB
// snapshot.
// Note this overload does not lazily initialize Rep.
RangeDelAggregator(const InternalKeyComparator& icmp,
const std::vector<SequenceNumber>& snapshots,
bool collapse_deletions = true);
// @param upper_bound Similar to snapshots above, except with a single
// snapshot, which allows us to store the snapshot on the stack and defer
// initialization of heap-allocating members (in Rep) until the first range
// deletion is encountered. This constructor is used in case of reads (get/
// iterator), for which only the user snapshot (upper_bound) is provided
// such that the seqnum space is divided into two stripes. Only the older
// stripe will be used by ShouldDelete().
RangeDelAggregator(const InternalKeyComparator& icmp,
SequenceNumber upper_bound,
bool collapse_deletions = false);
// Returns whether the key should be deleted, which is the case when it is
// covered by a range tombstone residing in the same snapshot stripe.
// @param mode If collapse_deletions_ is true, this dictates how we will find
// the deletion whose interval contains this key. Otherwise, its
// value must be kFullScan indicating linear scan from beginning.
bool ShouldDelete(
const ParsedInternalKey& parsed,
RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) {
if (rep_ == nullptr) {
return false;
}
return ShouldDeleteImpl(parsed, mode);
ParsedInternalKey end_key() const {
return (largest_ == nullptr ||
icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
? iter_->parsed_end_key()
: *largest_;
}
bool ShouldDelete(
const Slice& internal_key,
RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) {
if (rep_ == nullptr) {
return false;
}
return ShouldDeleteImpl(internal_key, mode);
}
bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode);
bool ShouldDeleteImpl(const Slice& internal_key,
RangeDelPositioningMode mode);
// Checks whether range deletions cover any keys between `start` and `end`,
// inclusive.
//
// @param start User key representing beginning of range to check for overlap.
// @param end User key representing end of range to check for overlap. This
// argument is inclusive, so the existence of a range deletion covering
// `end` causes this to return true.
bool IsRangeOverlapped(const Slice& start, const Slice& end);
SequenceNumber seq() const { return iter_->seq(); }
// Adds tombstones to the tombstone aggregation structure maintained by this
// object. Tombstones are truncated to smallest and largest. If smallest (or
// largest) is null, it is not used for truncation. When adding range
// tombstones present in an sstable, smallest and largest should be set to
// the smallest and largest keys from the sstable file metadata. Note that
// tombstone end keys are exclusive while largest is inclusive.
// @return non-OK status if any of the tombstone keys are corrupted.
Status AddTombstones(std::unique_ptr<InternalIterator> input,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr);
std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
// Resets iterators maintained across calls to ShouldDelete(). This may be
// called when the tombstones change, or the owner may call it explicitly, e.g.,
// if it's an iterator that just seeked to an arbitrary position. The effect
// of invalidation is that the following call to ShouldDelete() will binary
// search for its tombstone.
void InvalidateRangeDelMapPositions();
SequenceNumber upper_bound() const { return iter_->upper_bound(); }
bool IsEmpty();
bool AddFile(uint64_t file_number);
// Create a new iterator over the range deletion tombstones in all of the
// snapshot stripes in this aggregator. Tombstones are presented in start key
// order. Tombstones with the same start key are presented in arbitrary order.
//
// The iterator is invalidated after any call to AddTombstones. It is the
// caller's responsibility to avoid using invalid iterators.
std::unique_ptr<RangeDelIterator> NewIterator();
SequenceNumber lower_bound() const { return iter_->lower_bound(); }
private:
// Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e.,
// their seqnums are greater than the next smaller snapshot's seqnum, and the
// corresponding index into the list of snapshots. Each entry is lazily
// initialized.
typedef std::map<SequenceNumber,
std::pair<std::unique_ptr<RangeDelMap>, size_t>>
StripeMap;
std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
const InternalKeyComparator* icmp_;
const ParsedInternalKey* smallest_ = nullptr;
const ParsedInternalKey* largest_ = nullptr;
std::list<ParsedInternalKey> pinned_bounds_;
struct Rep {
std::vector<SequenceNumber> snapshots_;
StripeMap stripe_map_;
PinnedIteratorsManager pinned_iters_mgr_;
std::list<std::string> pinned_slices_;
std::set<uint64_t> added_files_;
const InternalKey* smallest_ikey_;
const InternalKey* largest_ikey_;
};
struct SeqMaxComparator {
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return a->seq() > b->seq();
}
};
struct StartKeyMinComparator {
explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return icmp->Compare(a->start_key(), b->start_key()) > 0;
}
const InternalKeyComparator* icmp;
};
class ForwardRangeDelIterator {
public:
explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
bool ShouldDelete(const ParsedInternalKey& parsed);
void Invalidate();
void AddNewIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
iter->Seek(parsed.user_key);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
size_t UnusedIdx() const { return unused_idx_; }
void IncUnusedIdx() { unused_idx_++; }
private:
using ActiveSeqSet =
std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
struct EndKeyMinComparator {
explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const ActiveSeqSet::const_iterator& a,
const ActiveSeqSet::const_iterator& b) const {
return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
}
const InternalKeyComparator* icmp;
};
// Initializes rep_ lazily. This aggregator object is constructed for every
// read, so expensive members should only be created when necessary, i.e.,
// once the first range deletion is encountered.
void InitRep(const std::vector<SequenceNumber>& snapshots);
std::unique_ptr<RangeDelMap> NewRangeDelMap();
RangeDelMap* GetRangeDelMapIfExists(SequenceNumber seq);
RangeDelMap& GetRangeDelMap(SequenceNumber seq);
void PushIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
if (!iter->Valid()) {
// The iterator has been fully consumed, so we don't need to add it to
// either of the heaps.
return;
}
int cmp = icmp_->Compare(parsed, iter->start_key());
if (cmp < 0) {
PushInactiveIter(iter);
} else {
PushActiveIter(iter);
}
}
SequenceNumber upper_bound_;
std::unique_ptr<Rep> rep_;
const InternalKeyComparator& icmp_;
// collapse range deletions so they're binary searchable
const bool collapse_deletions_;
void PushActiveIter(TruncatedRangeDelIterator* iter) {
auto seq_pos = active_seqnums_.insert(iter);
active_iters_.push(seq_pos);
}
TruncatedRangeDelIterator* PopActiveIter() {
auto active_top = active_iters_.top();
auto iter = *active_top;
active_iters_.pop();
active_seqnums_.erase(active_top);
return iter;
}
void PushInactiveIter(TruncatedRangeDelIterator* iter) {
inactive_iters_.push(iter);
}
TruncatedRangeDelIterator* PopInactiveIter() {
auto* iter = inactive_iters_.top();
inactive_iters_.pop();
return iter;
}
const InternalKeyComparator* icmp_;
size_t unused_idx_;
ActiveSeqSet active_seqnums_;
BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
};
class ReverseRangeDelIterator {
public:
explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
bool ShouldDelete(const ParsedInternalKey& parsed);
void Invalidate();
void AddNewIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
iter->SeekForPrev(parsed.user_key);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
size_t UnusedIdx() const { return unused_idx_; }
void IncUnusedIdx() { unused_idx_++; }
private:
using ActiveSeqSet =
std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
struct EndKeyMaxComparator {
explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return icmp->Compare(a->end_key(), b->end_key()) < 0;
}
const InternalKeyComparator* icmp;
};
struct StartKeyMaxComparator {
explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const ActiveSeqSet::const_iterator& a,
const ActiveSeqSet::const_iterator& b) const {
return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
}
const InternalKeyComparator* icmp;
};
void PushIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
if (!iter->Valid()) {
// The iterator has been fully consumed, so we don't need to add it to
// either of the heaps.
} else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
PushInactiveIter(iter);
} else {
PushActiveIter(iter);
}
}
void PushActiveIter(TruncatedRangeDelIterator* iter) {
auto seq_pos = active_seqnums_.insert(iter);
active_iters_.push(seq_pos);
}
TruncatedRangeDelIterator* PopActiveIter() {
auto active_top = active_iters_.top();
auto iter = *active_top;
active_iters_.pop();
active_seqnums_.erase(active_top);
return iter;
}
void PushInactiveIter(TruncatedRangeDelIterator* iter) {
inactive_iters_.push(iter);
}
TruncatedRangeDelIterator* PopInactiveIter() {
auto* iter = inactive_iters_.top();
inactive_iters_.pop();
return iter;
}
const InternalKeyComparator* icmp_;
size_t unused_idx_;
ActiveSeqSet active_seqnums_;
BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
};
enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
class RangeDelAggregator {
public:
explicit RangeDelAggregator(const InternalKeyComparator* icmp)
: icmp_(icmp) {}
virtual ~RangeDelAggregator() {}
virtual void AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr) = 0;
bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) {
ParsedInternalKey parsed;
if (!ParseInternalKey(key, &parsed)) {
return false;
}
return ShouldDelete(parsed, mode);
}
virtual bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) = 0;
virtual void InvalidateRangeDelMapPositions() = 0;
virtual bool IsEmpty() const = 0;
bool AddFile(uint64_t file_number) {
return files_seen_.insert(file_number).second;
}
protected:
class StripeRep {
public:
StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
SequenceNumber lower_bound)
: icmp_(icmp),
forward_iter_(icmp),
reverse_iter_(icmp),
upper_bound_(upper_bound),
lower_bound_(lower_bound) {}
void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
iters_.push_back(std::move(input_iter));
}
bool IsEmpty() const { return iters_.empty(); }
bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode);
void Invalidate() {
InvalidateForwardIter();
InvalidateReverseIter();
}
bool IsRangeOverlapped(const Slice& start, const Slice& end);
private:
bool InStripe(SequenceNumber seq) const {
return lower_bound_ <= seq && seq <= upper_bound_;
}
void InvalidateForwardIter() { forward_iter_.Invalidate(); }
void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
const InternalKeyComparator* icmp_;
std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
ForwardRangeDelIterator forward_iter_;
ReverseRangeDelIterator reverse_iter_;
SequenceNumber upper_bound_;
SequenceNumber lower_bound_;
};
const InternalKeyComparator* icmp_;
private:
std::set<uint64_t> files_seen_;
};
class ReadRangeDelAggregator : public RangeDelAggregator {
public:
ReadRangeDelAggregator(const InternalKeyComparator* icmp,
SequenceNumber upper_bound)
: RangeDelAggregator(icmp),
rep_(icmp, upper_bound, 0 /* lower_bound */) {}
~ReadRangeDelAggregator() override {}
using RangeDelAggregator::ShouldDelete;
void AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr) override;
bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) override;
bool IsRangeOverlapped(const Slice& start, const Slice& end);
void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
bool IsEmpty() const override { return rep_.IsEmpty(); }
private:
StripeRep rep_;
};
class CompactionRangeDelAggregator : public RangeDelAggregator {
public:
CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
const std::vector<SequenceNumber>& snapshots)
: RangeDelAggregator(icmp), snapshots_(&snapshots) {}
~CompactionRangeDelAggregator() override {}
void AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr) override;
using RangeDelAggregator::ShouldDelete;
bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) override;
bool IsRangeOverlapped(const Slice& start, const Slice& end);
void InvalidateRangeDelMapPositions() override {
for (auto& rep : reps_) {
rep.second.Invalidate();
}
}
bool IsEmpty() const override {
for (const auto& rep : reps_) {
if (!rep.second.IsEmpty()) {
return false;
}
}
return true;
}
// Creates an iterator over all the range tombstones in the aggregator, for
// use in compaction. Nullptr arguments indicate that the iterator range is
// unbounded.
// NOTE: the boundaries are used for optimization purposes to reduce the
// number of tombstones that are passed to the fragmenter; they do not
// guarantee that the resulting iterator only contains range tombstones that
// cover keys in the provided range. If required, these bounds must be
// enforced during iteration.
std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
bool upper_bound_inclusive = false);
private:
std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
std::map<SequenceNumber, StripeRep> reps_;
const std::vector<SequenceNumber>* snapshots_;
};
} // namespace rocksdb
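A minimal usage sketch of the read-path aggregator introduced above, following the pattern used by the benchmark further down in this diff; icmp, snapshot_seq, fragmented_iter and parsed_key are assumed to be set up elsewhere and are placeholder names:
// One stripe bounded above by the read snapshot.
rocksdb::ReadRangeDelAggregator range_del_agg(&icmp, snapshot_seq /* upper_bound */);
// fragmented_iter is a std::unique_ptr<FragmentedRangeTombstoneIterator>
// obtained from a memtable or table reader.
range_del_agg.AddTombstones(std::move(fragmented_iter));
// During a forward scan, check whether the key is covered by a newer
// range tombstone within the stripe.
bool covered = range_del_agg.ShouldDelete(
    parsed_key, rocksdb::RangeDelPositioningMode::kForwardTraversal);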


@ -20,7 +20,6 @@ int main() {
#include <vector>
#include "db/range_del_aggregator.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_tombstone_fragmenter.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
@ -48,8 +47,6 @@ DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
DEFINE_double(tombstone_width_stddev, 0.0,
"standard deviation of range tombstone width");
DEFINE_bool(use_collapsed, true, "use the collapsed range tombstone map");
DEFINE_int32(seed, 0, "random number generator seed");
DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
@ -57,8 +54,6 @@ DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
DEFINE_int32(add_tombstones_per_run, 1,
"number of AddTombstones calls per run");
DEFINE_bool(use_v2_aggregator, false, "benchmark RangeDelAggregatorV2");
namespace {
struct Stats {
@ -187,14 +182,10 @@ int main(int argc, char** argv) {
std::vector<rocksdb::PersistentRangeTombstone>(
FLAGS_num_range_tombstones);
}
auto mode = FLAGS_use_collapsed
? rocksdb::RangeDelPositioningMode::kForwardTraversal
: rocksdb::RangeDelPositioningMode::kFullScan;
auto mode = rocksdb::RangeDelPositioningMode::kForwardTraversal;
for (int i = 0; i < FLAGS_num_runs; i++) {
rocksdb::RangeDelAggregator range_del_agg(icmp, {} /* snapshots */,
FLAGS_use_collapsed);
rocksdb::RangeDelAggregatorV2 range_del_agg_v2(
rocksdb::ReadRangeDelAggregator range_del_agg(
&icmp, rocksdb::kMaxSequenceNumber /* upper_bound */);
std::vector<std::unique_ptr<rocksdb::FragmentedRangeTombstoneList> >
@ -206,7 +197,7 @@ int main(int argc, char** argv) {
// real workloads.
for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
uint64_t end = start + std::max(1.0, normal_dist(random_gen));
uint64_t end = start + static_cast<uint64_t>(std::max(1.0, normal_dist(random_gen)));
persistent_range_tombstones[j] = rocksdb::PersistentRangeTombstone(
rocksdb::Key(start), rocksdb::Key(end), j);
}
@ -220,20 +211,13 @@ int main(int argc, char** argv) {
std::unique_ptr<rocksdb::FragmentedRangeTombstoneIterator>
fragmented_range_del_iter(
new rocksdb::FragmentedRangeTombstoneIterator(
fragmented_range_tombstone_lists.back().get(),
rocksdb::kMaxSequenceNumber, icmp));
fragmented_range_tombstone_lists.back().get(), icmp,
rocksdb::kMaxSequenceNumber));
if (FLAGS_use_v2_aggregator) {
rocksdb::StopWatchNano stop_watch_add_tombstones(
rocksdb::Env::Default(), true /* auto_start */);
range_del_agg_v2.AddTombstones(std::move(fragmented_range_del_iter));
stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
} else {
rocksdb::StopWatchNano stop_watch_add_tombstones(
rocksdb::Env::Default(), true /* auto_start */);
range_del_agg.AddTombstones(std::move(range_del_iter));
stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
}
rocksdb::StopWatchNano stop_watch_add_tombstones(rocksdb::Env::Default(),
true /* auto_start */);
range_del_agg.AddTombstones(std::move(fragmented_range_del_iter));
stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
}
rocksdb::ParsedInternalKey parsed_key;
@ -247,18 +231,10 @@ int main(int argc, char** argv) {
std::string key_string = rocksdb::Key(first_key + j);
parsed_key.user_key = key_string;
uint64_t call_time;
if (FLAGS_use_v2_aggregator) {
rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(),
true /* auto_start */);
range_del_agg_v2.ShouldDelete(parsed_key, mode);
call_time = stop_watch_should_delete.ElapsedNanos();
} else {
rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(),
true /* auto_start */);
range_del_agg.ShouldDelete(parsed_key, mode);
call_time = stop_watch_should_delete.ElapsedNanos();
}
rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(),
true /* auto_start */);
range_del_agg.ShouldDelete(parsed_key, mode);
uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
if (j == 0) {
stats.time_first_should_delete += call_time;

File diff suppressed because it is too large.


@ -1,295 +0,0 @@
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/range_del_aggregator_v2.h"
#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/heap.h"
#include "util/kv_map.h"
#include "util/vector_iterator.h"
namespace rocksdb {
TruncatedRangeDelIterator::TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
const InternalKeyComparator* icmp, const InternalKey* smallest,
const InternalKey* largest)
: iter_(std::move(iter)), icmp_(icmp) {
if (smallest != nullptr) {
pinned_bounds_.emplace_back();
auto& parsed_smallest = pinned_bounds_.back();
if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) {
assert(false);
}
smallest_ = &parsed_smallest;
}
if (largest != nullptr) {
pinned_bounds_.emplace_back();
auto& parsed_largest = pinned_bounds_.back();
if (!ParseInternalKey(largest->Encode(), &parsed_largest)) {
assert(false);
}
if (parsed_largest.type == kTypeRangeDeletion &&
parsed_largest.sequence == kMaxSequenceNumber) {
// The file boundary has been artificially extended by a range tombstone.
// We do not need to adjust largest to properly truncate range
// tombstones that extend past the boundary.
} else if (parsed_largest.sequence == 0) {
// The largest key in the sstable has a sequence number of 0. Since we
// guarantee that no internal keys with the same user key and sequence
// number can exist in a DB, we know that the largest key in this sstable
// cannot exist as the smallest key in the next sstable. This further
// implies that no range tombstone in this sstable covers largest;
// otherwise, the file boundary would have been artificially extended.
//
// Therefore, we will never truncate a range tombstone at largest, so we
// can leave it unchanged.
} else {
// The same user key may straddle two sstable boundaries. To ensure that
// the truncated end key can cover the largest key in this sstable, reduce
// its sequence number by 1.
parsed_largest.sequence -= 1;
}
largest_ = &parsed_largest;
}
}
bool TruncatedRangeDelIterator::Valid() const {
return iter_->Valid() &&
(smallest_ == nullptr ||
icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
(largest_ == nullptr ||
icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
}
void TruncatedRangeDelIterator::Next() { iter_->TopNext(); }
void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); }
// NOTE: target is a user key
void TruncatedRangeDelIterator::Seek(const Slice& target) {
if (largest_ != nullptr &&
icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
kTypeRangeDeletion)) <= 0) {
iter_->Invalidate();
return;
}
iter_->Seek(target);
}
// NOTE: target is a user key
void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
if (smallest_ != nullptr &&
icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
*smallest_) < 0) {
iter_->Invalidate();
return;
}
iter_->SeekForPrev(target);
}
void TruncatedRangeDelIterator::SeekToFirst() { iter_->SeekToTopFirst(); }
void TruncatedRangeDelIterator::SeekToLast() { iter_->SeekToTopLast(); }
ForwardRangeDelIterator::ForwardRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters)
: icmp_(icmp),
iters_(iters),
unused_idx_(0),
active_seqnums_(SeqMaxComparator()),
active_iters_(EndKeyMinComparator(icmp)),
inactive_iters_(StartKeyMinComparator(icmp)) {}
bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
assert(iters_ != nullptr);
// Pick up previously unseen iterators.
for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end();
++it, ++unused_idx_) {
auto& iter = *it;
iter->Seek(parsed.user_key);
PushIter(iter.get(), parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move active iterators that end before parsed.
while (!active_iters_.empty() &&
icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
TruncatedRangeDelIterator* iter = PopActiveIter();
do {
iter->Next();
} while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move inactive iterators that start before parsed.
while (!inactive_iters_.empty() &&
icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
TruncatedRangeDelIterator* iter = PopInactiveIter();
while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
iter->Next();
}
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
return active_seqnums_.empty()
? false
: (*active_seqnums_.begin())->seq() > parsed.sequence;
}
void ForwardRangeDelIterator::Invalidate() {
unused_idx_ = 0;
active_iters_.clear();
active_seqnums_.clear();
inactive_iters_.clear();
}
ReverseRangeDelIterator::ReverseRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters)
: icmp_(icmp),
iters_(iters),
unused_idx_(0),
active_seqnums_(SeqMaxComparator()),
active_iters_(StartKeyMaxComparator(icmp)),
inactive_iters_(EndKeyMaxComparator(icmp)) {}
bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
assert(iters_ != nullptr);
// Pick up previously unseen iterators.
for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end();
++it, ++unused_idx_) {
auto& iter = *it;
iter->SeekForPrev(parsed.user_key);
PushIter(iter.get(), parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move active iterators that start after parsed.
while (!active_iters_.empty() &&
icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
TruncatedRangeDelIterator* iter = PopActiveIter();
do {
iter->Prev();
} while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move inactive iterators that end after parsed.
while (!inactive_iters_.empty() &&
icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
TruncatedRangeDelIterator* iter = PopInactiveIter();
while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
iter->Prev();
}
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
return active_seqnums_.empty()
? false
: (*active_seqnums_.begin())->seq() > parsed.sequence;
}
void ReverseRangeDelIterator::Invalidate() {
unused_idx_ = 0;
active_iters_.clear();
active_seqnums_.clear();
inactive_iters_.clear();
}
RangeDelAggregatorV2::RangeDelAggregatorV2(const InternalKeyComparator* icmp,
SequenceNumber /* upper_bound */)
: icmp_(icmp), forward_iter_(icmp, &iters_), reverse_iter_(icmp, &iters_) {}
void RangeDelAggregatorV2::AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest, const InternalKey* largest) {
if (input_iter == nullptr || input_iter->empty()) {
return;
}
if (wrapped_range_del_agg != nullptr) {
wrapped_range_del_agg->AddTombstones(std::move(input_iter), smallest,
largest);
// TODO: this eats the status of the wrapped call; may want to propagate it
return;
}
iters_.emplace_back(new TruncatedRangeDelIterator(std::move(input_iter),
icmp_, smallest, largest));
}
bool RangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) {
if (wrapped_range_del_agg != nullptr) {
return wrapped_range_del_agg->ShouldDelete(parsed, mode);
}
switch (mode) {
case RangeDelPositioningMode::kForwardTraversal:
reverse_iter_.Invalidate();
return forward_iter_.ShouldDelete(parsed);
case RangeDelPositioningMode::kBackwardTraversal:
forward_iter_.Invalidate();
return reverse_iter_.ShouldDelete(parsed);
default:
assert(false);
return false;
}
}
bool RangeDelAggregatorV2::IsRangeOverlapped(const Slice& start,
const Slice& end) {
assert(wrapped_range_del_agg == nullptr);
InvalidateRangeDelMapPositions();
// Set the internal start/end keys so that:
// - if start_ikey has the same user key and sequence number as the current
// end key, start_ikey will be considered greater; and
// - if end_ikey has the same user key and sequence number as the current
// start key, end_ikey will be considered greater.
ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
static_cast<ValueType>(0));
ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
for (auto& iter : iters_) {
bool checked_candidate_tombstones = false;
for (iter->SeekForPrev(start);
iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
iter->Next()) {
checked_candidate_tombstones = true;
if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
return true;
}
}
if (!checked_candidate_tombstones) {
// Do an additional check for when the end of the range is the begin key
// of a tombstone, which we missed earlier since SeekForPrev'ing to the
// start was invalid.
iter->SeekForPrev(end);
if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
return true;
}
}
}
return false;
}
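// Usage sketch for IsRangeOverlapped(); the endpoint behaviour below is what
// the start_ikey/end_ikey sequence numbers above encode. Assume range_del_agg
// holds a single tombstone covering ["a", "g"), as in the unit tests.
bool hit = range_del_agg.IsRangeOverlapped("_", "a");   // true: the query range is
                                                        // inclusive, so touching the
                                                        // tombstone's start key counts
bool miss = range_del_agg.IsRangeOverlapped("g", "l");  // false: the tombstone's end
                                                        // key "g" is exclusive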
} // namespace rocksdb

View File

@ -1,295 +0,0 @@
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/heap.h"
#include "util/kv_map.h"
namespace rocksdb {
class RangeDelAggregatorV2;
class TruncatedRangeDelIterator {
public:
TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
const InternalKeyComparator* icmp, const InternalKey* smallest,
const InternalKey* largest);
bool Valid() const;
void Next();
void Prev();
// Seeks to the tombstone with the highest visible sequence number that covers
// target (a user key). If no such tombstone exists, the position will be at
// the earliest tombstone that ends after target.
void Seek(const Slice& target);
// Seeks to the tombstone with the highest visible sequence number that covers
// target (a user key). If no such tombstone exists, the position will be at
// the latest tombstone that starts before target.
void SeekForPrev(const Slice& target);
void SeekToFirst();
void SeekToLast();
ParsedInternalKey start_key() const {
return (smallest_ == nullptr ||
icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
? iter_->parsed_start_key()
: *smallest_;
}
ParsedInternalKey end_key() const {
return (largest_ == nullptr ||
icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
? iter_->parsed_end_key()
: *largest_;
}
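// Clamping example with hypothetical values: if the raw fragment is ["a", "n")
// and the file bounds were smallest = ("d", 7) and largest = ("m", 9), then
// start_key() returns ("d", 7) and end_key() returns the truncated ("m", 8)
// computed in the constructor.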
SequenceNumber seq() const { return iter_->seq(); }
private:
std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
const InternalKeyComparator* icmp_;
const ParsedInternalKey* smallest_ = nullptr;
const ParsedInternalKey* largest_ = nullptr;
std::list<ParsedInternalKey> pinned_bounds_;
};
struct SeqMaxComparator {
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return a->seq() > b->seq();
}
};
class ForwardRangeDelIterator {
public:
ForwardRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters);
bool ShouldDelete(const ParsedInternalKey& parsed);
void Invalidate();
private:
using ActiveSeqSet =
std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
struct StartKeyMinComparator {
explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return icmp->Compare(a->start_key(), b->start_key()) > 0;
}
const InternalKeyComparator* icmp;
};
struct EndKeyMinComparator {
explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const ActiveSeqSet::const_iterator& a,
const ActiveSeqSet::const_iterator& b) const {
return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
}
const InternalKeyComparator* icmp;
};
void PushIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
if (!iter->Valid()) {
// The iterator has been fully consumed, so we don't need to add it to
// either of the heaps.
} else if (icmp_->Compare(parsed, iter->start_key()) < 0) {
PushInactiveIter(iter);
} else {
PushActiveIter(iter);
}
}
void PushActiveIter(TruncatedRangeDelIterator* iter) {
auto seq_pos = active_seqnums_.insert(iter);
active_iters_.push(seq_pos);
}
TruncatedRangeDelIterator* PopActiveIter() {
auto active_top = active_iters_.top();
auto iter = *active_top;
active_iters_.pop();
active_seqnums_.erase(active_top);
return iter;
}
void PushInactiveIter(TruncatedRangeDelIterator* iter) {
inactive_iters_.push(iter);
}
TruncatedRangeDelIterator* PopInactiveIter() {
auto* iter = inactive_iters_.top();
inactive_iters_.pop();
return iter;
}
const InternalKeyComparator* icmp_;
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters_;
size_t unused_idx_;
ActiveSeqSet active_seqnums_;
BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
};
class ReverseRangeDelIterator {
public:
ReverseRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters);
bool ShouldDelete(const ParsedInternalKey& parsed);
void Invalidate();
private:
using ActiveSeqSet =
std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
struct EndKeyMaxComparator {
explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const TruncatedRangeDelIterator* a,
const TruncatedRangeDelIterator* b) const {
return icmp->Compare(a->end_key(), b->end_key()) < 0;
}
const InternalKeyComparator* icmp;
};
struct StartKeyMaxComparator {
explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
bool operator()(const ActiveSeqSet::const_iterator& a,
const ActiveSeqSet::const_iterator& b) const {
return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
}
const InternalKeyComparator* icmp;
};
void PushIter(TruncatedRangeDelIterator* iter,
const ParsedInternalKey& parsed) {
if (!iter->Valid()) {
// The iterator has been fully consumed, so we don't need to add it to
// either of the heaps.
} else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
PushInactiveIter(iter);
} else {
PushActiveIter(iter);
}
}
void PushActiveIter(TruncatedRangeDelIterator* iter) {
auto seq_pos = active_seqnums_.insert(iter);
active_iters_.push(seq_pos);
}
TruncatedRangeDelIterator* PopActiveIter() {
auto active_top = active_iters_.top();
auto iter = *active_top;
active_iters_.pop();
active_seqnums_.erase(active_top);
return iter;
}
void PushInactiveIter(TruncatedRangeDelIterator* iter) {
inactive_iters_.push(iter);
}
TruncatedRangeDelIterator* PopInactiveIter() {
auto* iter = inactive_iters_.top();
inactive_iters_.pop();
return iter;
}
const InternalKeyComparator* icmp_;
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters_;
size_t unused_idx_;
ActiveSeqSet active_seqnums_;
BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
};
class RangeDelAggregatorV2 {
public:
RangeDelAggregatorV2(const InternalKeyComparator* icmp,
SequenceNumber upper_bound);
void AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr);
bool ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode);
bool IsRangeOverlapped(const Slice& start, const Slice& end);
void InvalidateRangeDelMapPositions() {
forward_iter_.Invalidate();
reverse_iter_.Invalidate();
}
bool IsEmpty() const { return iters_.empty(); }
bool AddFile(uint64_t file_number) {
return files_seen_.insert(file_number).second;
}
// Adaptor method to pass calls through to an old-style RangeDelAggregator.
// Will be removed once this new version supports an iterator that can be used
// during flush/compaction.
RangeDelAggregator* DelegateToRangeDelAggregator(
const std::vector<SequenceNumber>& snapshots) {
wrapped_range_del_agg.reset(new RangeDelAggregator(
*icmp_, snapshots, true /* collapse_deletions */));
return wrapped_range_del_agg.get();
}
std::unique_ptr<RangeDelIterator> NewIterator() {
assert(wrapped_range_del_agg != nullptr);
return wrapped_range_del_agg->NewIterator();
}
private:
const InternalKeyComparator* icmp_;
std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
std::set<uint64_t> files_seen_;
ForwardRangeDelIterator forward_iter_;
ReverseRangeDelIterator reverse_iter_;
// TODO: remove once V2 supports exposing tombstone iterators
std::unique_ptr<RangeDelAggregator> wrapped_range_del_agg;
};
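// Sketch of the compaction-time delegation path, assuming `snapshots` is the
// compaction's snapshot list and `icmp` an InternalKeyComparator:
//
//   RangeDelAggregatorV2 agg_v2(&icmp, kMaxSequenceNumber);
//   RangeDelAggregator* old_agg = agg_v2.DelegateToRangeDelAggregator(snapshots);
//   // From here on, AddTombstones() and ShouldDelete() on agg_v2 forward to
//   // old_agg, and old_agg (via agg_v2.NewIterator()) supplies the tombstone
//   // iterator that V2 does not yet expose.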
} // namespace rocksdb

View File

@ -1,469 +0,0 @@
// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/range_del_aggregator_v2.h"
#include <memory>
#include <string>
#include <vector>
#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/range_tombstone_fragmenter.h"
#include "util/testutil.h"
namespace rocksdb {
class RangeDelAggregatorV2Test : public testing::Test {};
namespace {
static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
std::unique_ptr<InternalIterator> MakeRangeDelIter(
const std::vector<RangeTombstone>& range_dels) {
std::vector<std::string> keys, values;
for (const auto& range_del : range_dels) {
auto key_and_value = range_del.Serialize();
keys.push_back(key_and_value.first.Encode().ToString());
values.push_back(key_and_value.second.ToString());
}
return std::unique_ptr<test::VectorIterator>(
new test::VectorIterator(keys, values));
}
std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
MakeFragmentedTombstoneLists(
const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
for (const auto& range_dels : range_dels_list) {
auto range_del_iter = MakeRangeDelIter(range_dels);
fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
std::move(range_del_iter), bytewise_icmp));
}
return fragment_lists;
}
struct TruncatedIterScanTestCase {
ParsedInternalKey start;
ParsedInternalKey end;
SequenceNumber seq;
};
struct TruncatedIterSeekTestCase {
Slice target;
ParsedInternalKey start;
ParsedInternalKey end;
SequenceNumber seq;
bool invalid;
};
struct ShouldDeleteTestCase {
ParsedInternalKey lookup_key;
bool result;
};
struct IsRangeOverlappedTestCase {
Slice start;
Slice end;
bool result;
};
ParsedInternalKey UncutEndpoint(const Slice& s) {
return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
}
ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) {
return ParsedInternalKey(key, seq, kTypeValue);
}
void VerifyIterator(
TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
// Test forward iteration.
iter->SeekToFirst();
for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
}
EXPECT_FALSE(iter->Valid());
// Test reverse iteration.
iter->SeekToLast();
std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
expected_range_dels.rbegin(), expected_range_dels.rend());
for (size_t i = 0; i < reverse_expected_range_dels.size();
i++, iter->Prev()) {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(0, icmp.Compare(iter->start_key(),
reverse_expected_range_dels[i].start));
EXPECT_EQ(
0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
}
EXPECT_FALSE(iter->Valid());
}
void VerifySeek(TruncatedRangeDelIterator* iter,
const InternalKeyComparator& icmp,
const std::vector<TruncatedIterSeekTestCase>& test_cases) {
for (const auto& test_case : test_cases) {
iter->Seek(test_case.target);
if (test_case.invalid) {
ASSERT_FALSE(iter->Valid());
} else {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
EXPECT_EQ(test_case.seq, iter->seq());
}
}
}
void VerifySeekForPrev(
TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
const std::vector<TruncatedIterSeekTestCase>& test_cases) {
for (const auto& test_case : test_cases) {
iter->SeekForPrev(test_case.target);
if (test_case.invalid) {
ASSERT_FALSE(iter->Valid());
} else {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
EXPECT_EQ(test_case.seq, iter->seq());
}
}
}
void VerifyShouldDelete(RangeDelAggregatorV2* range_del_agg,
const std::vector<ShouldDeleteTestCase>& test_cases) {
for (const auto& test_case : test_cases) {
EXPECT_EQ(
test_case.result,
range_del_agg->ShouldDelete(
test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
}
for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
const auto& test_case = *it;
EXPECT_EQ(
test_case.result,
range_del_agg->ShouldDelete(
test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
}
}
void VerifyIsRangeOverlapped(
RangeDelAggregatorV2* range_del_agg,
const std::vector<IsRangeOverlappedTestCase>& test_cases) {
for (const auto& test_case : test_cases) {
EXPECT_EQ(test_case.result,
range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
}
}
} // namespace
TEST_F(RangeDelAggregatorV2Test, EmptyTruncatedIter) {
auto range_del_iter = MakeRangeDelIter({});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber,
bytewise_icmp));
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
nullptr);
iter.SeekToFirst();
ASSERT_FALSE(iter.Valid());
iter.SeekToLast();
ASSERT_FALSE(iter.Valid());
}
TEST_F(RangeDelAggregatorV2Test, UntruncatedIter) {
auto range_del_iter =
MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber,
bytewise_icmp));
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
nullptr);
VerifyIterator(&iter, bytewise_icmp,
{{UncutEndpoint("a"), UncutEndpoint("e"), 10},
{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), UncutEndpoint("n"), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
}
TEST_F(RangeDelAggregatorV2Test, UntruncatedIterWithSnapshot) {
auto range_del_iter =
MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(&fragment_list, 9 /* snapshot */,
bytewise_icmp));
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
nullptr);
VerifyIterator(&iter, bytewise_icmp,
{{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), UncutEndpoint("n"), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
}
TEST_F(RangeDelAggregatorV2Test, TruncatedIter) {
auto range_del_iter =
MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber,
bytewise_icmp));
InternalKey smallest("d", 7, kTypeValue);
InternalKey largest("m", 9, kTypeValue);
TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
&smallest, &largest);
VerifyIterator(&iter, bytewise_icmp,
{{InternalValue("d", 7), UncutEndpoint("e"), 10},
{UncutEndpoint("e"), UncutEndpoint("g"), 8},
{UncutEndpoint("j"), InternalValue("m", 8), 4}});
VerifySeek(
&iter, bytewise_icmp,
{{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("j"), InternalValue("m", 8), 4},
{"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
{"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
VerifySeekForPrev(
&iter, bytewise_icmp,
{{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
{"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
{"n", UncutEndpoint("j"), InternalValue("m", 8), 4},
{"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
}
TEST_F(RangeDelAggregatorV2Test, SingleIterInAggregator) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber,
bytewise_icmp));
RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
range_del_agg.AddTombstones(std::move(input_iter));
VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
{InternalValue("b", 9), true},
{InternalValue("d", 9), true},
{InternalValue("e", 7), true},
{InternalValue("g", 7), false}});
VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
{"_", "a", true},
{"a", "c", true},
{"d", "f", true},
{"g", "l", false}});
}
TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregator) {
auto fragment_lists = MakeFragmentedTombstoneLists(
{{{"a", "e", 10}, {"c", "g", 8}},
{{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
for (const auto& fragment_list : fragment_lists) {
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(
fragment_list.get(), kMaxSequenceNumber, bytewise_icmp));
range_del_agg.AddTombstones(std::move(input_iter));
}
VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
{InternalValue("b", 19), false},
{InternalValue("b", 9), true},
{InternalValue("d", 9), true},
{InternalValue("e", 7), true},
{InternalValue("g", 7), false},
{InternalValue("h", 24), true},
{InternalValue("i", 24), false},
{InternalValue("ii", 14), true},
{InternalValue("j", 14), false}});
VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
{"_", "a", true},
{"a", "c", true},
{"d", "f", true},
{"g", "l", true},
{"x", "y", false}});
}
TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregatorWithUpperBound) {
auto fragment_lists = MakeFragmentedTombstoneLists(
{{{"a", "e", 10}, {"c", "g", 8}},
{{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19);
for (const auto& fragment_list : fragment_lists) {
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(fragment_list.get(),
19 /* snapshot */, bytewise_icmp));
range_del_agg.AddTombstones(std::move(input_iter));
}
VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
{InternalValue("a", 9), true},
{InternalValue("b", 9), true},
{InternalValue("d", 9), true},
{InternalValue("e", 7), true},
{InternalValue("g", 7), false},
{InternalValue("h", 24), false},
{InternalValue("i", 24), false},
{InternalValue("ii", 14), true},
{InternalValue("j", 14), false}});
VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
{"_", "a", true},
{"a", "c", true},
{"d", "f", true},
{"g", "l", true},
{"x", "y", false}});
}
TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregator) {
auto fragment_lists = MakeFragmentedTombstoneLists(
{{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
{InternalKey("a", 4, kTypeValue),
InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
{InternalKey("m", 20, kTypeValue),
InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
{InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19);
for (size_t i = 0; i < fragment_lists.size(); i++) {
const auto& fragment_list = fragment_lists[i];
const auto& bounds = iter_bounds[i];
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(fragment_list.get(),
19 /* snapshot */, bytewise_icmp));
range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
&bounds.second);
}
VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
{InternalValue("a", 9), false},
{InternalValue("a", 4), true},
{InternalValue("m", 10), false},
{InternalValue("m", 9), true},
{InternalValue("x", 10), false},
{InternalValue("x", 9), false},
{InternalValue("x", 5), true},
{InternalValue("z", 9), false}});
VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
{"_", "a", true},
{"a", "n", true},
{"l", "x", true},
{"w", "z", true},
{"zzz", "zz", false},
{"zz", "zzz", false}});
}
TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregatorSameLevel) {
auto fragment_lists = MakeFragmentedTombstoneLists(
{{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
{InternalKey("a", 4, kTypeValue),
InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
{InternalKey("m", 20, kTypeValue),
InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
{InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19);
auto add_iter_to_agg = [&](size_t i) {
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
19 /* snapshot */, bytewise_icmp));
range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
&iter_bounds[i].second);
};
add_iter_to_agg(0);
VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
{InternalValue("a", 9), false},
{InternalValue("a", 4), true}});
add_iter_to_agg(1);
VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
{InternalValue("m", 9), true}});
add_iter_to_agg(2);
VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
{InternalValue("x", 9), false},
{InternalValue("x", 5), true},
{InternalValue("z", 9), false}});
VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
{"_", "a", true},
{"a", "n", true},
{"l", "x", true},
{"w", "z", true},
{"zzz", "zz", false},
{"zz", "zzz", false}});
}
} // namespace rocksdb
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View File

@ -20,7 +20,8 @@ namespace rocksdb {
FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp) {
const InternalKeyComparator& icmp, bool for_compaction,
const std::vector<SequenceNumber>& snapshots) {
if (unfragmented_tombstones == nullptr) {
return;
}
@ -43,7 +44,8 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
}
}
if (is_sorted) {
FragmentTombstones(std::move(unfragmented_tombstones), icmp);
FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
snapshots);
return;
}
@ -61,12 +63,13 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
// VectorIterator implicitly sorts by key during construction.
auto iter = std::unique_ptr<VectorIterator>(
new VectorIterator(std::move(keys), std::move(values), &icmp));
FragmentTombstones(std::move(iter), icmp);
FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
}
void FragmentedRangeTombstoneList::FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp) {
const InternalKeyComparator& icmp, bool for_compaction,
const std::vector<SequenceNumber>& snapshots) {
Slice cur_start_key(nullptr, 0);
auto cmp = ParsedInternalKeyComparator(&icmp);
@ -117,10 +120,38 @@ void FragmentedRangeTombstoneList::FragmentTombstones(
}
std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
std::greater<SequenceNumber>());
size_t start_idx = tombstone_seqs_.size();
size_t end_idx = start_idx + seqnums_to_flush.size();
tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
seqnums_to_flush.end());
if (for_compaction) {
// Drop all tombstone seqnums that are not preserved by a snapshot.
SequenceNumber next_snapshot = kMaxSequenceNumber;
for (auto seq : seqnums_to_flush) {
if (seq <= next_snapshot) {
// This seqnum is visible by a lower snapshot.
tombstone_seqs_.push_back(seq);
seq_set_.insert(seq);
auto upper_bound_it =
std::lower_bound(snapshots.begin(), snapshots.end(), seq);
if (upper_bound_it == snapshots.begin()) {
// This seqnum is the topmost one visible by the earliest
// snapshot. None of the seqnums below it will be visible, so we
// can skip them.
break;
}
next_snapshot = *std::prev(upper_bound_it);
}
}
end_idx = tombstone_seqs_.size();
} else {
// The fragmentation is being done for reads, so preserve all seqnums.
tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
seqnums_to_flush.end());
seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
}
assert(start_idx < end_idx);
tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);
cur_start_key = cur_end_key;
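// A standalone walk-through of the snapshot-stripping loop above, with
// hypothetical inputs (seqnums_to_flush sorted descending, snapshots ascending):
std::vector<SequenceNumber> ExampleKeptSeqnums() {
  std::vector<SequenceNumber> seqnums_to_flush = {30, 20, 10, 5};
  std::vector<SequenceNumber> snapshots = {8, 25};
  std::vector<SequenceNumber> kept;
  SequenceNumber next_snapshot = kMaxSequenceNumber;
  for (auto seq : seqnums_to_flush) {
    if (seq <= next_snapshot) {
      kept.push_back(seq);
      auto it = std::lower_bound(snapshots.begin(), snapshots.end(), seq);
      if (it == snapshots.begin()) {
        break;
      }
      next_snapshot = *std::prev(it);
    }
  }
  // kept == {30, 20, 5}: seqnum 10 is dropped because 20 already covers the
  // (8, 25] snapshot stripe, and after the first seqnum at or below the
  // earliest snapshot (5 <= 8) the loop stops.
  return kept;
}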
@ -143,6 +174,11 @@ void FragmentedRangeTombstoneList::FragmentTombstones(
const Slice& ikey = unfragmented_tombstones->key();
Slice tombstone_start_key = ExtractUserKey(ikey);
SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
if (!unfragmented_tombstones->IsKeyPinned()) {
pinned_slices_.emplace_back(tombstone_start_key.data(),
tombstone_start_key.size());
tombstone_start_key = pinned_slices_.back();
}
no_tombstones = false;
Slice tombstone_end_key = unfragmented_tombstones->value();
@ -157,13 +193,7 @@ void FragmentedRangeTombstoneList::FragmentTombstones(
// this new start key.
flush_current_tombstones(tombstone_start_key);
}
if (unfragmented_tombstones->IsKeyPinned()) {
cur_start_key = tombstone_start_key;
} else {
pinned_slices_.emplace_back(tombstone_start_key.data(),
tombstone_start_key.size());
cur_start_key = pinned_slices_.back();
}
cur_start_key = tombstone_start_key;
cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
}
@ -178,33 +208,41 @@ void FragmentedRangeTombstoneList::FragmentTombstones(
}
}
bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
SequenceNumber upper) const {
auto seq_it = seq_set_.lower_bound(lower);
return seq_it != seq_set_.end() && *seq_it <= upper;
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const FragmentedRangeTombstoneList* tombstones, SequenceNumber snapshot,
const InternalKeyComparator& icmp)
const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
SequenceNumber _lower_bound)
: tombstone_start_cmp_(icmp.user_comparator()),
tombstone_end_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_(tombstones),
snapshot_(snapshot) {
upper_bound_(_upper_bound),
lower_bound_(_lower_bound) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
pinned_pos_ = tombstones_->end();
Invalidate();
}
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
SequenceNumber snapshot, const InternalKeyComparator& icmp)
const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
SequenceNumber _lower_bound)
: tombstone_start_cmp_(icmp.user_comparator()),
tombstone_end_cmp_(icmp.user_comparator()),
icmp_(&icmp),
ucmp_(icmp.user_comparator()),
tombstones_ref_(tombstones),
tombstones_(tombstones_ref_.get()),
snapshot_(snapshot) {
upper_bound_(_upper_bound),
lower_bound_(_lower_bound) {
assert(tombstones_ != nullptr);
pos_ = tombstones_->end();
seq_pos_ = tombstones_->seq_end();
pinned_pos_ = tombstones_->end();
pinned_seq_pos_ = tombstones_->seq_end();
Invalidate();
}
void FragmentedRangeTombstoneIterator::SeekToFirst() {
@ -220,7 +258,7 @@ void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
pos_ = tombstones_->begin();
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
ScanForwardToVisibleTombstone();
}
@ -237,7 +275,7 @@ void FragmentedRangeTombstoneIterator::SeekToTopLast() {
pos_ = std::prev(tombstones_->end());
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
ScanBackwardToVisibleTombstone();
}
@ -270,7 +308,7 @@ void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
}
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
}
void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
@ -289,25 +327,28 @@ void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
--pos_;
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
}
void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
while (pos_ != tombstones_->end() &&
seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
(seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
*seq_pos_ < lower_bound_)) {
++pos_;
if (pos_ == tombstones_->end()) {
Invalidate();
return;
}
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
}
}
void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
while (pos_ != tombstones_->end() &&
seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
(seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
*seq_pos_ < lower_bound_)) {
if (pos_ == tombstones_->begin()) {
Invalidate();
return;
@ -315,7 +356,7 @@ void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
--pos_;
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
}
}
@ -333,14 +374,13 @@ void FragmentedRangeTombstoneIterator::TopNext() {
}
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
ScanForwardToVisibleTombstone();
}
void FragmentedRangeTombstoneIterator::Prev() {
if (seq_pos_ == tombstones_->seq_begin()) {
pos_ = tombstones_->end();
seq_pos_ = tombstones_->seq_end();
Invalidate();
return;
}
--seq_pos_;
@ -358,7 +398,7 @@ void FragmentedRangeTombstoneIterator::TopPrev() {
--pos_;
seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
tombstones_->seq_iter(pos_->seq_end_idx),
snapshot_, std::greater<SequenceNumber>());
upper_bound_, std::greater<SequenceNumber>());
ScanBackwardToVisibleTombstone();
}
@ -372,4 +412,27 @@ SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
return ValidPos() && ucmp_->Compare(start_key(), user_key) <= 0 ? seq() : 0;
}
std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
FragmentedRangeTombstoneIterator::SplitBySnapshot(
const std::vector<SequenceNumber>& snapshots) {
std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
splits;
SequenceNumber lower = 0;
SequenceNumber upper;
for (size_t i = 0; i <= snapshots.size(); i++) {
if (i >= snapshots.size()) {
upper = kMaxSequenceNumber;
} else {
upper = snapshots[i];
}
if (tombstones_->ContainsRange(lower, upper)) {
splits.emplace(upper, std::unique_ptr<FragmentedRangeTombstoneIterator>(
new FragmentedRangeTombstoneIterator(
tombstones_, *icmp_, upper, lower)));
}
lower = upper + 1;
}
return splits;
}
} // namespace rocksdb
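// Sketch of how SplitBySnapshot() partitions sequence numbers into stripes,
// assuming `iter` is a FragmentedRangeTombstoneIterator over tombstones whose
// seqnums span 1..15, with snapshots {3, 9}:
auto split_iters = iter.SplitBySnapshot({3, 9});
// split_iters[3]                  -> view over seqnums in [0, 3]
// split_iters[9]                  -> view over seqnums in [4, 9]
// split_iters[kMaxSequenceNumber] -> view over seqnums in [10, kMaxSequenceNumber]
// A stripe only appears in the map if ContainsRange(lower, upper) found at
// least one tombstone seqnum in it, so empty stripes are skipped.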

View File

@ -7,6 +7,7 @@
#include <list>
#include <memory>
#include <set>
#include <string>
#include <vector>
@ -38,7 +39,8 @@ struct FragmentedRangeTombstoneList {
};
FragmentedRangeTombstoneList(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp);
const InternalKeyComparator& icmp, bool for_compaction = false,
const std::vector<SequenceNumber>& snapshots = {});
std::vector<RangeTombstoneStack>::const_iterator begin() const {
return tombstones_.begin();
@ -60,7 +62,11 @@ struct FragmentedRangeTombstoneList {
return tombstone_seqs_.end();
}
bool empty() const { return tombstones_.size() == 0; }
bool empty() const { return tombstones_.empty(); }
// Returns true if the stored tombstones contain one with a sequence
// number in [lower, upper].
bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
private:
// Given an ordered range tombstone iterator unfragmented_tombstones,
@ -68,10 +74,12 @@ struct FragmentedRangeTombstoneList {
// tombstones_ and tombstone_seqs_.
void FragmentTombstones(
std::unique_ptr<InternalIterator> unfragmented_tombstones,
const InternalKeyComparator& icmp);
const InternalKeyComparator& icmp, bool for_compaction,
const std::vector<SequenceNumber>& snapshots);
std::vector<RangeTombstoneStack> tombstones_;
std::vector<SequenceNumber> tombstone_seqs_;
std::set<SequenceNumber> seq_set_;
std::list<std::string> pinned_slices_;
PinnedIteratorsManager pinned_iters_mgr_;
};
@ -88,11 +96,13 @@ struct FragmentedRangeTombstoneList {
class FragmentedRangeTombstoneIterator : public InternalIterator {
public:
FragmentedRangeTombstoneIterator(
const FragmentedRangeTombstoneList* tombstones, SequenceNumber snapshot,
const InternalKeyComparator& icmp);
const FragmentedRangeTombstoneList* tombstones,
const InternalKeyComparator& icmp, SequenceNumber upper_bound,
SequenceNumber lower_bound = 0);
FragmentedRangeTombstoneIterator(
const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
SequenceNumber snapshot, const InternalKeyComparator& icmp);
const InternalKeyComparator& icmp, SequenceNumber upper_bound,
SequenceNumber lower_bound = 0);
void SeekToFirst() override;
void SeekToLast() override;
@ -136,8 +146,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
seq_pos_ = tombstones_->seq_end();
}
// TODO: implement properly
RangeTombstone tombstone() const {
RangeTombstone Tombstone() const {
return RangeTombstone(start_key(), end_key(), seq());
}
Slice start_key() const { return pos_->start_key; }
@ -151,12 +160,24 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
kTypeRangeDeletion);
}
ParsedInternalKey internal_key() const {
return ParsedInternalKey(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
}
SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
// Splits the iterator into n+1 iterators (where n is the number of
// snapshots), each providing a view over a "stripe" of sequence numbers. The
// iterators are keyed by the upper bound of their ranges (the provided
// snapshots + kMaxSequenceNumber).
//
// NOTE: the iterators in the returned map are no longer valid if their
// parent iterator is deleted, since they do not modify the refcount of the
// underlying tombstone list. Therefore, this map should be deleted before
// the parent iterator.
std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
SequenceNumber upper_bound() const { return upper_bound_; }
SequenceNumber lower_bound() const { return lower_bound_; }
private:
using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
@ -217,10 +238,12 @@ class FragmentedRangeTombstoneIterator : public InternalIterator {
const RangeTombstoneStackStartComparator tombstone_start_cmp_;
const RangeTombstoneStackEndComparator tombstone_end_cmp_;
const InternalKeyComparator* icmp_;
const Comparator* ucmp_;
std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
const FragmentedRangeTombstoneList* tombstones_;
SequenceNumber snapshot_;
SequenceNumber upper_bound_;
SequenceNumber lower_bound_;
std::vector<RangeTombstoneStack>::const_iterator pos_;
std::vector<SequenceNumber>::const_iterator seq_pos_;
mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;

View File

@ -29,15 +29,26 @@ std::unique_ptr<InternalIterator> MakeRangeDelIter(
new test::VectorIterator(keys, values));
}
void CheckIterPosition(const RangeTombstone& tombstone,
const FragmentedRangeTombstoneIterator* iter) {
// Test InternalIterator interface.
EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
EXPECT_EQ(tombstone.end_key_, iter->value());
EXPECT_EQ(tombstone.seq_, iter->seq());
// Test FragmentedRangeTombstoneIterator interface.
EXPECT_EQ(tombstone.start_key_, iter->start_key());
EXPECT_EQ(tombstone.end_key_, iter->end_key());
EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
}
void VerifyFragmentedRangeDels(
FragmentedRangeTombstoneIterator* iter,
const std::vector<RangeTombstone>& expected_tombstones) {
iter->SeekToFirst();
for (size_t i = 0; i < expected_tombstones.size() && iter->Valid();
i++, iter->Next()) {
EXPECT_EQ(iter->start_key(), expected_tombstones[i].start_key_);
EXPECT_EQ(iter->value(), expected_tombstones[i].end_key_);
EXPECT_EQ(iter->seq(), expected_tombstones[i].seq_);
for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
ASSERT_TRUE(iter->Valid());
CheckIterPosition(expected_tombstones[i], iter);
}
EXPECT_FALSE(iter->Valid());
}
@ -46,11 +57,9 @@ void VerifyVisibleTombstones(
FragmentedRangeTombstoneIterator* iter,
const std::vector<RangeTombstone>& expected_tombstones) {
iter->SeekToTopFirst();
for (size_t i = 0; i < expected_tombstones.size() && iter->Valid();
i++, iter->TopNext()) {
EXPECT_EQ(iter->start_key(), expected_tombstones[i].start_key_);
EXPECT_EQ(iter->value(), expected_tombstones[i].end_key_);
EXPECT_EQ(iter->seq(), expected_tombstones[i].seq_);
for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
ASSERT_TRUE(iter->Valid());
CheckIterPosition(expected_tombstones[i], iter);
}
EXPECT_FALSE(iter->Valid());
}
@ -69,9 +78,7 @@ void VerifySeek(FragmentedRangeTombstoneIterator* iter,
ASSERT_FALSE(iter->Valid());
} else {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(testcase.expected_position.start_key_, iter->start_key());
EXPECT_EQ(testcase.expected_position.end_key_, iter->value());
EXPECT_EQ(testcase.expected_position.seq_, iter->seq());
CheckIterPosition(testcase.expected_position, iter);
}
}
}
@ -84,9 +91,7 @@ void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
ASSERT_FALSE(iter->Valid());
} else {
ASSERT_TRUE(iter->Valid());
EXPECT_EQ(testcase.expected_position.start_key_, iter->start_key());
EXPECT_EQ(testcase.expected_position.end_key_, iter->value());
EXPECT_EQ(testcase.expected_position.seq_, iter->seq());
CheckIterPosition(testcase.expected_position, iter);
}
}
}
@ -112,8 +117,10 @@ TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
VerifyMaxCoveringTombstoneSeqnum(&iter,
{{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
@ -124,8 +131,10 @@ TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(
&iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter,
@ -138,8 +147,10 @@ TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(
&iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
VerifyMaxCoveringTombstoneSeqnum(&iter,
@ -152,8 +163,10 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(&iter,
{{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
@ -165,8 +178,10 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"a", "c", 7},
{"a", "c", 3},
@ -186,8 +201,10 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
{"a", "c", 20},
{"a", "c", 10},
@ -211,16 +228,16 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter2(&fragment_list, 9 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter3(&fragment_list, 7 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter4(&fragment_list, 5 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter5(&fragment_list, 3 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
9 /* upper_bound */);
FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
7 /* upper_bound */);
FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
5 /* upper_bound */);
FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
3 /* upper_bound */);
for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
{"c", "e", 10},
@ -234,6 +251,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
{"l", "n", 4}});
}
ASSERT_EQ(0, iter1.lower_bound());
ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
{"c", "e", 10},
{"e", "g", 8},
@ -243,6 +262,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
VerifyMaxCoveringTombstoneSeqnum(
&iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
ASSERT_EQ(0, iter2.lower_bound());
ASSERT_EQ(9, iter2.upper_bound());
VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
{"e", "g", 8},
{"g", "i", 6},
@ -251,6 +272,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
VerifyMaxCoveringTombstoneSeqnum(
&iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
ASSERT_EQ(0, iter3.lower_bound());
ASSERT_EQ(7, iter3.upper_bound());
VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
{"e", "g", 6},
{"g", "i", 6},
@ -259,10 +282,14 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
VerifyMaxCoveringTombstoneSeqnum(
&iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
ASSERT_EQ(0, iter4.lower_bound());
ASSERT_EQ(5, iter4.upper_bound());
VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
VerifyMaxCoveringTombstoneSeqnum(
&iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
ASSERT_EQ(0, iter5.lower_bound());
ASSERT_EQ(3, iter5.upper_bound());
VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
VerifyMaxCoveringTombstoneSeqnum(
&iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
@ -277,8 +304,10 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, 9 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
9 /* upper_bound */);
ASSERT_EQ(0, iter.lower_bound());
ASSERT_EQ(9, iter.upper_bound());
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"c", "e", 8},
@ -293,6 +322,116 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
&iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"j", "n", 4},
{"c", "i", 6},
{"c", "g", 8},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
{} /* snapshots */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber /* upper_bound */);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"e", "g", 8},
{"g", "i", 6},
{"j", "l", 4},
{"l", "n", 4}});
}
TEST_F(RangeTombstoneFragmenterTest,
OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"j", "n", 4},
{"c", "i", 6},
{"c", "g", 8},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(
std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
{20, 9} /* snapshots */);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber /* upper_bound */);
VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
{"c", "e", 10},
{"c", "e", 8},
{"e", "g", 8},
{"g", "i", 6},
{"j", "l", 4},
{"l", "n", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"j", "n", 4},
{"c", "i", 6},
{"c", "g", 8},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber /* upper_bound */);
auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
ASSERT_EQ(1, split_iters.size());
auto* split_iter = split_iters[kMaxSequenceNumber].get();
ASSERT_EQ(0, split_iter->lower_bound());
ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
{"c", "e", 10},
{"e", "g", 8},
{"g", "i", 6},
{"j", "l", 4},
{"l", "n", 4}});
}
TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
{"j", "n", 4},
{"c", "i", 6},
{"c", "g", 8},
{"j", "l", 2}});
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber /* upper_bound */);
auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
ASSERT_EQ(5, split_iters.size());
auto* split_iter1 = split_iters[3].get();
ASSERT_EQ(0, split_iter1->lower_bound());
ASSERT_EQ(3, split_iter1->upper_bound());
VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
auto* split_iter2 = split_iters[5].get();
ASSERT_EQ(4, split_iter2->lower_bound());
ASSERT_EQ(5, split_iter2->upper_bound());
VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
auto* split_iter3 = split_iters[7].get();
ASSERT_EQ(6, split_iter3->lower_bound());
ASSERT_EQ(7, split_iter3->upper_bound());
VerifyVisibleTombstones(split_iter3,
{{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
auto* split_iter4 = split_iters[9].get();
ASSERT_EQ(8, split_iter4->lower_bound());
ASSERT_EQ(9, split_iter4->upper_bound());
VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
ASSERT_EQ(10, split_iter5->lower_bound());
ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
}
TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
// Same tombstones as OverlapAndRepeatedStartKey.
auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
@ -304,8 +443,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
VerifySeek(
&iter1,
{{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
@ -313,8 +452,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
&iter1,
{{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
3 /* upper_bound */);
VerifySeek(&iter2, {{"a", {"j", "l", 2}},
{"e", {"j", "l", 2}},
{"l", {}, true /* out of range */}});
@ -334,8 +473,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
VerifySeek(
&iter1,
{{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
@ -343,8 +482,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
&iter1,
{{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
3 /* upper_bound */);
VerifySeek(&iter2, {{"b", {"j", "l", 2}},
{"f", {"j", "l", 2}},
{"m", {}, true /* out of range */}});
@ -364,8 +503,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
VerifySeek(&iter1, {{"c", {"c", "e", 10}},
{"g", {"g", "i", 6}},
{"i", {"j", "l", 4}},
@ -375,8 +514,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
{"i", {"g", "i", 6}},
{"n", {"l", "n", 4}}});
FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
3 /* upper_bound */);
VerifySeek(&iter2, {{"c", {"j", "l", 2}},
{"g", {"j", "l", 2}},
{"i", {"j", "l", 2}},
@ -398,8 +537,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber,
bytewise_icmp);
FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
kMaxSequenceNumber);
VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
VerifySeekForPrev(&iter,
{{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});

View File

@ -417,11 +417,16 @@ class Repairer {
SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
auto write_hint = cfd->CalculateSSTWriteHint(0);
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters;
auto range_del_iter =
mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
status = BuildTable(
dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
env_options_, table_cache_, iter.get(),
std::unique_ptr<InternalIterator>(
mem->NewRangeTombstoneIterator(ro, vset_.LastSequence())),
env_options_, table_cache_, iter.get(), std::move(range_del_iters),
&meta, cfd->internal_comparator(),
cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
{}, kMaxSequenceNumber, snapshot_checker, kNoCompression,

View File

@ -185,7 +185,7 @@ Status TableCache::FindTable(const EnvOptions& env_options,
InternalIterator* TableCache::NewIterator(
const ReadOptions& options, const EnvOptions& env_options,
const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
RangeDelAggregatorV2* range_del_agg, const SliceTransform* prefix_extractor,
RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
bool for_compaction, Arena* arena, bool skip_filters, int level,
const InternalKey* smallest_compaction_key,

View File

@ -15,7 +15,7 @@
#include <stdint.h>
#include "db/dbformat.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "rocksdb/cache.h"
@ -52,7 +52,7 @@ class TableCache {
InternalIterator* NewIterator(
const ReadOptions& options, const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileMetaData& file_meta, RangeDelAggregatorV2* range_del_agg,
const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
const SliceTransform* prefix_extractor = nullptr,
TableReader** table_reader_ptr = nullptr,
HistogramImpl* file_read_hist = nullptr, bool for_compaction = false,

View File

@ -579,7 +579,7 @@ std::string VersionEdit::DebugString(bool hex_key) const {
AppendNumberTo(&r, max_column_family_);
}
if (is_in_atomic_group_) {
r.append("\n AtomicGroup: ");
r.append("\n AtomicGroup: ");
AppendNumberTo(&r, remaining_entries_);
r.append(" entries remains");
}

View File

@ -301,17 +301,28 @@ class FilePicker {
// On Level-n (n>=1), files are sorted. Binary search to find the
// earliest file whose largest key >= ikey. Search left bound and
// right bound are used to narrow the range.
if (search_left_bound_ == search_right_bound_) {
start_index = search_left_bound_;
} else if (search_left_bound_ < search_right_bound_) {
if (search_left_bound_ <= search_right_bound_) {
if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
search_right_bound_ =
static_cast<int32_t>(curr_file_level_->num_files) - 1;
}
// `search_right_bound_` is an inclusive upper-bound, but since it was
// determined based on user key, it is still possible the lookup key
// falls to the right of `search_right_bound_`'s corresponding file.
// So, pass a limit one higher, which allows us to detect this case.
start_index =
FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
static_cast<uint32_t>(search_left_bound_),
static_cast<uint32_t>(search_right_bound_));
static_cast<uint32_t>(search_right_bound_) + 1);
if (start_index == search_right_bound_ + 1) {
// `ikey_` comes after `search_right_bound_`. The lookup key does
// not exist on this level, so let's skip this level and do a full
// binary search on the next level.
search_left_bound_ = 0;
search_right_bound_ = FileIndexer::kLevelMaxIndex;
curr_level_++;
continue;
}
} else {
// search_left_bound > search_right_bound, key does not exist in
// this level. Since no comparison is done in this level, it will
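
The comment in this hunk is the heart of the change: search_right_bound_ is an inclusive index derived from user keys, so the internal lookup key can still sort after that file's largest key. Passing an exclusive limit one past the bound lets the binary search report that overshoot as bound + 1, and the FilePicker then falls through to the next level instead of probing the wrong file. A stand-alone sketch of the pattern, with integer keys instead of InternalKey and illustrative names:

#include <cstdint>
#include <vector>

// Returns the first index in [left, right) whose value is >= key, or `right`
// if every value is smaller -- the "overshoot" case the FilePicker now checks.
uint32_t FindFirstGE(const std::vector<uint64_t>& largest_keys, uint64_t key,
                     uint32_t left, uint32_t right) {
  while (left < right) {
    uint32_t mid = left + (right - left) / 2;
    if (largest_keys[mid] < key) {
      left = mid + 1;
    } else {
      right = mid;
    }
  }
  return left;
}

// Caller pattern mirroring the FilePicker change: pass an exclusive limit one
// past the inclusive bound, and treat `bound + 1` as "not on this level".
bool KeyMayBeOnLevel(const std::vector<uint64_t>& largest_keys, uint64_t key,
                     uint32_t left_bound, uint32_t right_bound /* inclusive */) {
  uint32_t idx = FindFirstGE(largest_keys, key, left_bound, right_bound + 1);
  return idx != right_bound + 1;
}
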
@ -459,7 +470,7 @@ class LevelIterator final : public InternalIterator {
const EnvOptions& env_options, const InternalKeyComparator& icomparator,
const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor,
bool should_sample, HistogramImpl* file_read_hist, bool for_compaction,
bool skip_filters, int level, RangeDelAggregatorV2* range_del_agg,
bool skip_filters, int level, RangeDelAggregator* range_del_agg,
const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
nullptr)
: table_cache_(table_cache),
@ -571,7 +582,7 @@ class LevelIterator final : public InternalIterator {
bool skip_filters_;
size_t file_index_;
int level_;
RangeDelAggregatorV2* range_del_agg_;
RangeDelAggregator* range_del_agg_;
IteratorWrapper file_iter_; // May be nullptr
PinnedIteratorsManager* pinned_iters_mgr_;
@ -985,7 +996,7 @@ double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
void Version::AddIterators(const ReadOptions& read_options,
const EnvOptions& soptions,
MergeIteratorBuilder* merge_iter_builder,
RangeDelAggregatorV2* range_del_agg) {
RangeDelAggregator* range_del_agg) {
assert(storage_info_.finalized_);
for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
@ -998,7 +1009,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
const EnvOptions& soptions,
MergeIteratorBuilder* merge_iter_builder,
int level,
RangeDelAggregatorV2* range_del_agg) {
RangeDelAggregator* range_del_agg) {
assert(storage_info_.finalized_);
if (level >= storage_info_.num_non_empty_levels()) {
// This is an empty level
@ -1057,8 +1068,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
Arena arena;
Status status;
RangeDelAggregatorV2 range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
ReadRangeDelAggregator range_del_agg(&icmp,
kMaxSequenceNumber /* upper_bound */);
*overlap = false;
@ -2849,6 +2860,7 @@ Status VersionSet::ProcessManifestWrites(
batch_edits.push_back(first_writer.edit_list.front());
} else {
auto it = manifest_writers_.cbegin();
size_t group_start = std::numeric_limits<size_t>::max();
while (it != manifest_writers_.cend()) {
if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
// no group commits for column family add or drop
@ -2857,7 +2869,36 @@ Status VersionSet::ProcessManifestWrites(
last_writer = *(it++);
assert(last_writer != nullptr);
assert(last_writer->cfd != nullptr);
if (last_writer->cfd != nullptr && last_writer->cfd->IsDropped()) {
if (last_writer->cfd->IsDropped()) {
// If we detect a dropped CF at this point, and the corresponding
// version edits belong to an atomic group, then we need to find out
// the preceding version edits in the same atomic group, and update
// their `remaining_entries_` member variable because we are NOT going
// to write the version edits' of dropped CF to the MANIFEST. If we
// don't update, then Recover can report corrupted atomic group because
// the `remaining_entries_` do not match.
if (!batch_edits.empty()) {
if (batch_edits.back()->is_in_atomic_group_ &&
batch_edits.back()->remaining_entries_ > 0) {
assert(group_start < batch_edits.size());
const auto& edit_list = last_writer->edit_list;
size_t k = 0;
while (k < edit_list.size()) {
if (!edit_list[k]->is_in_atomic_group_) {
break;
} else if (edit_list[k]->remaining_entries_ == 0) {
++k;
break;
}
++k;
}
for (auto i = group_start; i < batch_edits.size(); ++i) {
assert(static_cast<uint32_t>(k) <=
batch_edits.back()->remaining_entries_);
batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
}
}
}
continue;
}
// We do a linear search on versions because versions is small.
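
The adjustment described in the comment above hinges on how atomic groups are encoded: each VersionEdit records in remaining_entries_ how many edits of its group still follow, so a three-edit group carries the counters 2, 1, 0 (the test further down builds exactly that with MarkAtomicGroup(--remaining)). When a dropped column family's edits are skipped, every edit already batched for the same group must be reduced by the number of skipped edits, otherwise Recover() sees counters that never reach zero. A simplified sketch of that bookkeeping, using a plain struct instead of VersionEdit:

#include <cstdint>
#include <vector>

struct Edit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // edits of this group still expected after this one
};

// For a group written as 2, 1, 0: if `dropped` trailing edits of the group are
// skipped, subtract `dropped` from every edit already batched since the group
// started so the counts still reach 0 at the new group end. The caller
// guarantees dropped <= remaining_entries of each adjusted edit.
void AdjustGroupForDroppedEdits(std::vector<Edit>* batch, size_t group_start,
                                uint32_t dropped) {
  for (size_t i = group_start; i < batch->size(); ++i) {
    (*batch)[i].remaining_entries -= dropped;
  }
}
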
@ -2888,6 +2929,15 @@ Status VersionSet::ProcessManifestWrites(
}
assert(builder != nullptr); // make checker happy
for (const auto& e : last_writer->edit_list) {
if (e->is_in_atomic_group_) {
if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
(batch_edits.back()->is_in_atomic_group_ &&
batch_edits.back()->remaining_entries_ == 0)) {
group_start = batch_edits.size();
}
} else if (group_start != std::numeric_limits<size_t>::max()) {
group_start = std::numeric_limits<size_t>::max();
}
LogAndApplyHelper(last_writer->cfd, builder, version, e, mu);
batch_edits.push_back(e);
}
@ -2900,6 +2950,42 @@ Status VersionSet::ProcessManifestWrites(
}
}
#ifndef NDEBUG
// Verify that version edits of atomic groups have correct
// remaining_entries_.
size_t k = 0;
while (k < batch_edits.size()) {
while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
++k;
}
if (k == batch_edits.size()) {
break;
}
size_t i = k;
while (i < batch_edits.size()) {
if (!batch_edits[i]->is_in_atomic_group_) {
break;
}
assert(i - k + batch_edits[i]->remaining_entries_ ==
batch_edits[k]->remaining_entries_);
if (batch_edits[i]->remaining_entries_ == 0) {
++i;
break;
}
++i;
}
assert(batch_edits[i - 1]->is_in_atomic_group_);
assert(0 == batch_edits[i - 1]->remaining_entries_);
std::vector<VersionEdit*> tmp;
for (size_t j = k; j != i; ++j) {
tmp.emplace_back(batch_edits[j]);
}
TEST_SYNC_POINT_CALLBACK(
"VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
k = i;
}
#endif // NDEBUG
uint64_t new_manifest_file_size = 0;
Status s;
@ -3205,7 +3291,7 @@ Status VersionSet::LogAndApply(
if (!manifest_writers_.empty()) {
manifest_writers_.front()->cv.Signal();
}
return Status::OK();
return Status::ShutdownInProgress();
}
return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
@ -4253,7 +4339,7 @@ void VersionSet::AddLiveFiles(std::vector<FileDescriptor>* live_list) {
}
InternalIterator* VersionSet::MakeInputIterator(
const Compaction* c, RangeDelAggregatorV2* range_del_agg,
const Compaction* c, RangeDelAggregator* range_del_agg,
const EnvOptions& env_options_compactions) {
auto cfd = c->column_family_data();
ReadOptions read_options;

View File

@ -34,7 +34,7 @@
#include "db/dbformat.h"
#include "db/file_indexer.h"
#include "db/log_reader.h"
#include "db/range_del_aggregator_v2.h"
#include "db/range_del_aggregator.h"
#include "db/read_callback.h"
#include "db/table_cache.h"
#include "db/version_builder.h"
@ -538,11 +538,11 @@ class Version {
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
MergeIteratorBuilder* merger_iter_builder,
RangeDelAggregatorV2* range_del_agg);
RangeDelAggregator* range_del_agg);
void AddIteratorsForLevel(const ReadOptions&, const EnvOptions& soptions,
MergeIteratorBuilder* merger_iter_builder,
int level, RangeDelAggregatorV2* range_del_agg);
int level, RangeDelAggregator* range_del_agg);
Status OverlapWithLevelIterator(const ReadOptions&, const EnvOptions&,
const Slice& smallest_user_key,
@ -935,7 +935,7 @@ class VersionSet {
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
InternalIterator* MakeInputIterator(
const Compaction* c, RangeDelAggregatorV2* range_del_agg,
const Compaction* c, RangeDelAggregator* range_del_agg,
const EnvOptions& env_options_compactions);
// Add all files listed in any live version to *live.

View File

@ -605,9 +605,13 @@ TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
ASSERT_TRUE(Overlaps("600", "700"));
}
class VersionSetTest : public testing::Test {
class VersionSetTestBase {
public:
VersionSetTest()
const static std::string kColumnFamilyName1;
const static std::string kColumnFamilyName2;
const static std::string kColumnFamilyName3;
VersionSetTestBase()
: env_(Env::Default()),
dbname_(test::PerThreadDBPath("version_set_test")),
db_options_(),
@ -635,8 +639,9 @@ class VersionSetTest : public testing::Test {
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::vector<std::string> cf_names = {kDefaultColumnFamilyName,
"alice", "bob"};
const std::vector<std::string> cf_names = {
kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
kColumnFamilyName3};
const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
autovector<VersionEdit> new_cfs;
uint64_t last_seq = 1;
@ -711,6 +716,15 @@ class VersionSetTest : public testing::Test {
std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
};
const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
class VersionSetTest : public VersionSetTestBase, public testing::Test {
public:
VersionSetTest() : VersionSetTestBase() {}
};
TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
NewDB();
const int kGroupSize = 5;
@ -958,6 +972,126 @@ TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) {
versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
EXPECT_TRUE(incorrect_group_size);
}
class VersionSetTestDropOneCF : public VersionSetTestBase,
public testing::TestWithParam<std::string> {
public:
VersionSetTestDropOneCF() : VersionSetTestBase() {}
};
// This test simulates the following execution sequence
// Time thread1 bg_flush_thr
// | Prepare version edits (e1,e2,e3) for atomic
// | flush cf1, cf2, cf3
// | Enqueue e to drop cfi
// | to manifest_writers_
// | Enqueue (e1,e2,e3) to manifest_writers_
// |
// | Apply e,
// | cfi.IsDropped() is true
// | Apply (e1,e2,e3),
// | since cfi.IsDropped() == true, we need to
// | drop ei and write the rest to MANIFEST.
// V
//
// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
// last column family in an atomic group.
TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
std::vector<ColumnFamilyDescriptor> column_families;
SequenceNumber last_seqno;
std::unique_ptr<log::Writer> log_writer;
PrepareManifest(&column_families, &last_seqno, &log_writer);
Status s = SetCurrentFile(env_, dbname_, 1, nullptr);
ASSERT_OK(s);
EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
EXPECT_EQ(column_families.size(),
versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
const int kAtomicGroupSize = 3;
const std::vector<std::string> non_default_cf_names = {
kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
// Drop one column family
VersionEdit drop_cf_edit;
drop_cf_edit.DropColumnFamily();
const std::string cf_to_drop_name(GetParam());
auto cfd_to_drop =
versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
ASSERT_NE(nullptr, cfd_to_drop);
// Increase its refcount because cfd_to_drop is used later, and we need to
// prevent it from being deleted.
cfd_to_drop->Ref();
drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
mutex_.Lock();
s = versions_->LogAndApply(cfd_to_drop,
*cfd_to_drop->GetLatestMutableCFOptions(),
&drop_cf_edit, &mutex_);
mutex_.Unlock();
ASSERT_OK(s);
std::vector<VersionEdit> edits(kAtomicGroupSize);
uint32_t remaining = kAtomicGroupSize;
size_t i = 0;
autovector<ColumnFamilyData*> cfds;
autovector<const MutableCFOptions*> mutable_cf_options_list;
autovector<autovector<VersionEdit*>> edit_lists;
for (const auto& cf_name : non_default_cf_names) {
auto cfd = (cf_name != cf_to_drop_name)
? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
: cfd_to_drop;
ASSERT_NE(nullptr, cfd);
cfds.push_back(cfd);
mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
edits[i].SetColumnFamily(cfd->GetID());
edits[i].SetLogNumber(0);
edits[i].SetNextFile(2);
edits[i].MarkAtomicGroup(--remaining);
edits[i].SetLastSequence(last_seqno++);
autovector<VersionEdit*> tmp_edits;
tmp_edits.push_back(&edits[i]);
edit_lists.emplace_back(tmp_edits);
++i;
}
int called = 0;
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->SetCallBack(
"VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
std::vector<VersionEdit*>* tmp_edits =
reinterpret_cast<std::vector<VersionEdit*>*>(arg);
EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
for (const auto e : *tmp_edits) {
bool found = false;
for (const auto& e2 : edits) {
if (&e2 == e) {
found = true;
break;
}
}
ASSERT_TRUE(found);
}
++called;
});
SyncPoint::GetInstance()->EnableProcessing();
mutex_.Lock();
s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
&mutex_);
mutex_.Unlock();
ASSERT_OK(s);
ASSERT_EQ(1, called);
if (cfd_to_drop->Unref()) {
delete cfd_to_drop;
cfd_to_drop = nullptr;
}
}
INSTANTIATE_TEST_CASE_P(
AtomicGroup, VersionSetTestDropOneCF,
testing::Values(VersionSetTestBase::kColumnFamilyName1,
VersionSetTestBase::kColumnFamilyName2,
VersionSetTestBase::kColumnFamilyName3));
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -4,6 +4,7 @@
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
@ -144,13 +145,18 @@ struct TableFileDeletionInfo {
};
struct FileOperationInfo {
using TimePoint = std::chrono::time_point<std::chrono::system_clock,
std::chrono::nanoseconds>;
const std::string& path;
uint64_t offset;
size_t length;
time_t start_timestamp;
time_t finish_timestamp;
const TimePoint& start_timestamp;
const TimePoint& finish_timestamp;
Status status;
FileOperationInfo(const std::string& _path) : path(_path) {}
FileOperationInfo(const std::string& _path, const TimePoint& start,
const TimePoint& finish)
: path(_path), start_timestamp(start), finish_timestamp(finish) {}
};
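
Replacing the time_t pair with std::chrono time points lets listeners measure sub-second I/O latency directly. A hedged sketch of consuming the new fields (it assumes the EventListener read-finish callback is handed a FileOperationInfo as declared in this header; everything else uses names from this struct):

#include <chrono>
#include <iostream>

#include "rocksdb/listener.h"

// Minimal sketch of a listener that logs per-read latency from the new
// chrono-based timestamps.
class LatencyLoggingListener : public rocksdb::EventListener {
 public:
  void OnFileReadFinish(const rocksdb::FileOperationInfo& info) override {
    auto nanos = std::chrono::duration_cast<std::chrono::nanoseconds>(
                     info.finish_timestamp - info.start_timestamp)
                     .count();
    std::cout << "read " << info.length << " bytes from " << info.path
              << " at offset " << info.offset << " in " << nanos << " ns\n";
  }
};
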
struct FlushJobInfo {

View File

@ -201,8 +201,8 @@ struct PerfContext {
uint64_t env_lock_file_nanos;
uint64_t env_unlock_file_nanos;
uint64_t env_new_logger_nanos;
std::map<uint32_t, PerfContextByLevel>* level_to_perf_context;
bool per_level_perf_context_enabled;
std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
bool per_level_perf_context_enabled = false;
};
// Get Thread-local PerfContext object pointer
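
level_to_perf_context is a lazily allocated map, so before these in-class initializers a freshly constructed PerfContext could expose an indeterminate pointer. With the nullptr/false defaults, a guarded read is reliable even if per-level collection was never enabled. A small sketch (the specific counter name is an assumption about PerfContextByLevel's members):

#include <iostream>

#include "rocksdb/perf_context.h"

// Sketch: read the per-level counters only when they were actually allocated.
void DumpPerLevelCounters() {
  const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  if (!ctx->per_level_perf_context_enabled ||
      ctx->level_to_perf_context == nullptr) {
    return;  // the new defaults make this check safe before any DB work
  }
  for (const auto& level_and_counters : *ctx->level_to_perf_context) {
    std::cout << "level " << level_and_counters.first << ": "
              << level_and_counters.second.bloom_filter_useful
              << " bloom filter negatives\n";
  }
}
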

View File

@ -6,7 +6,7 @@
#define ROCKSDB_MAJOR 5
#define ROCKSDB_MINOR 18
#define ROCKSDB_PATCH 0
#define ROCKSDB_PATCH 4
// Do not use these. We made the mistake of declaring macros starting with
// double underscore. Now we have to live with our choice. We'll deprecate these
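
Only the patch component changes here (5.18.0 becomes 5.18.4). Downstream code that depends on a fix from a particular patch release can gate on these macros at compile time; a small illustrative sketch (the helper macro and the threshold are not part of this header):

#include "rocksdb/version.h"

#define ROCKSDB_VERSION_GE(major, minor, patch)              \
  (ROCKSDB_MAJOR > (major) ||                                \
   (ROCKSDB_MAJOR == (major) &&                              \
    (ROCKSDB_MINOR > (minor) ||                              \
     (ROCKSDB_MINOR == (minor) && ROCKSDB_PATCH >= (patch)))))

#if !ROCKSDB_VERSION_GE(5, 18, 4)
#error "expected RocksDB 5.18.4 or newer"
#endif
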

View File

@ -313,17 +313,19 @@ if(NOT EXISTS ${JAVA_TEST_LIBDIR})
file(MAKE_DIRECTORY mkdir ${JAVA_TEST_LIBDIR})
endif()
if (DEFINED CUSTOM_REPO_URL)
set(SEARCH_REPO_URL ${CUSTOM_REPO_URL}/)
set(CENTRAL_REPO_URL ${CUSTOM_REPO_URL}/)
if (DEFINED CUSTOM_DEPS_URL)
set(DEPS_URL ${CUSTOM_DEPS_URL}/)
else ()
set(SEARCH_REPO_URL "http://search.maven.org/remotecontent?filepath=")
set(CENTRAL_REPO_URL "http://central.maven.org/maven2/")
# This is a URL for artifacts from a "fake" release on pdillinger's fork,
# so as not to put binaries in git (ew). We should move to hosting these
# under the facebook account on github, or something else more reliable
# than maven.org, which has been failing frequently from Travis.
set(DEPS_URL "https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps")
endif()
if(NOT EXISTS ${JAVA_JUNIT_JAR})
message("Downloading ${JAVA_JUNIT_JAR}")
file(DOWNLOAD ${SEARCH_REPO_URL}junit/junit/4.12/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
file(DOWNLOAD ${DEPS_URL}/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
list(GET downloadStatus 0 error_code)
if(NOT error_code EQUAL 0)
message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}")
@ -332,7 +334,7 @@ if(NOT EXISTS ${JAVA_JUNIT_JAR})
endif()
if(NOT EXISTS ${JAVA_HAMCR_JAR})
message("Downloading ${JAVA_HAMCR_JAR}")
file(DOWNLOAD ${SEARCH_REPO_URL}org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
file(DOWNLOAD ${DEPS_URL}/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
list(GET downloadStatus 0 error_code)
if(NOT error_code EQUAL 0)
message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}")
@ -341,7 +343,7 @@ if(NOT EXISTS ${JAVA_HAMCR_JAR})
endif()
if(NOT EXISTS ${JAVA_MOCKITO_JAR})
message("Downloading ${JAVA_MOCKITO_JAR}")
file(DOWNLOAD ${SEARCH_REPO_URL}org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
file(DOWNLOAD ${DEPS_URL}/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
list(GET downloadStatus 0 error_code)
if(NOT error_code EQUAL 0)
message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}")
@ -350,7 +352,7 @@ if(NOT EXISTS ${JAVA_MOCKITO_JAR})
endif()
if(NOT EXISTS ${JAVA_CGLIB_JAR})
message("Downloading ${JAVA_CGLIB_JAR}")
file(DOWNLOAD ${SEARCH_REPO_URL}cglib/cglib/2.2.2/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
file(DOWNLOAD ${DEPS_URL}/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
list(GET downloadStatus 0 error_code)
if(NOT error_code EQUAL 0)
message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}")
@ -359,7 +361,7 @@ if(NOT EXISTS ${JAVA_CGLIB_JAR})
endif()
if(NOT EXISTS ${JAVA_ASSERTJ_JAR})
message("Downloading ${JAVA_ASSERTJ_JAR}")
file(DOWNLOAD ${CENTRAL_REPO_URL}org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
file(DOWNLOAD ${DEPS_URL}/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus)
list(GET downloadStatus 0 error_code)
if(NOT error_code EQUAL 0)
message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}")

View File

@ -192,8 +192,11 @@ ifneq ($(DEBUG_LEVEL),0)
JAVAC_ARGS = -Xlint:deprecation -Xlint:unchecked
endif
SEARCH_REPO_URL?=http://search.maven.org/remotecontent?filepath=
CENTRAL_REPO_URL?=http://central.maven.org/maven2/
# This is a URL for artifacts from a "fake" release on pdillinger's fork,
# so as not to put binaries in git (ew). We should move to hosting these
# under the facebook account on github, or something else more reliable
# than maven.org, which has been failing frequently from Travis.
DEPS_URL?=https://github.com/pdillinger/rocksdb/releases/download/v6.6.x-java-deps
clean:
$(AM_V_at)rm -rf include/*
@ -250,11 +253,11 @@ optimistic_transaction_sample: java
resolve_test_deps:
test -d "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)"
test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_JUNIT_JAR) $(SEARCH_REPO_URL)junit/junit/4.12/junit-4.12.jar
test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_HAMCR_JAR) $(SEARCH_REPO_URL)org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar
test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_MOCKITO_JAR)" $(SEARCH_REPO_URL)org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar
test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_CGLIB_JAR)" $(SEARCH_REPO_URL)cglib/cglib/2.2.2/cglib-2.2.2.jar
test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" $(CENTRAL_REPO_URL)org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar
test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_JUNIT_JAR) --location $(DEPS_URL)/junit-4.12.jar
test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_HAMCR_JAR) --location $(DEPS_URL)/hamcrest-core-1.3.jar
test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_MOCKITO_JAR)" --location $(DEPS_URL)/mockito-all-1.10.19.jar
test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_CGLIB_JAR)" --location $(DEPS_URL)/cglib-2.2.2.jar
test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR)" --location $(DEPS_URL)/assertj-core-1.7.1.jar
java_test: java resolve_test_deps
$(AM_V_GEN)mkdir -p $(TEST_CLASSES)

View File

@ -8,11 +8,21 @@ cd /rocksdb-local
# Use scl devtoolset if available (i.e. CentOS <7)
if hash scl 2>/dev/null; then
scl enable devtoolset-2 'make jclean clean'
scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic'
if scl --list | grep -q 'devtoolset-7'; then
scl enable devtoolset-7 'make jclean clean'
scl enable devtoolset-7 'PORTABLE=1 make -j2 rocksdbjavastatic'
elif scl --list | grep -q 'devtoolset-2'; then
scl enable devtoolset-2 'make jclean clean'
scl enable devtoolset-2 'PORTABLE=1 make -j2 rocksdbjavastatic'
else
echo "Could not find devtoolset"
exit 1;
fi
else
make jclean clean
PORTABLE=1 make -j8 rocksdbjavastatic
PORTABLE=1 make -j2 rocksdbjavastatic
fi
cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target

View File

@ -131,7 +131,6 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format,
}
assert(format != nullptr);
assert(ap != nullptr);
const std::unique_ptr<char[]> msg = format_str(format, ap);
// pass msg to java callback handler

View File

@ -54,9 +54,9 @@ public enum MemoryUsageType {
* cannot be found
*/
public static MemoryUsageType getMemoryUsageType(final byte byteIdentifier) {
for (final MemoryUsageType MemoryUsageType : MemoryUsageType.values()) {
if (MemoryUsageType.getValue() == byteIdentifier) {
return MemoryUsageType;
for (final MemoryUsageType memoryUsageType : MemoryUsageType.values()) {
if (memoryUsageType.getValue() == byteIdentifier) {
return memoryUsageType;
}
}
@ -64,7 +64,7 @@ public enum MemoryUsageType {
"Illegal value provided for MemoryUsageType.");
}
private MemoryUsageType(byte value) {
MemoryUsageType(byte value) {
value_ = value;
}

View File

@ -4,6 +4,10 @@ public class Environment {
private static String OS = System.getProperty("os.name").toLowerCase();
private static String ARCH = System.getProperty("os.arch").toLowerCase();
public static boolean isAarch64() {
return ARCH.contains("aarch64");
}
public static boolean isPowerPC() {
return ARCH.contains("ppc");
}
@ -59,7 +63,7 @@ public class Environment {
public static String getJniLibraryName(final String name) {
if (isUnix()) {
final String arch = is64Bit() ? "64" : "32";
if(isPowerPC()) {
if(isPowerPC() || isAarch64()) {
return String.format("%sjni-linux-%s", name, ARCH);
} else if(isS390x()) {
return String.format("%sjni-linux%s", name, ARCH);

View File

@ -46,13 +46,13 @@ public class MergeTest {
}
private byte[] longToByteArray(long l) {
ByteBuffer buf = ByteBuffer.allocate(Long.BYTES);
ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE);
buf.putLong(l);
return buf.array();
}
private long longFromByteArray(byte[] a) {
ByteBuffer buf = ByteBuffer.allocate(Long.BYTES);
ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE);
buf.put(a);
buf.flip();
return buf.getLong();

View File

@ -130,6 +130,24 @@ public class EnvironmentTest {
isEqualTo("librocksdbjni.dll");
}
@Test
public void aarch64() {
setEnvironmentClassFields("Linux", "aarch64");
assertThat(Environment.isUnix()).isTrue();
assertThat(Environment.isAarch64()).isTrue();
assertThat(Environment.is64Bit()).isTrue();
assertThat(Environment.getJniLibraryExtension()).
isEqualTo(".so");
assertThat(Environment.getSharedLibraryName("rocksdb")).
isEqualTo("rocksdbjni");
assertThat(Environment.getJniLibraryName("rocksdb")).
isEqualTo("rocksdbjni-linux-aarch64");
assertThat(Environment.getJniLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni-linux-aarch64.so");
assertThat(Environment.getSharedLibraryFileName("rocksdb")).
isEqualTo("librocksdbjni.so");
}
private void setEnvironmentClassFields(String osName,
String osArch) {
setEnvironmentClassField(OS_FIELD_NAME, osName);

port/jemalloc_helper.h (new file)
View File

@ -0,0 +1,53 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#ifdef ROCKSDB_JEMALLOC
#ifdef __FreeBSD__
#include <malloc_np.h>
#else
#include <jemalloc/jemalloc.h>
#endif
#ifndef JEMALLOC_CXX_THROW
#define JEMALLOC_CXX_THROW
#endif
// Declare non-standard jemalloc APIs as weak symbols. We can null-check these
// symbols to detect whether jemalloc is linked with the binary.
extern "C" void* mallocx(size_t, int) __attribute__((__weak__));
extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__));
extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__));
extern "C" size_t sallocx(const void*, int) __attribute__((__weak__));
extern "C" void dallocx(void*, int) __attribute__((__weak__));
extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__));
extern "C" size_t nallocx(size_t, int) __attribute__((__weak__));
extern "C" int mallctl(const char*, void*, size_t*, void*, size_t)
__attribute__((__weak__));
extern "C" int mallctlnametomib(const char*, size_t*, size_t*)
__attribute__((__weak__));
extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*,
size_t) __attribute__((__weak__));
extern "C" void malloc_stats_print(void (*)(void*, const char*), void*,
const char*) __attribute__((__weak__));
extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*)
JEMALLOC_CXX_THROW __attribute__((__weak__));
// Check if Jemalloc is linked with the binary. Note the main program might be
// using a different memory allocator even if this method returns true.
// It is loosely based on folly::usingJEMalloc(), minus the check that actually
// allocate memory and see if it is through jemalloc, to handle the dlopen()
// case:
// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147
static inline bool HasJemalloc() {
return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr &&
sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr &&
nallocx != nullptr && mallctl != nullptr &&
mallctlnametomib != nullptr && mallctlbymib != nullptr &&
malloc_stats_print != nullptr && malloc_usable_size != nullptr;
}
#endif // ROCKSDB_JEMALLOC
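
Because every non-standard jemalloc entry point above is declared as a weak symbol, HasJemalloc() resolves at runtime to whether jemalloc was actually linked, and callers can guard jemalloc-only calls on it. A small sketch of such a guard (the wrapper function is illustrative, not part of this header):

#include <cstdio>

#include "port/jemalloc_helper.h"

#ifdef ROCKSDB_JEMALLOC
// Only touch jemalloc-specific APIs when the weak symbols actually resolved,
// i.e. jemalloc is the allocator linked into the binary.
void MaybePrintJemallocStats() {
  if (!HasJemalloc()) {
    return;  // linked against a different malloc; nothing to print
  }
  malloc_stats_print(nullptr, nullptr, nullptr);
}
#endif  // ROCKSDB_JEMALLOC
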

src.mk
View File

@ -44,7 +44,6 @@ LIB_SOURCES = \
db/merge_helper.cc \
db/merge_operator.cc \
db/range_del_aggregator.cc \
db/range_del_aggregator_v2.cc \
db/range_tombstone_fragmenter.cc \
db/repair.cc \
db/snapshot_impl.cc \
@ -223,6 +222,11 @@ LIB_SOURCES = \
utilities/write_batch_with_index/write_batch_with_index.cc \
utilities/write_batch_with_index/write_batch_with_index_internal.cc \
ifeq ($(ARMCRC_SOURCE),1)
LIB_SOURCES +=\
util/crc32c_arm64.cc
endif
ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
LIB_SOURCES_ASM =\
util/crc32c_ppc_asm.S
@ -334,7 +338,6 @@ MAIN_SOURCES = \
db/repair_test.cc \
db/range_del_aggregator_test.cc \
db/range_del_aggregator_bench.cc \
db/range_del_aggregator_v2_test.cc \
db/range_tombstone_fragmenter_test.cc \
db/table_properties_collector_test.cc \
db/util_merge_operators_test.cc \

View File

@ -2348,7 +2348,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
snapshot = read_options.snapshot->GetSequenceNumber();
}
return new FragmentedRangeTombstoneIterator(
rep_->fragmented_range_dels, snapshot, rep_->internal_comparator);
rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
}
InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator(

View File

@ -18,6 +18,8 @@
#include "util/coding.h"
#include "util/util.h"
#include "util/crc32c_arm64.h"
#ifdef __powerpc64__
#include "util/crc32c_ppc.h"
#include "util/crc32c_ppc_constants.h"
@ -396,6 +398,8 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
return static_cast<uint32_t>(l ^ 0xffffffffu);
}
// Detect if ARM64 CRC or not.
#ifndef HAVE_ARM64_CRC
// Detect if SS42 or not.
#ifndef HAVE_POWER8
@ -434,6 +438,7 @@ static bool isPCLMULQDQ() {
}
#endif // HAVE_POWER8
#endif // HAVE_ARM64_CRC
typedef uint32_t (*Function)(uint32_t, const char*, size_t);
@ -463,6 +468,11 @@ static bool isAltiVec() {
}
#endif
#if defined(__linux__) && defined(HAVE_ARM64_CRC)
uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
return crc32c_arm64(crc, (const unsigned char *)buf, size);
}
#endif
std::string IsFastCrc32Supported() {
bool has_fast_crc = false;
@ -478,6 +488,14 @@ std::string IsFastCrc32Supported() {
has_fast_crc = false;
arch = "PPC";
#endif
#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
if (crc32c_runtime_check()) {
has_fast_crc = true;
arch = "Arm64";
} else {
has_fast_crc = false;
arch = "Arm64";
}
#else
has_fast_crc = isSSE42();
arch = "x86";
@ -1200,7 +1218,15 @@ uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) {
#endif //HAVE_SSE42 && HAVE_PCLMUL
static inline Function Choose_Extend() {
#ifndef HAVE_POWER8
#ifdef HAVE_POWER8
return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
if(crc32c_runtime_check()) {
return ExtendARMImpl;
} else {
return ExtendImpl<Slow_CRC32>;
}
#else
if (isSSE42()) {
if (isPCLMULQDQ()) {
#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C
@ -1216,8 +1242,6 @@ static inline Function Choose_Extend() {
else {
return ExtendImpl<Slow_CRC32>;
}
#else //HAVE_POWER8
return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
#endif
}
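
Choose_Extend now has three arms: PPC/AltiVec, Linux/ARM64 guarded by crc32c_runtime_check(), and the existing x86 SSE4.2 path, all falling back to the table-driven implementation. It is the usual probe-once-then-dispatch-through-a-cached-function-pointer pattern; a self-contained sketch of it follows (only the getauxval probe mirrors the new code, the accelerated arm is a stand-in):

#include <cstddef>
#include <cstdint>

#if defined(__linux__) && defined(__aarch64__)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
#endif

using CrcFn = uint32_t (*)(uint32_t crc, const char* buf, size_t len);

// Portable bitwise CRC-32C (polynomial 0x82F63B78): the fallback arm.
uint32_t SlowCrc(uint32_t crc, const char* buf, size_t len) {
  crc = ~crc;
  for (size_t i = 0; i < len; ++i) {
    crc ^= static_cast<unsigned char>(buf[i]);
    for (int b = 0; b < 8; ++b) {
      crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
  }
  return ~crc;
}

// Stand-in for the accelerated arm; the real one would use the __crc32cd /
// __crc32cw intrinsics the way crc32c_arm64.cc does.
uint32_t HwAcceleratedCrc(uint32_t crc, const char* buf, size_t len) {
  return SlowCrc(crc, buf, len);
}

bool CpuSupportsCrc32() {
#if defined(__linux__) && defined(__aarch64__)
  // Same probe as crc32c_runtime_check(): ask the kernel for the CPU's HWCAP bits.
  return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
#else
  return false;
#endif
}

// Probe once, cache the winner in a function pointer, dispatch through it.
uint32_t Crc32c(uint32_t crc, const char* buf, size_t len) {
  static const CrcFn chosen = CpuSupportsCrc32() ? HwAcceleratedCrc : SlowCrc;
  return chosen(crc, buf, len);
}
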

util/crc32c_arm64.cc (new file)
View File

@ -0,0 +1,56 @@
// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "util/crc32c_arm64.h"
#if defined(__linux__) && defined(HAVE_ARM64_CRC)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
uint32_t crc32c_runtime_check(void) {
uint64_t auxv = getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
}
uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
unsigned len) {
const uint8_t *buf1;
const uint16_t *buf2;
const uint32_t *buf4;
const uint64_t *buf8;
int64_t length = (int64_t)len;
crc ^= 0xffffffff;
buf8 = (const uint64_t *)data;
while ((length -= sizeof(uint64_t)) >= 0) {
crc = __crc32cd(crc, *buf8++);
}
/* The following is more efficient than the straight loop */
buf4 = (const uint32_t *)buf8;
if (length & sizeof(uint32_t)) {
crc = __crc32cw(crc, *buf4++);
length -= 4;
}
buf2 = (const uint16_t *)buf4;
if (length & sizeof(uint16_t)) {
crc = __crc32ch(crc, *buf2++);
length -= 2;
}
buf1 = (const uint8_t *)buf2;
if (length & sizeof(uint8_t))
crc = __crc32cb(crc, *buf1);
crc ^= 0xffffffff;
return crc;
}
#endif

util/crc32c_arm64.h (new file)
View File

@ -0,0 +1,21 @@
// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef UTIL_CRC32C_ARM64_H
#define UTIL_CRC32C_ARM64_H
#include <inttypes.h>
#if defined(__aarch64__) || defined(__AARCH64__)
#ifdef __ARM_FEATURE_CRC32
#define HAVE_ARM64_CRC
#include <arm_acle.h>
extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len);
extern uint32_t crc32c_runtime_check(void);
#endif
#endif
#endif

View File

@ -52,11 +52,12 @@ DeleteScheduler::~DeleteScheduler() {
}
Status DeleteScheduler::DeleteFile(const std::string& file_path,
const std::string& dir_to_sync) {
const std::string& dir_to_sync,
const bool force_bg) {
Status s;
if (rate_bytes_per_sec_.load() <= 0 ||
if (rate_bytes_per_sec_.load() <= 0 || (!force_bg &&
total_trash_size_.load() >
sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load()) {
sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) {
// Rate limiting is disabled or trash size makes up more than
// max_trash_db_ratio_ (default 25%) of the total DB size
TEST_SYNC_POINT("DeleteScheduler::DeleteFile");

View File

@ -46,8 +46,11 @@ class DeleteScheduler {
rate_bytes_per_sec_.store(bytes_per_sec);
}
// Mark file as trash directory and schedule its deletion
Status DeleteFile(const std::string& fname, const std::string& dir_to_sync);
// Mark file as trash directory and schedule its deletion. If force_bg is
// set, it forces the file to always be deleted in the background thread,
// except when rate limiting is disabled
Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
const bool force_bg = false);
// Wait for all files being deleted in the background to finish or for
// destructor to be called.
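
Read together, the two hunks above change DeleteFile()'s fast path to: delete in the foreground when rate limiting is off, or when the trash already exceeds max_trash_db_ratio_ of the DB size and the caller did not force background deletion. The same test, pulled out as a stand-alone predicate (a sketch of the logic only, not the class's actual interface):

#include <cstdint>

// True means "delete the file right now"; false means "rename it to trash and
// let the background thread delete it at the configured rate".
bool ShouldDeleteImmediately(int64_t rate_bytes_per_sec, uint64_t trash_size,
                             uint64_t total_db_size, double max_trash_db_ratio,
                             bool force_bg) {
  if (rate_bytes_per_sec <= 0) {
    return true;  // rate limiting disabled; background deletion buys nothing
  }
  if (!force_bg && trash_size > total_db_size * max_trash_db_ratio) {
    return true;  // trash backlog is already too large; do not add to it
  }
  return false;
}
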

View File

@ -99,17 +99,18 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
}
Slice tmp;
time_t start_ts = 0;
FileOperationInfo::TimePoint start_ts;
uint64_t orig_offset = 0;
if (ShouldNotifyListeners()) {
start_ts = std::chrono::system_clock::to_time_t(
std::chrono::system_clock::now());
start_ts = std::chrono::system_clock::now();
orig_offset = aligned_offset + buf.CurrentSize();
}
s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp,
buf.Destination());
if (ShouldNotifyListeners()) {
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, s);
auto finish_ts = std::chrono::system_clock::now();
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
s);
}
buf.Size(buf.CurrentSize() + tmp.size());
@ -145,16 +146,17 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
Slice tmp_result;
#ifndef ROCKSDB_LITE
time_t start_ts = 0;
FileOperationInfo::TimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = std::chrono::system_clock::to_time_t(
std::chrono::system_clock::now());
start_ts = std::chrono::system_clock::now();
}
#endif
s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos);
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, s);
auto finish_ts = std::chrono::system_clock::now();
NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
finish_ts, s);
}
#endif
@ -442,18 +444,18 @@ Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
#ifndef ROCKSDB_LITE
time_t start_ts = 0;
FileOperationInfo::TimePoint start_ts;
uint64_t old_size = writable_file_->GetFileSize();
if (ShouldNotifyListeners()) {
start_ts = std::chrono::system_clock::to_time_t(
std::chrono::system_clock::now());
start_ts = std::chrono::system_clock::now();
old_size = next_write_offset_;
}
#endif
s = writable_file_->Append(Slice(src, allowed));
#ifndef ROCKSDB_LITE
if (ShouldNotifyListeners()) {
NotifyOnFileWriteFinish(old_size, allowed, start_ts, s);
auto finish_ts = std::chrono::system_clock::now();
NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
}
#endif
if (!s.ok()) {
@ -518,15 +520,15 @@ Status WritableFileWriter::WriteDirect() {
{
IOSTATS_TIMER_GUARD(write_nanos);
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
time_t start_ts(0);
FileOperationInfo::TimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = std::chrono::system_clock::to_time_t(
std::chrono::system_clock::now());
start_ts = std::chrono::system_clock::now();
}
// direct writes must be positional
s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
if (ShouldNotifyListeners()) {
NotifyOnFileWriteFinish(write_offset, size, start_ts, s);
auto finish_ts = std::chrono::system_clock::now();
NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
}
if (!s.ok()) {
buf_.Size(file_advance + leftover_tail);

View File

@ -64,15 +64,13 @@ class SequentialFileReader {
class RandomAccessFileReader {
private:
#ifndef ROCKSDB_LITE
void NotifyOnFileReadFinish(uint64_t offset, size_t length, time_t start_ts,
void NotifyOnFileReadFinish(uint64_t offset, size_t length,
const FileOperationInfo::TimePoint& start_ts,
const FileOperationInfo::TimePoint& finish_ts,
const Status& status) const {
FileOperationInfo info(file_name_);
FileOperationInfo info(file_name_, start_ts, finish_ts);
info.offset = offset;
info.length = length;
info.start_timestamp = start_ts;
time_t finish_ts =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
info.finish_timestamp = finish_ts;
info.status = status;
for (auto& listener : listeners_) {
@ -157,15 +155,13 @@ class RandomAccessFileReader {
class WritableFileWriter {
private:
#ifndef ROCKSDB_LITE
void NotifyOnFileWriteFinish(uint64_t offset, size_t length, time_t start_ts,
void NotifyOnFileWriteFinish(uint64_t offset, size_t length,
const FileOperationInfo::TimePoint& start_ts,
const FileOperationInfo::TimePoint& finish_ts,
const Status& status) {
FileOperationInfo info(file_name_);
FileOperationInfo info(file_name_, start_ts, finish_ts);
info.offset = offset;
info.length = length;
info.start_timestamp = start_ts;
time_t finish_ts =
std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
info.finish_timestamp = finish_ts;
info.status = status;
for (auto& listener : listeners_) {

View File

@ -89,16 +89,23 @@ Status CreateFile(Env* env, const std::string& destination,
Status DeleteSSTFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& dir_to_sync) {
return DeleteDBFile(db_options, fname, dir_to_sync, false);
}
Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname, const std::string& dir_to_sync,
const bool force_bg) {
#ifndef ROCKSDB_LITE
auto sfm =
static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
if (sfm) {
return sfm->ScheduleFileDeletion(fname, dir_to_sync);
return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
} else {
return db_options->env->DeleteFile(fname);
}
#else
(void)dir_to_sync;
(void)force_bg;
// SstFileManager is not supported in ROCKSDB_LITE
return db_options->env->DeleteFile(fname);
#endif

View File

@ -25,4 +25,9 @@ extern Status DeleteSSTFile(const ImmutableDBOptions* db_options,
const std::string& fname,
const std::string& path_to_sync);
extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
const std::string& fname,
const std::string& path_to_sync,
const bool force_bg);
} // namespace rocksdb

View File

@ -92,9 +92,7 @@ class BinaryHeap {
reset_root_cmp_cache();
}
bool empty() const {
return data_.empty();
}
bool empty() const { return data_.empty(); }
size_t size() const { return data_.size(); }

View File

@ -133,12 +133,16 @@ Status NewJemallocNodumpAllocator(
JemallocAllocatorOptions& options,
std::shared_ptr<MemoryAllocator>* memory_allocator) {
*memory_allocator = nullptr;
#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
(void) options;
return Status::NotSupported(
Status unsupported = Status::NotSupported(
"JemallocNodumpAllocator only available with jemalloc version >= 5 "
"and MADV_DONTDUMP is available.");
#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
(void)options;
return unsupported;
#else
if (!HasJemalloc()) {
return unsupported;
}
if (memory_allocator == nullptr) {
return Status::InvalidArgument("memory_allocator must be non-null.");
}

View File

@ -8,6 +8,7 @@
#include <atomic>
#include <vector>
#include "port/jemalloc_helper.h"
#include "port/port.h"
#include "rocksdb/memory_allocator.h"
#include "util/core_local.h"
@ -15,7 +16,6 @@
#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
#include <jemalloc/jemalloc.h>
#include <sys/mman.h>
#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)

View File

@ -402,8 +402,11 @@ bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
}
Status SstFileManagerImpl::ScheduleFileDeletion(
const std::string& file_path, const std::string& path_to_sync) {
return delete_scheduler_.DeleteFile(file_path, path_to_sync);
const std::string& file_path, const std::string& path_to_sync,
const bool force_bg) {
TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion");
return delete_scheduler_.DeleteFile(file_path, path_to_sync,
force_bg);
}
void SstFileManagerImpl::WaitForEmptyTrash() {

View File

@ -111,9 +111,12 @@ class SstFileManagerImpl : public SstFileManager {
// not guaranteed
bool CancelErrorRecovery(ErrorHandler* db);
// Mark file as trash and schedule its deletion.
// Mark file as trash and schedule its deletion. If force_bg is set, it
// forces the file to be deleted in the background regardless of DB size,
// except when rate limited delete is disabled
virtual Status ScheduleFileDeletion(const std::string& file_path,
const std::string& dir_to_sync);
const std::string& dir_to_sync,
const bool force_bg = false);
// Wait for all files being deleted in the background to finish or for
// destructor to be called.

View File

@ -26,6 +26,7 @@
#include "util/cast_util.h"
#include "util/crc32c.h"
#include "util/file_reader_writer.h"
#include "util/file_util.h"
#include "util/filename.h"
#include "util/logging.h"
#include "util/mutexlock.h"
@ -1745,7 +1746,8 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
bfile->PathName().c_str());
blob_files_.erase(bfile->BlobFileNumber());
Status s = env_->DeleteFile(bfile->PathName());
Status s = DeleteDBFile(&(db_impl_->immutable_db_options()),
bfile->PathName(), blob_dir_, true);
if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_.info_log,
"File failed to be deleted as obsolete %s",
@ -1835,7 +1837,7 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options,
uint64_t number;
FileType type;
if (ParseFileName(f, &number, &type) && type == kBlobFile) {
Status del = env->DeleteFile(blobdir + "/" + f);
Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true);
if (status.ok() && !del.ok()) {
status = del;
}

View File

@ -18,6 +18,7 @@
#include "util/cast_util.h"
#include "util/fault_injection_test_env.h"
#include "util/random.h"
#include "util/sst_file_manager_impl.h"
#include "util/string_util.h"
#include "util/sync_point.h"
#include "util/testharness.h"
@ -762,6 +763,52 @@ TEST_F(BlobDBTest, ReadWhileGC) {
}
}
TEST_F(BlobDBTest, SstFileManager) {
// run the same test for Get(), MultiGet() and Iterator each.
std::shared_ptr<SstFileManager> sst_file_manager(
NewSstFileManager(mock_env_.get()));
sst_file_manager->SetDeleteRateBytesPerSecond(1);
SstFileManagerImpl *sfm =
static_cast<SstFileManagerImpl *>(sst_file_manager.get());
BlobDBOptions bdb_options;
bdb_options.min_blob_size = 0;
Options db_options;
int files_deleted_directly = 0;
int files_scheduled_to_delete = 0;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"SstFileManagerImpl::ScheduleFileDeletion",
[&](void * /*arg*/) { files_scheduled_to_delete++; });
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DeleteScheduler::DeleteFile",
[&](void * /*arg*/) { files_deleted_directly++; });
SyncPoint::GetInstance()->EnableProcessing();
db_options.sst_file_manager = sst_file_manager;
Open(bdb_options, db_options);
// Create one obsolete file and clean it.
blob_db_->Put(WriteOptions(), "foo", "bar");
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
ASSERT_EQ(1, blob_files.size());
std::shared_ptr<BlobFile> bfile = blob_files[0];
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
GCStats gc_stats;
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
blob_db_impl()->TEST_DeleteObsoleteFiles();
// Even if SSTFileManager is not set, DB is creating a dummy one.
ASSERT_EQ(1, files_scheduled_to_delete);
ASSERT_EQ(0, files_deleted_directly);
Destroy();
// Make sure that DestroyBlobDB() also goes through delete scheduler.
ASSERT_GE(2, files_scheduled_to_delete);
ASSERT_EQ(0, files_deleted_directly);
SyncPoint::GetInstance()->DisableProcessing();
sfm->WaitForEmptyTrash();
}
TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
BlobDBOptions bdb_options;
bdb_options.min_blob_size = 0;

View File

@ -164,6 +164,16 @@ class CheckpointTest : public testing::Test {
return DB::OpenForReadOnly(options, dbname_, &db_);
}
Status ReadOnlyReopenWithColumnFamilies(const std::vector<std::string>& cfs,
const Options& options) {
std::vector<ColumnFamilyDescriptor> column_families;
for (const auto& cf : cfs) {
column_families.emplace_back(cf, options);
}
return DB::OpenForReadOnly(options, dbname_, column_families, &handles_,
&db_);
}
Status TryReopen(const Options& options) {
Close();
last_options_ = options;
@ -612,6 +622,69 @@ TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) {
db_ = nullptr;
}
TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
ASSERT_OK(Put("foo", "foo_value"));
ASSERT_OK(Flush());
Close();
Options options = CurrentOptions();
ASSERT_OK(ReadOnlyReopen(options));
Checkpoint* checkpoint = nullptr;
ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
delete checkpoint;
checkpoint = nullptr;
Close();
DB* snapshot_db = nullptr;
ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
ReadOptions read_opts;
std::string get_result;
ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
ASSERT_EQ("foo_value", get_result);
delete snapshot_db;
}
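
This test covers the user-visible capability the change enables: a Checkpoint can now be created from a DB handle that was opened read-only. A minimal client-side sketch of the same flow (paths are placeholders, error handling reduced to asserts):

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/checkpoint.h"

// Sketch: take a checkpoint of an existing DB that is opened in read-only mode.
void CheckpointReadOnly(const std::string& db_path,
                        const std::string& checkpoint_path) {
  rocksdb::Options options;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, db_path, &db);
  assert(s.ok());

  rocksdb::Checkpoint* checkpoint = nullptr;
  s = rocksdb::Checkpoint::Create(db, &checkpoint);
  assert(s.ok());
  s = checkpoint->CreateCheckpoint(checkpoint_path);
  assert(s.ok());

  delete checkpoint;
  delete db;
}
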
TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
Options options = CurrentOptions();
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
for (int i = 0; i != 3; ++i) {
ASSERT_OK(Put(i, "foo", "foo_value"));
ASSERT_OK(Flush(i));
}
Close();
Status s = ReadOnlyReopenWithColumnFamilies(
{kDefaultColumnFamilyName, "pikachu", "eevee"}, options);
ASSERT_OK(s);
Checkpoint* checkpoint = nullptr;
ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
delete checkpoint;
checkpoint = nullptr;
Close();
std::vector<ColumnFamilyDescriptor> column_families{
{kDefaultColumnFamilyName, options},
{"pikachu", options},
{"eevee", options}};
DB* snapshot_db = nullptr;
std::vector<ColumnFamilyHandle*> snapshot_handles;
s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles,
&snapshot_db);
ASSERT_OK(s);
ReadOptions read_opts;
for (int i = 0; i != 3; ++i) {
std::string get_result;
s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result);
ASSERT_OK(s);
ASSERT_EQ("foo_value", get_result);
}
for (auto snapshot_h : snapshot_handles) {
delete snapshot_h;
}
snapshot_handles.clear();
delete snapshot_db;
}
} // namespace rocksdb
int main(int argc, char** argv) {

Some files were not shown because too many files have changed in this diff.