Merge branch 'nvm_cache_proto' of https://github.com/facebook/rocksdb into nvm_cache_initial
commit 354588fea4
@ -178,7 +178,7 @@ jobs:
steps:
- pre-steps
- install-gflags
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 all check_some | .circleci/cat_ignore_eagain
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check | .circleci/cat_ignore_eagain
- post-steps

build-linux-release:
@ -572,6 +572,58 @@ jobs:
gtest-parallel $(</tmp/test_list) --output_dir=/tmp | cat # pipe to cat to continuously output status on circleci UI. Otherwise, no status will be printed while the job is running.
- post-steps

build-linux-arm-test-full:
machine:
image: ubuntu-2004:202101-01
resource_class: arm.large
steps:
- pre-steps
- install-gflags
- run: make V=1 J=4 -j4 check | .circleci/cat_ignore_eagain
- post-steps

build-linux-arm:
machine:
image: ubuntu-2004:202101-01
resource_class: arm.large
steps:
- pre-steps
- install-gflags
- run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some | .circleci/cat_ignore_eagain
- post-steps

build-linux-arm-cmake-no_test_run:
machine:
image: ubuntu-2004:202101-01
resource_class: arm.large
environment:
JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64
steps:
- pre-steps
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Build with cmake"
command: |
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=0 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 ..
make -j4
- run:
name: "Build Java with cmake"
command: |
rm -rf build
mkdir build
cd build
cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=0 ..
make -j4 rocksdb rocksdbjni
- post-steps

build-format-compatible:
machine:
image: ubuntu-1604:202007-01
@ -682,6 +734,7 @@ workflows:
- build-linux-gcc-8-no_test_run
- build-linux-gcc-9-no_test_run
- build-linux-gcc-10-cxx20-no_test_run
- build-linux-arm-cmake-no_test_run
build-macos:
jobs:
- build-macos
@ -691,6 +744,9 @@ workflows:
build-cmake-mingw:
jobs:
- build-cmake-mingw
build-linux-arm:
jobs:
- build-linux-arm
nightly:
triggers:
- schedule:
@ -701,3 +757,4 @@ workflows:
- master
jobs:
- build-format-compatible
- build-linux-arm-test-full
@ -61,6 +61,10 @@ matrix:
env: JOB_NAME=make-gcc4.8
- os: linux
compiler: clang
- if: type = pull_request AND commit_message !~ /FULL_CI/
os: linux
arch: arm64
env: TEST_GROUP=platform_dependent
- if: type = pull_request AND commit_message !~ /FULL_CI/
os : linux
arch: arm64
@ -93,6 +97,10 @@ matrix:
os: linux
arch: ppc64le
env: TEST_GROUP=4
- if: type = pull_request AND commit_message !~ /FULL_CI/
os : linux
arch: arm64
env: JOB_NAME=cmake
- if: type = pull_request AND commit_message !~ /FULL_CI/ AND commit_message !~ /java/
os : linux
arch: arm64
@ -296,6 +296,7 @@ else()
endif()

include(CheckCXXSourceCompiles)
set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif()
@ -312,7 +313,6 @@ int main() {
auto d = _mm_cvtsi128_si64(c);
}
" HAVE_SSE42)
unset(CMAKE_REQUIRED_FLAGS)
if(HAVE_SSE42)
add_definitions(-DHAVE_SSE42)
add_definitions(-DHAVE_PCLMUL)
@ -320,18 +320,50 @@ elseif(FORCE_SSE42)
message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
endif()

# Check if -latomic is required or not
if (NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "--std=c++11")
CHECK_CXX_SOURCE_COMPILES("
#include <atomic>
std::atomic<uint64_t> x(0);
int main() {
uint64_t i = x.load(std::memory_order_relaxed);
bool b = x.is_lock_free();
return 0;
}
" BUILTIN_ATOMIC)
if (NOT BUILTIN_ATOMIC)
#TODO: Check if -latomic exists
list(APPEND THIRDPARTY_LIBS atomic)
endif()
endif()

# Reset the required flags
set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})

CHECK_CXX_SOURCE_COMPILES("
#if defined(_MSC_VER) && !defined(__thread)
#define __thread __declspec(thread)
#endif
int main() {
static __thread int tls;
(void)tls;
}
" HAVE_THREAD_LOCAL)
if(HAVE_THREAD_LOCAL)
add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL)
endif()

option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON)
if (NOT WITH_IOSTATS_CONTEXT)
add_definitions(-DNIOSTATS_CONTEXT)
endif()

option(WITH_PERF_CONTEXT "Enable perf context" ON)
if (NOT WITH_PERF_CONTEXT)
add_definitions(-DNPERF_CONTEXT)
endif()

option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON)
if(FAIL_ON_WARNINGS)
if(MSVC)
@ -583,7 +615,6 @@ set(SOURCES
db/builder.cc
db/c.cc
db/column_family.cc
db/compacted_db_impl.cc
db/compaction/compaction.cc
db/compaction/compaction_iterator.cc
db/compaction/compaction_picker.cc
@ -594,6 +625,7 @@ set(SOURCES
db/compaction/sst_partitioner.cc
db/convenience.cc
db/db_filesnapshot.cc
db/db_impl/compacted_db_impl.cc
db/db_impl/db_impl.cc
db/db_impl/db_impl_write.cc
db/db_impl/db_impl_compaction_flush.cc
@ -651,6 +683,7 @@ set(SOURCES
env/env_hdfs.cc
env/file_system.cc
env/file_system_tracer.cc
env/fs_remap.cc
env/mock_env.cc
file/delete_scheduler.cc
file/file_prefetch_buffer.cc

HISTORY.md (43 lines changed)
@ -1,5 +1,47 @@
# Rocksdb Change Log
## Unreleased
### Bug Fixes
* Fixed a bug in handling file rename errors in distributed/network file systems when the server succeeds but the client returns an error. The bug could cause the CURRENT file to point to a non-existent MANIFEST file, making the DB impossible to open.
* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted.

### Behavior Changes
* Due to the fix of the false-positive "SST file is ahead of WAL" alert, all CFs with no SST file (empty CFs) now bypass the consistency check. We fixed a false positive but introduced a very rare true negative, triggered under the following conditions: a CF whose last few operations were deletions becomes empty (the deletions are flushed to an SST file and a compaction combining that file with all other SST files leaves the CF empty, or a manifest entry is written for the CF after a flush that generates no SST file from an empty CF); the deletion entries were logged in a WAL that is corrupted, while the CF's log number points to the next WAL (due to the flush). In that case the DB can only recover to the point before these trailing deletions, leaving the DB in an inconsistent state.

### New Features
* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and column families if memory usage goes beyond the specified WriteBufferManager::buffer_size (soft limit). The stall is cleared when memory is freed after flush and memory usage goes back down below buffer_size.
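For illustration only (not part of this commit), a minimal sketch of the option described above; the constructor shape follows `include/rocksdb/write_buffer_manager.h` from this release, and the DB paths are hypothetical:

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/write_buffer_manager.h"

    int main() {
      // One manager shared by both DBs; writers stall once total memtable
      // memory exceeds buffer_size (soft limit) because allow_stall == true.
      auto wbm = std::make_shared<rocksdb::WriteBufferManager>(
          size_t{512} << 20 /* buffer_size */, nullptr /* cache */,
          true /* allow_stall */);
      rocksdb::Options options;
      options.create_if_missing = true;
      options.write_buffer_manager = wbm;
      rocksdb::DB* db1 = nullptr;
      rocksdb::DB* db2 = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wbm_db1", &db1);
      if (s.ok()) s = rocksdb::DB::Open(options, "/tmp/wbm_db2", &db2);
      delete db2;
      delete db1;
      return s.ok() ? 0 : 1;
    }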

## 6.20.0 (04/16/2021)
### Behavior Changes
* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush.
* `CompactFiles()` can no longer compact files from a lower level to a higher level, which risked corrupting the DB (details: #8063). The validation is also added to all compactions.
* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions().
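A minimal sketch of the read-only-with-logger pattern just mentioned, assuming `CreateLoggerFromOptions()` as declared in the public headers and a hypothetical existing DB at `/tmp/rodb`:

    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      // OpenForReadOnly() no longer creates a Logger; set info_log explicitly.
      rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
          "/tmp/rodb", options, &options.info_log);
      rocksdb::DB* db = nullptr;
      if (s.ok()) {
        s = rocksdb::DB::OpenForReadOnly(options, "/tmp/rodb", &db);
      }
      delete db;
      return s.ok() ? 0 : 1;
    }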
* get_iostats_context() will never return nullptr. If thread-local support is not available and the user does not opt out of the iostats context, compilation will fail. The same applies to the perf context.

### Bug Fixes
* Use thread-safe `strerror_r()` to get error messages.
* Fixed a potential hang in shutdown for a DB whose `Env` has its high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`).
* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory.
* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones.
* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in the current filter partition could cause wrong iteration results.
* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`.

### Performance Improvements
* On the ARM platform, use `yield` instead of `wfe` to relax the CPU, gaining better performance.

### Public API change
* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead.
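For illustration, a sketch reading those estimates back through the existing `DB::GetPropertiesOfAllTables()` API (the fields are 0 for files written without compression sampling); `db` is assumed to be an open `rocksdb::DB*`:

    #include <cinttypes>
    #include <cstdio>
    #include "rocksdb/db.h"
    #include "rocksdb/table_properties.h"

    void DumpCompressibilityEstimates(rocksdb::DB* db) {
      rocksdb::TablePropertiesCollection props;
      rocksdb::Status s = db->GetPropertiesOfAllTables(&props);
      if (!s.ok()) return;
      for (const auto& file_and_props : props) {
        const rocksdb::TableProperties& tp = *file_and_props.second;
        // Compare actual data size against the sampled estimates.
        std::printf("%s: data_size=%" PRIu64 " fast_est=%" PRIu64
                    " slow_est=%" PRIu64 "\n",
                    file_and_props.first.c_str(), tp.data_size,
                    tp.fast_compression_estimated_data_size,
                    tp.slow_compression_estimated_data_size);
      }
    }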
* Updated DB::StartIOTrace and removed the Env object from its arguments, as it is redundant: DB already has an Env object, which is passed down to IOTracer::StartIOTrace.
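The new call shape appears in the `DBBlobBasicTest.GenerateIOTracing` change later in this diff; a minimal sketch (the trace path is hypothetical):

    #include "rocksdb/db.h"
    #include "rocksdb/trace_reader_writer.h"

    rocksdb::Status TraceIO(rocksdb::DB* db, rocksdb::Env* env) {
      std::unique_ptr<rocksdb::TraceWriter> trace_writer;
      rocksdb::Status s = rocksdb::NewFileTraceWriter(
          env, rocksdb::EnvOptions(), "/tmp/io_trace", &trace_writer);
      if (!s.ok()) return s;
      // Env argument dropped: the DB's own Env is used internally.
      s = db->StartIOTrace(rocksdb::TraceOptions(), std::move(trace_writer));
      if (!s.ok()) return s;
      // ... run the workload to be traced ...
      return db->EndIOTrace();
    }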
* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`.
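A sketch of observing the renamed reasons through the existing `EventListener` interface (`FlushJobInfo::flush_reason`); the counting logic is purely illustrative:

    #include <atomic>
    #include <cstdint>
    #include "rocksdb/listener.h"

    class FlushReasonCounter : public rocksdb::EventListener {
     public:
      void OnFlushCompleted(rocksdb::DB* /*db*/,
                            const rocksdb::FlushJobInfo& info) override {
        if (info.flush_reason == rocksdb::FlushReason::kWalFull) {
          // Previously reported as kWriteBufferManager.
          wal_full_flushes_.fetch_add(1, std::memory_order_relaxed);
        } else if (info.flush_reason ==
                   rocksdb::FlushReason::kWriteBufferManager) {
          // Previously reported as kWriteBufferFull.
          wbm_flushes_.fetch_add(1, std::memory_order_relaxed);
        }
      }

     private:
      std::atomic<uint64_t> wal_full_flushes_{0};
      std::atomic<uint64_t> wbm_flushes_{0};
    };

The listener would be registered via `options.listeners` as usual.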
* Extended the file_checksum_dump ldb command and the DB::GetLiveFilesChecksumInfo API for the integrated BlobDB, to get checksums of blob files along with SST files.

### New Features
* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true.
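A hedged sketch of that open-backup-as-DB flow, composed from the `BackupInfo` fields named above; the backup directory is hypothetical, `GetLatestBackupInfo` is the sibling API added in this release, and `env_for_open` is assumed to be a smart pointer per the release headers:

    #include "rocksdb/db.h"
    #include "rocksdb/utilities/backupable_db.h"

    rocksdb::Status OpenLatestBackupReadOnly(rocksdb::DB** db) {
      rocksdb::BackupEngine* backup_engine = nullptr;
      rocksdb::Status s = rocksdb::BackupEngine::Open(
          rocksdb::Env::Default(),
          rocksdb::BackupableDBOptions("/tmp/backups"), &backup_engine);
      if (!s.ok()) return s;
      rocksdb::BackupInfo info;
      s = backup_engine->GetLatestBackupInfo(&info,
                                             /*include_file_details=*/true);
      if (s.ok()) {
        rocksdb::Options options;
        // env_for_open presents the backup's files as a read-only DB dir.
        options.env = info.env_for_open.get();
        s = rocksdb::DB::OpenForReadOnly(options, info.name_for_open, db);
      }
      delete backup_engine;
      return s;
    }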
* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up.
* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup.
* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups.
* Made the Ribbon filter a long-term supported feature in terms of the SST schema (compatible with version >= 6.15.0), though the API for enabling it is expected to change.


## 6.19.0 (03/21/2021)
@ -139,6 +181,7 @@
* The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features). The same convenience methods to configure these options still exist, but the backend implementation has been unified under a common implementation.

### New Features

* Methods to configure, serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. The settings within a Configurable object may be validated via the ValidateOptions method. The object may be initialized (at which point only mutable options may be updated) via the PrepareOptions method.
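For illustration, a sketch of the configure/serialize round trip described above on a `TableFactory`; the option string is an arbitrary example:

    #include <memory>
    #include <string>
    #include "rocksdb/convenience.h"
    #include "rocksdb/table.h"

    rocksdb::Status RoundTripTableFactoryOptions() {
      rocksdb::ConfigOptions config_options;
      std::shared_ptr<rocksdb::TableFactory> factory(
          rocksdb::NewBlockBasedTableFactory());
      // Configure from a serialized option string...
      rocksdb::Status s = factory->ConfigureFromString(
          config_options,
          "block_size=8192;cache_index_and_filter_blocks=true");
      if (!s.ok()) return s;
      // ...and serialize the resulting options back out.
      std::string serialized;
      return factory->GetOptionString(config_options, &serialized);
    }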
* Introduced options.check_flush_compaction_key_order, with default value true. With this option, during flush and compaction, key order is checked when writing to each SST file. If the order is violated, the flush or compaction fails.
* Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface.

INSTALL.md (21 lines changed)
@ -43,6 +43,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
command line flags processing. You can compile rocksdb library even
if you don't have gflags installed.

* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html)

* If you wish to build the RocksJava static target, then cmake is required for building Snappy.

## Supported platforms
@ -94,12 +96,21 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
sudo yum install libasan

* Install zstandard:
* With [EPEL](https://fedoraproject.org/wiki/EPEL):

wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz
mv v1.1.3.tar.gz zstd-1.1.3.tar.gz
tar zxvf zstd-1.1.3.tar.gz
cd zstd-1.1.3
make && sudo make install
sudo yum install libzstd-devel

* With CentOS 8:

sudo dnf install libzstd-devel

* From source:

wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz
mv v1.1.3.tar.gz zstd-1.1.3.tar.gz
tar zxvf zstd-1.1.3.tar.gz
cd zstd-1.1.3
make && sudo make install

* **OS X**:
* Install latest C++ compiler that supports C++ 11:

Makefile (362 lines changed)
@ -55,50 +55,25 @@ DEBUG_LEVEL?=1
# Set the default LIB_MODE to static
LIB_MODE?=static

ifeq ($(MAKECMDGOALS),dbg)
# OBJ_DIR is where the object files reside. Default to the current directory
OBJ_DIR?=.

# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately

ifneq ($(filter clean release install, $(MAKECMDGOALS)),)
DEBUG_LEVEL=0
endif
ifneq ($(filter dbg, $(MAKECMDGOALS)),)
DEBUG_LEVEL=2
endif

ifeq ($(MAKECMDGOALS),clean)
else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),)
DEBUG_LEVEL=0
endif

ifeq ($(MAKECMDGOALS),release)
DEBUG_LEVEL=0
endif

ifeq ($(MAKECMDGOALS),shared_lib)
LIB_MODE=shared
DEBUG_LEVEL=0
endif

ifeq ($(MAKECMDGOALS),install-shared)
LIB_MODE=shared
DEBUG_LEVEL=0
endif

ifeq ($(MAKECMDGOALS),static_lib)
else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),)
DEBUG_LEVEL=0
LIB_MODE=static
endif

ifeq ($(MAKECMDGOALS),install-static)
DEBUG_LEVEL=0
LIB_MODE=static
endif

ifeq ($(MAKECMDGOALS),install)
DEBUG_LEVEL=0
endif


ifneq ($(findstring jtest, $(MAKECMDGOALS)),)
else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
OBJ_DIR=jl
LIB_MODE=shared
endif

ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),)
LIB_MODE=shared
ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),)
OBJ_DIR=jls
ifneq ($(DEBUG_LEVEL),2)
@ -107,8 +82,6 @@ ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),)
ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish)
DEBUG_LEVEL=0
endif
else
OBJ_DIR=jl
endif
endif

@ -259,6 +232,8 @@ AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.
# Export some common variables that might have been passed as Make variables
# instead of environment variables.
dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
export LDFLAGS="$(EXTRA_LDFLAGS)"; \
export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
@ -489,7 +464,6 @@ CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverl

LDFLAGS += $(PLATFORM_LDFLAGS)

OBJ_DIR?=.
LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES))
LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES))
ifeq ($(HAVE_POWER8),1)
@ -501,6 +475,12 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES))
endif

# range_tree is not compatible with non GNU libc on ppc64
# see https://jira.percona.com/browse/PS-7559
ifneq ($(PPC_LIBC_IS_GNU),0)
LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES))
endif

GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o
TESTUTIL = $(OBJ_DIR)/test_util/testutil.o
TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST)
@ -516,6 +496,7 @@ TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES))
ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES))
STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES))

# Exclude build_version.cc -- a generated source file -- from all sources. Not needed for dependencies
ALL_SOURCES = $(LIB_SOURCES) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES)
@ -528,236 +509,34 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
ALL_SOURCES += third-party/folly/folly/synchronization/test/DistributedMutexTest.cc
endif

PARALLEL_TEST = \
backupable_db_test \
db_bloom_filter_test \
db_compaction_filter_test \
db_compaction_test \
db_merge_operator_test \
db_sst_test \
db_test \
db_test2 \
db_universal_compaction_test \
db_wal_test \
column_family_test \
external_sst_file_test \
import_column_family_test \
fault_injection_test \
file_reader_writer_test \
inlineskiplist_test \
manual_compaction_test \
persistent_cache_test \
table_test \
transaction_test \
point_lock_manager_test \
range_locking_test \
write_prepared_transaction_test \
write_unprepared_transaction_test \

ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
TESTS += folly_synchronization_distributed_mutex_test
PARALLEL_TEST += folly_synchronization_distributed_mutex_test
TESTS_PASSING_ASC = folly_synchronization_distributed_mutex_test
endif

# options_settable_test doesn't pass with UBSAN as we use hack in the test
ifdef COMPILE_WITH_UBSAN
TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g')
endif
ifdef ASSERT_STATUS_CHECKED
# This is a new check for which we will add support incrementally. This
# list can be removed once support is fully added.
TESTS_PASSING_ASC = \
arena_test \
autovector_test \
cache_test \
lru_cache_test \
blob_file_addition_test \
blob_file_builder_test \
blob_file_cache_test \
blob_file_garbage_test \
blob_file_reader_test \
bloom_test \
cassandra_format_test \
cassandra_functional_test \
cassandra_row_merge_test \
cassandra_serialize_test \
cleanable_test \
checkpoint_test \
coding_test \
crc32c_test \
dbformat_test \
db_basic_test \
compact_files_test \
compaction_picker_test \
comparator_db_test \
db_encryption_test \
db_iter_test \
db_iter_stress_test \
db_log_iter_test \
db_bloom_filter_test \
db_blob_basic_test \
db_blob_compaction_test \
db_blob_corruption_test \
db_blob_index_test \
db_block_cache_test \
db_compaction_test \
db_compaction_filter_test \
db_dynamic_level_test \
db_flush_test \
db_inplace_update_test \
db_io_failure_test \
db_iterator_test \
db_kv_checksum_test \
db_logical_block_size_cache_test \
db_memtable_test \
db_merge_operand_test \
db_merge_operator_test \
db_wal_test \
db_with_timestamp_basic_test \
db_with_timestamp_compaction_test \
db_write_test \
db_options_test \
db_properties_test \
db_range_del_test \
db_secondary_test \
deletefile_test \
external_sst_file_test \
options_file_test \
db_sst_test \
db_statistics_test \
db_table_properties_test \
db_tailing_iter_test \
fault_injection_test \
listener_test \
log_test \
manual_compaction_test \
obsolete_files_test \
perf_context_test \
periodic_work_scheduler_test \
perf_context_test \
version_set_test \
wal_manager_test \
defer_test \
filename_test \
dynamic_bloom_test \
env_basic_test \
env_test \
env_logger_test \
event_logger_test \
error_handler_fs_test \
external_sst_file_basic_test \
auto_roll_logger_test \
file_indexer_test \
delete_scheduler_test \
flush_job_test \
hash_table_test \
hash_test \
heap_test \
histogram_test \
inlineskiplist_test \
io_posix_test \
iostats_context_test \
ldb_cmd_test \
memkind_kmem_allocator_test \
merge_test \
merger_test \
mock_env_test \
object_registry_test \
optimistic_transaction_test \
prefix_test \
plain_table_db_test \
repair_test \
configurable_test \
customizable_test \
options_settable_test \
options_test \
point_lock_manager_test \
random_access_file_reader_test \
random_test \
range_del_aggregator_test \
sst_file_reader_test \
range_tombstone_fragmenter_test \
repeatable_thread_test \
ribbon_test \
skiplist_test \
slice_test \
slice_transform_test \
sst_dump_test \
statistics_test \
stats_history_test \
stringappend_test \
thread_local_test \
trace_analyzer_test \
transaction_test \
env_timed_test \
filelock_test \
timer_queue_test \
timer_test \
options_util_test \
persistent_cache_test \
util_merge_operators_test \
block_cache_trace_analyzer_test \
block_cache_tracer_test \
cache_simulator_test \
sim_cache_test \
version_builder_test \
version_edit_test \
work_queue_test \
write_buffer_manager_test \
write_controller_test \
write_prepared_transaction_test \
write_unprepared_transaction_test \
compaction_iterator_test \
compaction_job_test \
compaction_job_stats_test \
io_tracer_test \
io_tracer_parser_test \
prefetch_test \
merge_helper_test \
memtable_list_test \
flush_job_test \
block_based_filter_block_test \
block_fetcher_test \
block_test \
data_block_hash_index_test \
full_filter_block_test \
partitioned_filter_block_test \
column_family_test \
file_reader_writer_test \
rate_limiter_test \
corruption_test \
reduce_levels_test \
thread_list_test \
compact_on_deletion_collector_test \
db_universal_compaction_test \
import_column_family_test \
option_change_migration_test \
cuckoo_table_builder_test \
cuckoo_table_db_test \
cuckoo_table_reader_test \
memory_test \
table_test \
backupable_db_test \
blob_db_test \
ttl_test \
write_batch_test \
write_batch_with_index_test \
# TODO: finish fixing all tests to pass this check
TESTS_FAILING_ASC = \
db_test \
db_test2 \
range_locking_test \
testutil_test \

ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
TESTS_PASSING_ASC += folly_synchronization_distributed_mutex_test
# Since we have very few ASC exclusions left, excluding them from
# the build is the most convenient way to exclude them from testing
TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS))
endif

# Enable building all unit tests, but use check_some to run only tests
# known to pass ASC (ASSERT_STATUS_CHECKED)
ROCKSDBTESTS_SUBSET ?= $(TESTS_PASSING_ASC)
# Alternate: only build unit tests known to pass ASC, and run them
# with make check
#TESTS := $(filter $(TESTS_PASSING_ASC),$(TESTS))
#PARALLEL_TEST := $(filter $(TESTS_PASSING_ASC),$(PARALLEL_TEST))
else
ROCKSDBTESTS_SUBSET ?= $(TESTS)
endif
ROCKSDBTESTS_SUBSET ?= $(TESTS)

# env_test - suspicious use of test::TmpDir
# deletefile_test - serial because it generates giant temporary files in
# its various tests. Parallel can fill up your /dev/shm
NON_PARALLEL_TEST = \
env_test \
deletefile_test \

PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS))

# Not necessarily well thought out or up-to-date, but matches old list
TESTS_PLATFORM_DEPENDENT := \
db_basic_test \
@ -845,8 +624,8 @@ else
LIBRARY=$(STATIC_LIBRARY)
TEST_LIBRARY=$(STATIC_TEST_LIBRARY)
TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY)
STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY)
endif
STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY)

ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
@ -930,7 +709,8 @@ endif # PLATFORM_SHARED_EXT
analyze tools tools_lib \
blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush \
blackbox_crash_test_with_txn whitebox_crash_test_with_txn \
blackbox_crash_test_with_best_efforts_recovery
blackbox_crash_test_with_best_efforts_recovery \
blackbox_crash_test_with_ts whitebox_crash_test_with_ts


all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS)
@ -1168,6 +948,8 @@ crash_test_with_txn: whitebox_crash_test_with_txn blackbox_crash_test_with_txn

crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery

crash_test_with_ts: whitebox_crash_test_with_ts blackbox_crash_test_with_ts

blackbox_crash_test: db_stress
$(PYTHON) -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS)
$(PYTHON) -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS)
@ -1181,6 +963,9 @@ blackbox_crash_test_with_txn: db_stress
blackbox_crash_test_with_best_efforts_recovery: db_stress
$(PYTHON) -u tools/db_crashtest.py --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS)

blackbox_crash_test_with_ts: db_stress
$(PYTHON) -u tools/db_crashtest.py --enable_ts blackbox $(CRASH_TEST_EXT_ARGS)

ifeq ($(CRASH_TEST_KILL_ODD),)
CRASH_TEST_KILL_ODD=888887
endif
@ -1199,6 +984,10 @@ whitebox_crash_test_with_txn: db_stress
$(PYTHON) -u tools/db_crashtest.py --txn whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)

whitebox_crash_test_with_ts: db_stress
$(PYTHON) -u tools/db_crashtest.py --enable_ts whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)

asan_check: clean
COMPILE_WITH_ASAN=1 $(MAKE) check -j32
$(MAKE) clean
@ -1344,8 +1133,9 @@ analyze_incremental:
$(MAKE) dbg

CLEAN_FILES += unity.cc
unity.cc: Makefile
unity.cc: Makefile util/build_version.cc.in
rm -f $@ $@-t
$(AM_V_at)$(gen_build_version) > util/build_version.cc
for source_file in $(LIB_SOURCES); do \
echo "#include \"$$source_file\"" >> $@-t; \
done
@ -1429,7 +1219,7 @@ $(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS)
$(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY)
$(AM_V_at)$(AR) $(ARFLAGS) $@ $^

$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS)
$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL)
$(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY)
$(AM_V_at)$(AR) $(ARFLAGS) $@ $^

@ -1441,7 +1231,7 @@ $(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1)
$(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY)
$(AM_SHARE)

$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(SHARED_TOOLS_LIBRARY) $(SHARED1)
$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1)
$(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY)
$(AM_SHARE)

@ -2070,6 +1860,9 @@ io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY

db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)

db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
#-------------------------------------------------
# make install related stuff
PREFIX ?= /usr/local
@ -2184,8 +1977,8 @@ SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive
LZ4_VER ?= 1.9.3
LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1
LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive
ZSTD_VER ?= 1.4.7
ZSTD_SHA256 ?= 085500c8d0b9c83afbc1dc0d8b4889336ad019eba930c5d6a9c6c86c20c769c8
ZSTD_VER ?= 1.4.9
ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8
ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive
CURL_SSL_OPTS ?= --tlsv1

@ -2485,12 +2278,14 @@ endif
# ---------------------------------------------------------------------------
# Source files dependencies detection
# ---------------------------------------------------------------------------

# If skip dependencies is ON, skip including the dep files
ifneq ($(SKIP_DEPENDS), 1)
DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES))
DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C))
ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1)
DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES))
endif
endif

# Add proper dependency support so changing a .h file forces a .cc file to
# rebuild.
@ -2530,28 +2325,9 @@ endif
build_subset_tests: $(ROCKSDBTESTS_SUBSET)
$(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi

# if the make goal is either "clean" or "format", we shouldn't
# try to import the *.d files.
# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
# working solution.
ifneq ($(MAKECMDGOALS),clean)
ifneq ($(MAKECMDGOALS),format)
ifneq ($(MAKECMDGOALS),check-format)
ifneq ($(MAKECMDGOALS),check-buck-targets)
ifneq ($(MAKECMDGOALS),jclean)
ifneq ($(MAKECMDGOALS),jtest)
ifneq ($(MAKECMDGOALS),rocksdbjavastatic)
ifneq ($(MAKECMDGOALS),rocksdbjavastatic_deps)
ifneq ($(MAKECMDGOALS),package)
ifneq ($(MAKECMDGOALS),analyze)
# Remove the rules for which dependencies should not be generated and see if any are left.
#If so, include the dependencies; if not, do not include the dependency files
ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test, $(MAKECMDGOALS))
ifneq ("$(ROCKS_DEP_RULES)", "")
-include $(DEPFILES)
endif
endif
endif
endif
endif
endif
endif
endif
endif
endif
endif
@ -1,3 +1,4 @@
This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it.

* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference
* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
@ -69,25 +69,25 @@ def get_cc_files(repo_path):
    return cc_files


# Get parallel tests from Makefile
def get_parallel_tests(repo_path):
# Get non_parallel tests from Makefile
def get_non_parallel_tests(repo_path):
    Makefile = repo_path + "/Makefile"

    s = set({})

    found_parallel_tests = False
    found_non_parallel_tests = False
    for line in open(Makefile):
        line = line.strip()
        if line.startswith("PARALLEL_TEST ="):
            found_parallel_tests = True
        elif found_parallel_tests:
        if line.startswith("NON_PARALLEL_TEST ="):
            found_non_parallel_tests = True
        elif found_non_parallel_tests:
            if line.endswith("\\"):
                # remove the trailing \
                line = line[:-1]
                line = line.strip()
                s.add(line)
            else:
                # we consumed all the parallel tests
                # we consumed all the non_parallel tests
                break

    return s
@ -123,10 +123,10 @@ def generate_targets(repo_path, deps_map):
    src_mk = parse_src_mk(repo_path)
    # get all .cc files
    cc_files = get_cc_files(repo_path)
    # get parallel tests from Makefile
    parallel_tests = get_parallel_tests(repo_path)
    # get non_parallel tests from Makefile
    non_parallel_tests = get_non_parallel_tests(repo_path)

    if src_mk is None or cc_files is None or parallel_tests is None:
    if src_mk is None or cc_files is None or non_parallel_tests is None:
        return False

    extra_argv = ""
@ -141,11 +141,15 @@ def generate_targets(repo_path, deps_map):
    TARGETS.add_library(
        "rocksdb_lib",
        src_mk["LIB_SOURCES"] +
        # always add range_tree, it's only excluded on ppc64, which we don't use internally
        src_mk["RANGE_TREE_SOURCES"] +
        src_mk["TOOL_LIB_SOURCES"])
    # rocksdb_whole_archive_lib
    TARGETS.add_library(
        "rocksdb_whole_archive_lib",
        src_mk["LIB_SOURCES"] +
        # always add range_tree, it's only excluded on ppc64, which we don't use internally
        src_mk["RANGE_TREE_SOURCES"] +
        src_mk["TOOL_LIB_SOURCES"],
        deps=None,
        headers=None,
@ -212,7 +216,7 @@ def generate_targets(repo_path, deps_map):
        TARGETS.register_test(
            test_target_name,
            test_src,
            test in parallel_tests,
            test not in non_parallel_tests,
            json.dumps(deps['extra_deps']),
            json.dumps(deps['extra_compiler_flags']))
@ -87,6 +87,7 @@ cpp_binary(
os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS,
compiler_flags = ROCKSDB_COMPILER_FLAGS,
preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
include_paths = ROCKSDB_INCLUDE_PATHS,
deps = [":rocksdb_test_lib"],
) if not is_opt_mode else None

@ -94,10 +94,12 @@ ROCKSDB_PREPROCESSOR_FLAGS = [

# Added missing flags from output of build_detect_platform
"-DROCKSDB_BACKTRACE",
]

# Directories with files for #include
"-I" + REPO_PATH + "include/",
"-I" + REPO_PATH,
# Directories with files for #include
ROCKSDB_INCLUDE_PATHS = [
"",
"include",
]

ROCKSDB_ARCH_PREPROCESSOR_FLAGS = {{
@ -145,6 +147,7 @@ cpp_library(
os_deps = ROCKSDB_OS_DEPS,
os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS,
preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
include_paths = ROCKSDB_INCLUDE_PATHS,
deps = [{deps}],
external_deps = ROCKSDB_EXTERNAL_DEPS{extra_external_deps},
link_whole = {link_whole},
@ -161,6 +164,7 @@ cpp_library(
os_deps = ROCKSDB_OS_DEPS,
os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS,
preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
include_paths = ROCKSDB_INCLUDE_PATHS,
deps = ROCKSDB_LIB_DEPS,
external_deps = ROCKSDB_EXTERNAL_DEPS,
)
@ -173,6 +177,7 @@ cpp_binary(
arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS,
compiler_flags = ROCKSDB_COMPILER_FLAGS,
preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
include_paths = ROCKSDB_INCLUDE_PATHS,
deps = [{deps}],
external_deps = ROCKSDB_EXTERNAL_DEPS,
)
@ -203,6 +208,7 @@ ROCKS_TESTS = [
os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS,
compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags,
preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS,
include_paths = ROCKSDB_INCLUDE_PATHS,
deps = [":rocksdb_test_lib"] + extra_deps,
external_deps = ROCKSDB_EXTERNAL_DEPS + [
("googletest", None, "gtest"),
@ -177,7 +177,7 @@ case "$TARGET_OS" in
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl"
if test $ROCKSDB_USE_IO_URING; then
# check for liburing
$CXX $CFLAGS -x c++ - -luring -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -luring -o /dev/null 2>/dev/null <<EOF
#include <liburing.h>
int main() {
struct io_uring ring;
@ -288,7 +288,7 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
else
if ! test $ROCKSDB_DISABLE_FALLOCATE; then
# Test whether fallocate is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <fcntl.h>
#include <linux/falloc.h>
int main() {
@ -304,7 +304,7 @@ EOF
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <snappy.h>
int main() {}
EOF
@ -319,7 +319,7 @@ EOF
# Test whether gflags library is installed
# http://gflags.github.io/gflags/
# check if the namespace is gflags
if $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
if $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace GFLAGS_NAMESPACE;
int main() {}
@ -328,7 +328,7 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
# check if namespace is gflags
elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace gflags;
int main() {}
@ -337,7 +337,7 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
# check if namespace is google
elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace google;
int main() {}
@ -350,7 +350,7 @@ EOF

if ! test $ROCKSDB_DISABLE_ZLIB; then
# Test whether zlib library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <zlib.h>
int main() {}
EOF
@ -363,7 +363,7 @@ EOF

if ! test $ROCKSDB_DISABLE_BZIP; then
# Test whether bzip library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <bzlib.h>
int main() {}
EOF
@ -376,7 +376,7 @@ EOF

if ! test $ROCKSDB_DISABLE_LZ4; then
# Test whether lz4 library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <lz4.h>
#include <lz4hc.h>
int main() {}
@ -390,7 +390,7 @@ EOF

if ! test $ROCKSDB_DISABLE_ZSTD; then
# Test whether zstd library is installed
$CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <zstd.h>
int main() {}
EOF
@ -403,7 +403,7 @@ EOF

if ! test $ROCKSDB_DISABLE_NUMA; then
# Test whether numa is available
$CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null <<EOF
#include <numa.h>
#include <numaif.h>
int main() {}
@ -417,7 +417,7 @@ EOF

if ! test $ROCKSDB_DISABLE_TBB; then
# Test whether tbb is available
$CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null <<EOF
#include <tbb/tbb.h>
int main() {}
EOF
@ -430,7 +430,7 @@ EOF

if ! test $ROCKSDB_DISABLE_JEMALLOC; then
# Test whether jemalloc is available
if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -ljemalloc \
2>/dev/null; then
# This will enable some preprocessor identifiers in the Makefile
JEMALLOC=1
@ -451,7 +451,7 @@ EOF
fi
if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then
# jemalloc is not available. Let's try tcmalloc
if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null \
-ltcmalloc 2>/dev/null; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
@ -460,7 +460,7 @@ EOF

if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then
# Test whether malloc_usable_size is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <malloc.h>
int main() {
size_t res = malloc_usable_size(0);
@ -475,7 +475,7 @@ EOF

if ! test $ROCKSDB_DISABLE_MEMKIND; then
# Test whether memkind library is installed
$CXX $CFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <memkind.h>
int main() {
memkind_malloc(MEMKIND_DAX_KMEM, 1024);
@ -491,7 +491,7 @@ EOF

if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then
# Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <pthread.h>
int main() {
int x = PTHREAD_MUTEX_ADAPTIVE_NP;
@ -506,7 +506,7 @@ EOF

if ! test $ROCKSDB_DISABLE_BACKTRACE; then
# Test whether backtrace is available
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <execinfo.h>
int main() {
void* frames[1];
@ -518,7 +518,7 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
else
# Test whether execinfo library is installed
$CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <execinfo.h>
int main() {
void* frames[1];
@ -535,7 +535,7 @@ EOF

if ! test $ROCKSDB_DISABLE_PG; then
# Test if -pg is supported
$CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -pg -x c++ - -o /dev/null 2>/dev/null <<EOF
int main() {
return 0;
}
@ -547,7 +547,7 @@ EOF

if ! test $ROCKSDB_DISABLE_SYNC_FILE_RANGE; then
# Test whether sync_file_range is supported for compatibility with an old glibc
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
int fd = open("/dev/null", 0);
@ -561,7 +561,7 @@ EOF

if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then
# Test whether sched_getcpu is supported
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <sched.h>
int main() {
int cpuid = sched_getcpu();
@ -575,7 +575,7 @@ EOF

if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then
# Test whether getauxval is supported
$CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <sys/auxv.h>
int main() {
uint64_t auxv = getauxval(AT_HWCAP);
@ -603,7 +603,7 @@ fi
# -Wshorten-64-to-32 breaks compilation on FreeBSD i386
if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then
# Test whether -Wshorten-64-to-32 is available
$CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null <<EOF
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
@ -668,6 +668,23 @@ else
fi
fi

if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
# check for GNU libc on ppc64
$CXX -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <stdio.h>
#include <stdlib.h>
#include <gnu/libc-version.h>

int main(int argc, char *argv[]) {
printf("GNU libc version: %s\n", gnu_get_libc_version());
return 0;
}
EOF
if [ "$?" != 0 ]; then
PPC_LIBC_IS_GNU=0
fi
fi

if test "$TRY_SSE_ETC"; then
# The USE_SSE flag now means "attempt to compile with widely-available
# Intel architecture extensions utilized by specific optimizations in the
@ -861,3 +878,6 @@ echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT"
if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then
echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT"
fi
if test -n "$PPC_LIBC_IS_GNU"; then
echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT"
fi
@ -136,9 +136,11 @@ then
FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)"
# Get the differences
diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1)
echo "Checking format of changes not yet in $FORMAT_UPSTREAM..."
else
# Check the format of uncommitted lines,
diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
echo "Checking format of uncommitted changes..."
fi

if [ -z "$diffs" ]
@ -548,6 +548,36 @@ STRESS_CRASH_TEST_WITH_TXN_COMMANDS="[
}
]"

#
# RocksDB stress/crash test with timestamp
#
STRESS_CRASH_TEST_WITH_TS_COMMANDS="[
{
'name':'Rocksdb Stress and Crash Test with ts',
'oncall':'$ONCALL',
'executeLocal': 'true',
'timeout': 86400,
'steps': [
$CLEANUP_ENV,
{
'name':'Build and run RocksDB debug stress tests',
'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL',
'user':'root',
$PARSER
},
{
'name':'Build and run RocksDB debug crash tests with ts',
'timeout': 86400,
'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_ts || $CONTRUN_NAME=crash_test_with_ts $TASK_CREATION_TOOL',
'user':'root',
$PARSER
},
$UPLOAD_DB_DIR,
],
$REPORT
}
]"

# RocksDB write stress test.
# We run on disk device on purpose (i.e. no $SHM)
# because we want to add some randomness to fsync commands
@ -1220,6 +1250,9 @@ case $1 in
stress_crash_with_txn)
echo $STRESS_CRASH_TEST_WITH_TXN_COMMANDS
;;
stress_crash_with_ts)
echo $STRESS_CRASH_TEST_WITH_TS_COMMANDS
;;
write_stress)
echo $WRITE_STRESS_COMMANDS
;;

cache/lru_cache.h (4 lines changed)
@ -310,7 +310,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard {
// not threadsafe
size_t TEST_GetLRUSize();

// Retrives high pri pool ratio
// Retrieves high pri pool ratio
double GetHighPriPoolRatio();

private:
@ -408,7 +408,7 @@ class LRUCache

// Retrieves number of elements in LRU, for unit test purpose only
size_t TEST_GetLRUSize();
// Retrives high pri pool ratio
// Retrieves high pri pool ratio
double GetHighPriPoolRatio();

private:
@ -190,7 +190,7 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
std::move(file), blob_file_paths_->back(), *file_options_,
immutable_cf_options_->clock, io_tracer_, statistics,
immutable_cf_options_->listeners,
immutable_cf_options_->file_checksum_gen_factory,
immutable_cf_options_->file_checksum_gen_factory.get(),
tmp_set.Contains(FileType::kBlobFile)));

constexpr bool do_flush = false;
@ -306,7 +306,6 @@ Status BlobFileBuilder::CloseBlobFile() {
" total blobs, %" PRIu64 " total bytes",
column_family_name_.c_str(), job_id_, blob_file_number,
blob_count_, blob_bytes_);

if (blob_callback_) {
s = blob_callback_->OnBlobFileCompleted(blob_file_paths_->back());
}
@ -16,6 +16,14 @@ namespace ROCKSDB_NAMESPACE {

class BlobFileCompletionCallback {
public:
#ifdef ROCKSDB_LITE
BlobFileCompletionCallback(SstFileManager* /*sst_file_manager*/,
InstrumentedMutex* /*mutex*/,
ErrorHandler* /*error_handler*/) {}
Status OnBlobFileCompleted(const std::string& /*file_name*/) {
return Status::OK();
}
#else
BlobFileCompletionCallback(SstFileManager* sst_file_manager,
InstrumentedMutex* mutex,
ErrorHandler* error_handler)
@ -25,8 +33,6 @@ class BlobFileCompletionCallback {

Status OnBlobFileCompleted(const std::string& file_name) {
Status s;

#ifndef ROCKSDB_LITE
auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
if (sfm) {
// Report new blob files to SstFileManagerImpl
@ -39,9 +45,6 @@ class BlobFileCompletionCallback {
error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
}
}
#else
(void)file_name;
#endif // ROCKSDB_LITE
return s;
}

@ -49,5 +52,6 @@ class BlobFileCompletionCallback {
SstFileManager* sst_file_manager_;
InstrumentedMutex* mutex_;
ErrorHandler* error_handler_;
#endif // ROCKSDB_LITE
};
} // namespace ROCKSDB_NAMESPACE
@ -120,7 +120,7 @@ Status BlobFileReader::OpenFile(
file_reader->reset(new RandomAccessFileReader(
std::move(file), blob_file_path, immutable_cf_options.clock, io_tracer,
immutable_cf_options.statistics, BLOB_DB_BLOB_FILE_READ_MICROS,
blob_file_read_hist, immutable_cf_options.rate_limiter,
blob_file_read_hist, immutable_cf_options.rate_limiter.get(),
immutable_cf_options.listeners));

return Status::OK();
@ -223,7 +223,7 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
std::unique_ptr<TraceWriter> trace_writer;
ASSERT_OK(
NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
ASSERT_OK(db_->StartIOTrace(env_, TraceOptions(), std::move(trace_writer)));
ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));

constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
@ -236,7 +236,7 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
ASSERT_OK(env_->FileExists(trace_file));
}
{
// Parse trace file to check file opertions related to blob files are
// Parse trace file to check file operations related to blob files are
// recorded.
std::unique_ptr<TraceReader> trace_reader;
ASSERT_OK(
@ -267,6 +267,49 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
}
#endif // !ROCKSDB_LITE

TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = 0;
options.create_if_missing = true;
Reopen(options);

ASSERT_OK(dbfull()->DisableFileDeletions());
constexpr int kNumTableFiles = 2;
for (int i = 0; i < kNumTableFiles; ++i) {
for (char ch = 'a'; ch != 'c'; ++ch) {
std::string key(1, ch);
ASSERT_OK(Put(key, "value" + std::to_string(i)));
}
ASSERT_OK(Flush());
}

Close();

std::vector<std::string> files;
ASSERT_OK(env_->GetChildren(dbname_, &files));
std::string blob_file_path;
uint64_t max_blob_file_num = kInvalidBlobFileNumber;
for (const auto& fname : files) {
uint64_t file_num = 0;
FileType type;
if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
type == kBlobFile) {
if (file_num > max_blob_file_num) {
max_blob_file_num = file_num;
blob_file_path = dbname_ + "/" + fname;
}
}
}
ASSERT_OK(env_->DeleteFile(blob_file_path));

options.best_efforts_recovery = true;
Reopen(options);
std::string value;
ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
}

class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
public testing::WithParamInterface<std::string> {
protected:
@ -52,8 +52,8 @@ TableBuilder* NewTableBuilder(
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    WritableFileWriter* file, const CompressionType compression_type,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    int level, const bool skip_filters, const uint64_t creation_time,
    const CompressionOptions& compression_opts, int level,
    const bool skip_filters, const uint64_t creation_time,
    const uint64_t oldest_key_time, const uint64_t target_file_size,
    const uint64_t file_creation_time, const std::string& db_id,
    const std::string& db_session_id) {
@ -63,10 +63,10 @@ TableBuilder* NewTableBuilder(
  return ioptions.table_factory->NewTableBuilder(
      TableBuilderOptions(ioptions, moptions, internal_comparator,
                          int_tbl_prop_collector_factories, compression_type,
                          sample_for_compression, compression_opts,
                          skip_filters, column_family_name, level,
                          creation_time, oldest_key_time, target_file_size,
                          file_creation_time, db_id, db_session_id),
                          compression_opts, skip_filters, column_family_name,
                          level, creation_time, oldest_key_time,
                          target_file_size, file_creation_time, db_id,
                          db_session_id),
      column_family_id, file);
}

@ -85,11 +85,10 @@ Status BuildTable(
    std::vector<SequenceNumber> snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, const CompressionType compression,
    uint64_t sample_for_compression, const CompressionOptions& compression_opts,
    bool paranoid_file_checks, InternalStats* internal_stats,
    TableFileCreationReason reason, IOStatus* io_status,
    const std::shared_ptr<IOTracer>& io_tracer, EventLogger* event_logger,
    int job_id, const Env::IOPriority io_priority,
    const CompressionOptions& compression_opts, bool paranoid_file_checks,
    InternalStats* internal_stats, TableFileCreationReason reason,
    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
    EventLogger* event_logger, int job_id, const Env::IOPriority io_priority,
    TableProperties* table_properties, int level, const uint64_t creation_time,
    const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint,
    const uint64_t file_creation_time, const std::string& db_id,
@ -157,20 +156,19 @@ Status BuildTable(
    file_writer.reset(new WritableFileWriter(
        std::move(file), fname, file_options, ioptions.clock, io_tracer,
        ioptions.statistics, ioptions.listeners,
        ioptions.file_checksum_gen_factory,
        ioptions.file_checksum_gen_factory.get(),
        tmp_set.Contains(FileType::kTableFile)));

    builder = NewTableBuilder(
        ioptions, mutable_cf_options, internal_comparator,
        int_tbl_prop_collector_factories, column_family_id,
        column_family_name, file_writer.get(), compression,
        sample_for_compression, compression_opts, level,
        false /* skip_filters */, creation_time, oldest_key_time,
        column_family_name, file_writer.get(), compression, compression_opts,
        level, false /* skip_filters */, creation_time, oldest_key_time,
        0 /*target_file_size*/, file_creation_time, db_id, db_session_id);
  }

  MergeHelper merge(env, internal_comparator.user_comparator(),
                    ioptions.merge_operator, nullptr, ioptions.info_log,
                    ioptions.merge_operator.get(), nullptr, ioptions.info_log,
                    true /* internal key corruption is not ok */,
                    snapshots.empty() ? 0 : snapshots.back(),
                    snapshot_checker);
@ -50,7 +50,6 @@ TableBuilder* NewTableBuilder(
        int_tbl_prop_collector_factories,
    uint32_t column_family_id, const std::string& column_family_name,
    WritableFileWriter* file, const CompressionType compression_type,
    const uint64_t sample_for_compression,
    const CompressionOptions& compression_opts, int level,
    const bool skip_filters = false, const uint64_t creation_time = 0,
    const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0,
@ -80,7 +79,6 @@ extern Status BuildTable(
    std::vector<SequenceNumber> snapshots,
    SequenceNumber earliest_write_conflict_snapshot,
    SnapshotChecker* snapshot_checker, const CompressionType compression,
    const uint64_t sample_for_compression,
    const CompressionOptions& compression_opts, bool paranoid_file_checks,
    InternalStats* internal_stats, TableFileCreationReason reason,
    IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,

db/c.cc
@ -79,6 +79,8 @@ using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
using ROCKSDB_NAMESPACE::Iterator;
using ROCKSDB_NAMESPACE::LiveFileMetaData;
using ROCKSDB_NAMESPACE::Logger;
using ROCKSDB_NAMESPACE::LRUCacheOptions;
using ROCKSDB_NAMESPACE::MemoryAllocator;
using ROCKSDB_NAMESPACE::MemoryUtil;
using ROCKSDB_NAMESPACE::MergeOperator;
using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
@ -150,6 +152,12 @@ struct rocksdb_filelock_t { FileLock* rep; };
struct rocksdb_logger_t {
  std::shared_ptr<Logger> rep;
};
struct rocksdb_lru_cache_options_t {
  LRUCacheOptions rep;
};
struct rocksdb_memory_allocator_t {
  std::shared_ptr<MemoryAllocator> rep;
};
struct rocksdb_cache_t {
  std::shared_ptr<Cache> rep;
};
@ -2682,6 +2690,59 @@ unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
  return opt->rep.skip_checking_sst_file_sizes_on_db_open;
}

/* Blob Options Settings */
void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
                                           unsigned char val) {
  opt->rep.enable_blob_files = val;
}
extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files(
    rocksdb_options_t* opt) {
  return opt->rep.enable_blob_files;
}

void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
  opt->rep.min_blob_size = val;
}

uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
  return opt->rep.min_blob_size;
}

void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
  opt->rep.blob_file_size = val;
}

uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
  return opt->rep.blob_file_size;
}

void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
                                               int val) {
  opt->rep.blob_compression_type = static_cast<CompressionType>(val);
}

int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
  return opt->rep.blob_compression_type;
}

void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
                                        unsigned char val) {
  opt->rep.enable_blob_garbage_collection = val;
}

unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
  return opt->rep.enable_blob_garbage_collection;
}

void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
                                            double val) {
  opt->rep.blob_garbage_collection_age_cutoff = val;
}

double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
  return opt->rep.blob_garbage_collection_age_cutoff;
}
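Taken together, these bindings mirror the blob fields of the C++ AdvancedColumnFamilyOptions. A minimal usage sketch (the values are hypothetical; 4 is assumed to map to kLZ4Compression in the CompressionType enum, the same constant the c_test.c change further below uses):

  rocksdb_options_t* opts = rocksdb_options_create();
  /* store values of 64 bytes or more in blob files of up to 256 MB each */
  rocksdb_options_set_enable_blob_files(opts, 1);
  rocksdb_options_set_min_blob_size(opts, 64);
  rocksdb_options_set_blob_file_size(opts, 256 * 1024 * 1024);
  rocksdb_options_set_blob_compression_type(opts, 4); /* assumed LZ4 */
  /* let compaction reclaim garbage from the oldest 25% of blob files */
  rocksdb_options_set_enable_blob_gc(opts, 1);
  rocksdb_options_set_blob_gc_age_cutoff(opts, 0.25);
  /* open the DB with opts, use it, then clean up */
  rocksdb_options_destroy(opts);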

void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
  opt->rep.num_levels = n;
}
@ -4102,16 +4163,58 @@ unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) {
  return opt->rep.wait;
}

rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create(
    char** errptr) {
  rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t;
  ROCKSDB_NAMESPACE::JemallocAllocatorOptions options;
  SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(
                        options, &allocator->rep));
  return allocator;
}

void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) {
  delete allocator;
}

rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() {
  return new rocksdb_lru_cache_options_t;
}

void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) {
  delete opt;
}

void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt,
                                            size_t capacity) {
  opt->rep.capacity = capacity;
}

void rocksdb_lru_cache_options_set_memory_allocator(
    rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) {
  opt->rep.memory_allocator = allocator->rep;
}

rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
  rocksdb_cache_t* c = new rocksdb_cache_t;
  c->rep = NewLRUCache(capacity);
  return c;
}

rocksdb_cache_t* rocksdb_cache_create_lru_opts(
    rocksdb_lru_cache_options_t* opt) {
  rocksdb_cache_t* c = new rocksdb_cache_t;
  c->rep = NewLRUCache(opt->rep);
  return c;
}

void rocksdb_cache_destroy(rocksdb_cache_t* cache) {
  delete cache;
}

void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
  cache->rep->DisownData();
}

void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
  cache->rep->SetCapacity(capacity);
}
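The intended composition of the new pieces: create the allocator, attach it to the cache options, and build the cache from those options. A minimal sketch (the capacity is arbitrary; as the c_test.c phase further below checks, the jemalloc allocator legitimately fails with a "Not implemented" error on builds without jemalloc):

  char* err = NULL;
  rocksdb_memory_allocator_t* alloc =
      rocksdb_jemalloc_nodump_allocator_create(&err);
  if (err == NULL) {
    rocksdb_lru_cache_options_t* copts = rocksdb_lru_cache_options_create();
    rocksdb_lru_cache_options_set_capacity(copts, 64 << 20); /* 64 MB */
    rocksdb_lru_cache_options_set_memory_allocator(copts, alloc);
    rocksdb_cache_t* cache = rocksdb_cache_create_lru_opts(copts);
    rocksdb_lru_cache_options_destroy(copts);
    /* hand cache to rocksdb_block_based_options_set_block_cache(), then: */
    rocksdb_cache_destroy(cache);
  } else {
    free(err); /* error strings from the C API are freed with free() */
  }
  rocksdb_memory_allocator_destroy(alloc);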
@ -4790,7 +4893,10 @@ void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {

const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
    rocksdb_transaction_t* txn) {
  rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
  // This will be freed later on using free, so use malloc here to avoid a
  // mismatch
  rocksdb_snapshot_t* result =
      (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
  result->rep = txn->rep->GetSnapshot();
  return result;
}
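The comment above is the contract: callers release this snapshot handle with rocksdb_free(), which is plain free() underneath, so allocating it with new would pair mismatched allocators. A hedged usage sketch (txn and ropts are assumed to be pre-existing transaction and read-options handles):

  const rocksdb_snapshot_t* snap = rocksdb_transaction_get_snapshot(txn);
  rocksdb_readoptions_set_snapshot(ropts, snap);
  /* perform reads at the snapshot here */
  rocksdb_readoptions_set_snapshot(ropts, NULL);
  rocksdb_free((void*)snap); /* matches the malloc() above */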

db/c_test.c
@ -1759,6 +1759,25 @@ int main(int argc, char** argv) {
    rocksdb_options_set_atomic_flush(o, 1);
    CheckCondition(1 == rocksdb_options_get_atomic_flush(o));

    /* Blob Options */
    rocksdb_options_set_enable_blob_files(o, 1);
    CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));

    rocksdb_options_set_min_blob_size(o, 29);
    CheckCondition(29 == rocksdb_options_get_min_blob_size(o));

    rocksdb_options_set_blob_file_size(o, 30);
    CheckCondition(30 == rocksdb_options_get_blob_file_size(o));

    rocksdb_options_set_blob_compression_type(o, 4);
    CheckCondition(4 == rocksdb_options_get_blob_compression_type(o));

    rocksdb_options_set_enable_blob_gc(o, 1);
    CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o));

    rocksdb_options_set_blob_gc_age_cutoff(o, 0.75);
    CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o));

    // Create a copy that should be equal to the original.
    rocksdb_options_t* copy;
    copy = rocksdb_options_create_copy(o);
@ -2362,6 +2381,37 @@ int main(int argc, char** argv) {
    rocksdb_cache_destroy(co);
  }

  StartPhase("jemalloc_nodump_allocator");
  {
    rocksdb_memory_allocator_t* allocator;
    allocator = rocksdb_jemalloc_nodump_allocator_create(&err);
    if (err != NULL) {
      // not supported on all platforms, allow unsupported error
      const char* ni = "Not implemented: ";
      size_t ni_len = strlen(ni);
      size_t err_len = strlen(err);

      CheckCondition(err_len >= ni_len);
      CheckCondition(memcmp(ni, err, ni_len) == 0);
      Free(&err);
    } else {
      rocksdb_cache_t* co;
      rocksdb_lru_cache_options_t* copts;

      copts = rocksdb_lru_cache_options_create();

      rocksdb_lru_cache_options_set_capacity(copts, 100);
      rocksdb_lru_cache_options_set_memory_allocator(copts, allocator);

      co = rocksdb_cache_create_lru_opts(copts);
      CheckCondition(100 == rocksdb_cache_get_capacity(co));

      rocksdb_cache_destroy(co);
      rocksdb_lru_cache_options_destroy(copts);
    }
    rocksdb_memory_allocator_destroy(allocator);
  }

  StartPhase("env");
  {
    rocksdb_env_t* e;
@ -2869,6 +2919,7 @@ int main(int argc, char** argv) {
  rocksdb_readoptions_destroy(roptions);
  rocksdb_writeoptions_destroy(woptions);
  rocksdb_compactoptions_destroy(coptions);
  rocksdb_cache_disown_data(cache);
  rocksdb_cache_destroy(cache);
  rocksdb_comparator_destroy(cmp);
  rocksdb_dbpath_destroy(dbpath);
@ -1355,6 +1355,12 @@ Status ColumnFamilyData::ValidateOptions(
        "[0.0, 1.0].");
  }

  if (cf_options.compaction_style == kCompactionStyleFIFO &&
      db_options.max_open_files != -1 && cf_options.ttl > 0) {
    return Status::NotSupported(
        "FIFO compaction only supported with max_open_files = -1.");
  }

  return s;
}
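With this check, the unsupported combination is rejected up front when the column family's options are validated, for example at open time. A minimal sketch of a configuration the check rejects (the path is hypothetical):

  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleFIFO;
  options.ttl = 60 * 60;         // 1 hour TTL
  options.max_open_files = 100;  // anything other than -1
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/fifo_ttl_demo", &db);
  assert(s.IsNotSupported());    // rejected by the check above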

@ -253,7 +253,7 @@ extern Status CheckCFPathsSupported(const DBOptions& db_options,

extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
                                           const ColumnFamilyOptions& src);
// Wrap user defined table proproties collector factories `from cf_options`
// Wrap user defined table properties collector factories `from cf_options`
// into internal ones in int_tbl_prop_collector_factories. Add a system internal
// one too.
extern void GetIntTblPropCollectorFactory(
@ -441,7 +441,7 @@ class ColumnFamilyData {
  // Get SuperVersion stored in thread local storage. If it does not exist,
  // get a reference from a current SuperVersion.
  SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
  // Try to return SuperVersion back to thread local storage. Retrun true on
  // Try to return SuperVersion back to thread local storage. Return true on
  // success and false on failure. It fails when the thread local storage
  // contains anything other than SuperVersion::kSVInUse flag.
  bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
@ -118,6 +118,78 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
  delete db;
}

TEST_F(CompactFilesTest, MultipleLevel) {
  Options options;
  options.create_if_missing = true;
  options.level_compaction_dynamic_level_bytes = true;
  options.num_levels = 6;
  // Add a listener that records flushed files
  FlushedFileCollector* collector = new FlushedFileCollector();
  options.listeners.emplace_back(collector);

  DB* db = nullptr;
  DestroyDB(db_name_, options);
  Status s = DB::Open(options, db_name_, &db);
  ASSERT_OK(s);
  ASSERT_NE(db, nullptr);

  // create a couple of files in L0, L3, L4 and L5
  for (int i = 5; i > 2; --i) {
    collector->ClearFlushedFiles();
    ASSERT_OK(db->Put(WriteOptions(), ToString(i), ""));
    ASSERT_OK(db->Flush(FlushOptions()));
    auto l0_files = collector->GetFlushedFiles();
    ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));

    std::string prop;
    ASSERT_TRUE(
        db->GetProperty("rocksdb.num-files-at-level" + ToString(i), &prop));
    ASSERT_EQ("1", prop);
  }
  ASSERT_OK(db->Put(WriteOptions(), ToString(0), ""));
  ASSERT_OK(db->Flush(FlushOptions()));

  ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(&meta);
  // Compact files except the file in L3
  std::vector<std::string> files;
  for (int i = 0; i < 6; ++i) {
    if (i == 3) continue;
    for (auto& file : meta.levels[i].files) {
      files.push_back(file.db_path + "/" + file.name);
    }
  }

  SyncPoint::GetInstance()->LoadDependency({
      {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"},
      {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"},
  });
  SyncPoint::GetInstance()->EnableProcessing();

  std::thread thread([&] {
    TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0");
    ASSERT_OK(db->Put(WriteOptions(), "bar", "v2"));
    ASSERT_OK(db->Put(WriteOptions(), "foo", "v2"));
    ASSERT_OK(db->Flush(FlushOptions()));
    TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1");
  });

  // Compaction cannot move data up to a higher level. Here we have an input
  // file from level 5, so the output level has to be >= 5.
  for (int invalid_output_level = 0; invalid_output_level < 5;
       invalid_output_level++) {
    s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
    std::cout << s.ToString() << std::endl;
    ASSERT_TRUE(s.IsInvalidArgument());
  }

  ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
  SyncPoint::GetInstance()->DisableProcessing();
  thread.join();

  delete db;
}

TEST_F(CompactFilesTest, ObsoleteFiles) {
  Options options;
  // to trigger compaction more easily
@ -519,7 +519,7 @@ uint64_t Compaction::OutputFilePreallocationSize() const {

  // Over-estimate slightly so we don't end up just barely crossing
  // the threshold
  // No point to prellocate more than 1GB.
  // No point to preallocate more than 1GB.
  return std::min(uint64_t{1073741824},
                  preallocation_size + (preallocation_size / 10));
}
@ -341,7 +341,7 @@ class Compaction {
  const uint32_t output_path_id_;
  CompressionType output_compression_;
  CompressionOptions output_compression_opts_;
  // If true, then the comaction can be done by simply deleting input files.
  // If true, then the compaction can be done by simply deleting input files.
  const bool deletion_compaction_;

  // Compaction input files organized by level. Constant after construction
@ -135,7 +135,7 @@ CompactionIterator::CompactionIterator(
}

CompactionIterator::~CompactionIterator() {
  // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime
  // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
  input_->SetPinnedItersMgr(nullptr);
}

@ -38,7 +38,7 @@ class NoMergingMergeOp : public MergeOperator {

// Compaction filter that gets stuck when it sees a particular key,
// then gets unstuck when told to.
// Always returns Decition::kRemove.
// Always returns Decision::kRemove.
class StallingFilter : public CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
@ -189,7 +189,7 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
  bool is_allow_ingest_behind = false;
};

// A simplifed snapshot checker which assumes each snapshot has a global
// A simplified snapshot checker which assumes each snapshot has a global
// last visible sequence.
class TestSnapshotChecker : public SnapshotChecker {
 public:
@ -711,7 +711,7 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
  RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
          {"v1", "v2"},
          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
          {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/,
          {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
          true /*bottommost_level*/);
}
@ -720,15 +720,14 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
// permanently.
TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
  AddSnapshot(1);
  RunTest({test::KeyStr("a", 1, kTypeDeletion),
           test::KeyStr("b", 3, kTypeDeletion),
           test::KeyStr("b", 1, kTypeValue)},
          {"", "", ""},
          {test::KeyStr("b", 3, kTypeDeletion),
           test::KeyStr("b", 0, kTypeValue)},
          {"", ""},
          kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
          nullptr /*compaction_filter*/, true /*bottommost_level*/);
  RunTest(
      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
       test::KeyStr("b", 1, kTypeValue)},
      {"", "", ""},
      {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
      {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
      true /*bottommost_level*/);
}

// In bottommost level, single deletions earlier than earliest snapshot can be
@ -738,7 +737,7 @@ TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
  RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
           test::KeyStr("b", 2, kTypeSingleDeletion)},
          {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
          kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/,
          kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
          nullptr /*compaction_filter*/, true /*bottommost_level*/);
}

@ -895,7 +894,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
          {"v1", "v2", "v3"},
          {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
           test::KeyStr("c", 3, kTypeValue)},
          {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/,
          {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
          true /*bottommost_level*/);
}
@ -906,9 +905,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
  RunTest(
      {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
       test::KeyStr("c", 3, kTypeDeletion)},
      {"", "", ""},
      {},
      {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
      {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
      true /*bottommost_level*/);
}
@ -916,15 +913,14 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
TEST_F(CompactionIteratorWithSnapshotCheckerTest,
       NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
  AddSnapshot(2,1);
  RunTest(
      {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue),
       test::KeyStr("b", 3, kTypeValue)},
      {"", "", ""},
      {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue),
       test::KeyStr("b", 3, kTypeValue)},
      {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/,
      nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
      true /*bottommost_level*/);
  RunTest({test::KeyStr("a", 4, kTypeDeletion),
           test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
          {"", "", ""},
          {test::KeyStr("a", 4, kTypeDeletion),
           test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
          {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
          true /*bottommost_level*/);
}

TEST_F(CompactionIteratorWithSnapshotCheckerTest,
@ -936,7 +932,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
          {"", "", ""},
          {test::KeyStr("b", 2, kTypeSingleDeletion),
           test::KeyStr("c", 3, kTypeSingleDeletion)},
          {"", ""}, kMaxSequenceNumber /*last_commited_seq*/,
          {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
          nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
          true /*bottommost_level*/);
}
@ -986,8 +982,8 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
}

// Compaction filter should keep uncommitted key as-is, and
// * Convert the latest velue to deletion, and/or
// * if latest value is a merge, apply filter to all suequent merges.
// * Convert the latest value to deletion, and/or
// * if latest value is a merge, apply filter to all subsequent merges.

TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
  std::unique_ptr<CompactionFilter> compaction_filter(
@ -150,7 +150,7 @@ struct CompactionJob::SubcompactionState {
      // This subcompaction's output could be empty if compaction was aborted
      // before this subcompaction had a chance to generate any output files.
      // When subcompactions are executed sequentially this is more likely and
      // will be particulalry likely for the later subcompactions to be empty.
      // will be particularly likely for the later subcompactions to be empty.
      // Once they are run in parallel however it should be much rarer.
      return nullptr;
    } else {
@ -312,14 +312,19 @@ CompactionJob::CompactionJob(
    const std::atomic<int>* manual_compaction_paused, const std::string& db_id,
    const std::string& db_session_id, std::string full_history_ts_low,
    BlobFileCompletionCallback* blob_callback)
    : job_id_(job_id),
      compact_(new CompactionState(compaction)),
      compaction_job_stats_(compaction_job_stats),
    : compact_(new CompactionState(compaction)),
      compaction_stats_(compaction->compaction_reason(), 1),
      db_options_(db_options),
      log_buffer_(log_buffer),
      output_directory_(output_directory),
      stats_(stats),
      bottommost_level_(false),
      write_hint_(Env::WLTH_NOT_SET),
      job_id_(job_id),
      compaction_job_stats_(compaction_job_stats),
      dbname_(dbname),
      db_id_(db_id),
      db_session_id_(db_session_id),
      db_options_(db_options),
      file_options_(file_options),
      env_(db_options.env),
      io_tracer_(io_tracer),
@ -330,11 +335,8 @@ CompactionJob::CompactionJob(
      shutting_down_(shutting_down),
      manual_compaction_paused_(manual_compaction_paused),
      preserve_deletes_seqnum_(preserve_deletes_seqnum),
      log_buffer_(log_buffer),
      db_directory_(db_directory),
      output_directory_(output_directory),
      blob_output_directory_(blob_output_directory),
      stats_(stats),
      db_mutex_(db_mutex),
      db_error_handler_(db_error_handler),
      existing_snapshots_(std::move(existing_snapshots)),
@ -342,10 +344,8 @@ CompactionJob::CompactionJob(
      snapshot_checker_(snapshot_checker),
      table_cache_(std::move(table_cache)),
      event_logger_(event_logger),
      bottommost_level_(false),
      paranoid_file_checks_(paranoid_file_checks),
      measure_io_stats_(measure_io_stats),
      write_hint_(Env::WLTH_NOT_SET),
      thread_pri_(thread_pri),
      full_history_ts_low_(std::move(full_history_ts_low)),
      blob_callback_(blob_callback) {
@ -410,7 +410,7 @@ void CompactionJob::Prepare() {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_PREPARE);

  // Generate file_levels_ for compaction berfore making Iterator
  // Generate file_levels_ for compaction before making Iterator
  auto* c = compact_->compaction;
  assert(c->column_family_data() != nullptr);
  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
@ -965,7 +965,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  }

  MergeHelper merge(
      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
      compaction_filter, db_options_.info_log.get(),
      false /* internal key corruption is expected */,
      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
@ -1550,9 +1550,7 @@ Status CompactionJob::FinishCompactionOutputFile(
  FileDescriptor output_fd;
  uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
  if (meta != nullptr) {
    fname =
        TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
                      meta->fd.GetNumber(), meta->fd.GetPathId());
    fname = GetTableFileName(meta->fd.GetNumber());
    output_fd = meta->fd;
    oldest_blob_file_number = meta->oldest_blob_file_number;
  } else {
@ -1672,9 +1670,7 @@ Status CompactionJob::OpenCompactionOutputFile(
  assert(sub_compact->builder == nullptr);
  // no need to lock because VersionSet::next_file_number_ is atomic
  uint64_t file_number = versions_->NewFileNumber();
  std::string fname =
      TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
                    file_number, sub_compact->compaction->output_path_id());
  std::string fname = GetTableFileName(file_number);
  // Fire events.
  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
#ifndef ROCKSDB_LITE
@ -1770,7 +1766,6 @@ Status CompactionJob::OpenCompactionOutputFile(
      cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
      cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(),
      sub_compact->compaction->output_compression(),
      0 /*sample_for_compression */,
      sub_compact->compaction->output_compression_opts(),
      sub_compact->compaction->output_level(), skip_filters,
      oldest_ancester_time, 0 /* oldest_key_time */,
@ -1938,4 +1933,132 @@ void CompactionJob::LogCompaction() {
  }
}

std::string CompactionJob::GetTableFileName(uint64_t file_number) {
  return TableFileName(compact_->compaction->immutable_cf_options()->cf_paths,
                       file_number, compact_->compaction->output_path_id());
}

std::string CompactionServiceCompactionJob::GetTableFileName(
    uint64_t file_number) {
  return MakeTableFileName(output_path_, file_number);
}

CompactionServiceCompactionJob::CompactionServiceCompactionJob(
    int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
    const FileOptions& file_options, VersionSet* versions,
    const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
    FSDirectory* output_directory, Statistics* stats,
    InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
    std::vector<SequenceNumber> existing_snapshots,
    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
    const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
    const std::string& db_id, const std::string& db_session_id,
    const std::string& output_path,
    const CompactionServiceInput& compaction_service_input,
    CompactionServiceResult* compaction_service_result)
    : CompactionJob(
          job_id, compaction, db_options, file_options, versions, shutting_down,
          0, log_buffer, nullptr, output_directory, nullptr, stats, db_mutex,
          db_error_handler, existing_snapshots, kMaxSequenceNumber, nullptr,
          table_cache, event_logger,
          compaction->mutable_cf_options()->paranoid_file_checks,
          compaction->mutable_cf_options()->report_bg_io_stats, dbname,
          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
          nullptr, db_id, db_session_id,
          compaction->column_family_data()->GetFullHistoryTsLow()),
      output_path_(output_path),
      compaction_input_(compaction_service_input),
      compaction_result_(compaction_service_result) {}

Status CompactionServiceCompactionJob::Run() {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_RUN);

  auto* c = compact_->compaction;
  assert(c->column_family_data() != nullptr);
  assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
             compact_->compaction->level()) > 0);

  write_hint_ =
      c->column_family_data()->CalculateSSTWriteHint(c->output_level());
  bottommost_level_ = c->bottommost_level();

  compact_->sub_compact_states.emplace_back(c, compaction_input_.begin,
                                            compaction_input_.end,
                                            compaction_input_.approx_size);

  log_buffer_->FlushBufferToLog();
  LogCompaction();
  const uint64_t start_micros = db_options_.clock->NowMicros();
  // Pick the only sub-compaction we should have
  assert(compact_->sub_compact_states.size() == 1);
  SubcompactionState* sub_compact = compact_->sub_compact_states.data();

  ProcessKeyValueCompaction(sub_compact);

  compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros;
  compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros;

  RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros);
  RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
                        compaction_stats_.cpu_micros);

  Status status = sub_compact->status;
  IOStatus io_s = sub_compact->io_status;

  if (io_status_.ok()) {
    io_status_ = io_s;
  }

  if (status.ok()) {
    constexpr IODebugContext* dbg = nullptr;

    if (output_directory_) {
      io_s = output_directory_->Fsync(IOOptions(), dbg);
    }
  }
  if (io_status_.ok()) {
    io_status_ = io_s;
  }
  if (status.ok()) {
    status = io_s;
  }
  if (status.ok()) {
    // TODO: Add verify_table() and VerifyCompactionFileConsistency()
  }

  // Finish up all book-keeping to unify the subcompaction results
  AggregateStatistics();
  UpdateCompactionStats();

  compaction_result_->bytes_written = IOSTATS(bytes_written);
  compaction_result_->bytes_read = IOSTATS(bytes_read);
  RecordCompactionIOStats();

  LogFlush(db_options_.info_log);
  compact_->status = status;
  compact_->status.PermitUncheckedError();

  // Build compaction result
  compaction_result_->output_level = compact_->compaction->output_level();
  compaction_result_->output_path = output_path_;
  for (const auto& output_file : sub_compact->outputs) {
    auto& meta = output_file.meta;
    compaction_result_->output_files.emplace_back(
        MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
        meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
        meta.largest.Encode().ToString(), meta.oldest_ancester_time,
        meta.file_creation_time, output_file.validator.GetHash(),
        meta.marked_for_compaction);
  }
  compaction_result_->num_output_records = sub_compact->num_output_records;
  compaction_result_->total_bytes = sub_compact->total_bytes;

  return status;
}

void CompactionServiceCompactionJob::CleanupCompaction() {
  CompactionJob::CleanupCompaction();
}

}  // namespace ROCKSDB_NAMESPACE
@ -84,7 +84,7 @@ class CompactionJob {
                std::string full_history_ts_low = "",
                BlobFileCompletionCallback* blob_callback = nullptr);

  ~CompactionJob();
  virtual ~CompactionJob();

  // no copy/move
  CompactionJob(CompactionJob&& job) = delete;
@ -107,11 +107,35 @@ class CompactionJob {
  // Return the IO status
  IOStatus io_status() const { return io_status_; }

 private:
 protected:
  struct SubcompactionState;
  // CompactionJob state
  struct CompactionState;

  void AggregateStatistics();
  void UpdateCompactionStats();
  void LogCompaction();
  void RecordCompactionIOStats();
  void CleanupCompaction();

  // Call compaction filter. Then iterate through input and compact the
  // kv-pairs
  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);

  CompactionState* compact_;
  InternalStats::CompactionStats compaction_stats_;
  const ImmutableDBOptions& db_options_;
  LogBuffer* log_buffer_;
  FSDirectory* output_directory_;
  Statistics* stats_;
  // Is this compaction creating a file in the bottom most level?
  bool bottommost_level_;

  Env::WriteLifeTimeHint write_hint_;

  IOStatus io_status_;

 private:
  // Generates a histogram representing potential divisions of key ranges from
  // the input. It adds the starting and/or ending keys of certain input files
  // to the working set and then finds the approximate size of data in between
@ -122,9 +146,6 @@ class CompactionJob {
  // update the thread status for starting a compaction.
  void ReportStartedCompaction(Compaction* compaction);
  void AllocateCompactionOutputFileNumbers();
  // Call compaction filter. Then iterate through input and compact the
  // kv-pairs
  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);

  Status FinishCompactionOutputFile(
      const Status& input_status, SubcompactionState* sub_compact,
@ -132,33 +153,23 @@ class CompactionJob {
      CompactionIterationStats* range_del_out_stats,
      const Slice* next_table_min_key = nullptr);
  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
  void RecordCompactionIOStats();
  Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
  void CleanupCompaction();
  void UpdateCompactionJobStats(
      const InternalStats::CompactionStats& stats) const;
  void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
                         CompactionJobStats* compaction_job_stats = nullptr);

  void UpdateCompactionStats();
  void UpdateCompactionInputStatsHelper(
      int* num_files, uint64_t* bytes_read, int input_level);

  void LogCompaction();

  int job_id_;

  // CompactionJob state
  struct CompactionState;
  CompactionState* compact_;
  CompactionJobStats* compaction_job_stats_;
  InternalStats::CompactionStats compaction_stats_;

  // DBImpl state
  const std::string& dbname_;
  const std::string db_id_;
  const std::string db_session_id_;
  const ImmutableDBOptions& db_options_;
  const FileOptions file_options_;

  Env* env_;
@ -170,11 +181,8 @@ class CompactionJob {
  const std::atomic<bool>* shutting_down_;
  const std::atomic<int>* manual_compaction_paused_;
  const SequenceNumber preserve_deletes_seqnum_;
  LogBuffer* log_buffer_;
  FSDirectory* db_directory_;
  FSDirectory* output_directory_;
  FSDirectory* blob_output_directory_;
  Statistics* stats_;
  InstrumentedMutex* db_mutex_;
  ErrorHandler* db_error_handler_;
  // If there were two snapshots with seq numbers s1 and
@ -194,19 +202,128 @@ class CompactionJob {

  EventLogger* event_logger_;

  // Is this compaction creating a file in the bottom most level?
  bool bottommost_level_;
  bool paranoid_file_checks_;
  bool measure_io_stats_;
  // Stores the Slices that designate the boundaries for each subcompaction
  std::vector<Slice> boundaries_;
  // Stores the approx size of keys covered in the range of each subcompaction
  std::vector<uint64_t> sizes_;
  Env::WriteLifeTimeHint write_hint_;
  Env::Priority thread_pri_;
  IOStatus io_status_;
  std::string full_history_ts_low_;
  BlobFileCompletionCallback* blob_callback_;

  // Get the table file name the job is outputting to, which should also be
  // in `output_directory_`.
  virtual std::string GetTableFileName(uint64_t file_number);
};

// CompactionServiceInput is used to pass compaction information between two
// db instances. It contains the information needed to do a compaction. It
// doesn't contain the LSM tree information, which is passed through the
// MANIFEST file.
struct CompactionServiceInput {
  ColumnFamilyDescriptor column_family;

  DBOptions db_options;

  std::vector<SequenceNumber> snapshots;

  // SST files for compaction; the list should already be expanded to include
  // all the files needed for this compaction, for both input level files and
  // output level files.
  std::vector<std::string> input_files;
  int output_level;

  // information for subcompaction
  Slice* begin = nullptr;
  Slice* end = nullptr;
  uint64_t approx_size = 0;
};
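Because this is a plain struct, the primary instance can describe a unit of work by value. A hypothetical population, just to make the fields concrete (the file names and level are made up for illustration):

  CompactionServiceInput input;
  input.db_options = DBOptions();
  input.snapshots = {};  // no snapshots to preserve
  input.input_files = {"000012.sst", "000013.sst", "000020.sst"};
  input.output_level = 3;
  input.begin = nullptr;  // nullptr begin/end covers the whole key range
  input.end = nullptr;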

// CompactionServiceOutputFile is the metadata for the output SST file
struct CompactionServiceOutputFile {
  std::string file_name;
  SequenceNumber smallest_seqno;
  SequenceNumber largest_seqno;
  std::string smallest_internal_key;
  std::string largest_internal_key;
  uint64_t oldest_ancester_time;
  uint64_t file_creation_time;
  uint64_t paranoid_hash;
  bool marked_for_compaction;

  CompactionServiceOutputFile() = default;
  CompactionServiceOutputFile(
      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
      std::string _smallest_internal_key, std::string _largest_internal_key,
      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
      uint64_t _paranoid_hash, bool _marked_for_compaction)
      : file_name(name),
        smallest_seqno(smallest),
        largest_seqno(largest),
        smallest_internal_key(std::move(_smallest_internal_key)),
        largest_internal_key(std::move(_largest_internal_key)),
        oldest_ancester_time(_oldest_ancester_time),
        file_creation_time(_file_creation_time),
        paranoid_hash(_paranoid_hash),
        marked_for_compaction(_marked_for_compaction) {}
};

// CompactionServiceResult contains the compaction result from a different db
// instance. With this information, the primary db instance with write
// permission is able to install the result to the DB.
struct CompactionServiceResult {
  std::vector<CompactionServiceOutputFile> output_files;
  int output_level;

  // location of the output files
  std::string output_path;

  // some statistics about the compaction
  uint64_t num_output_records;
  uint64_t total_bytes;
  uint64_t bytes_read;
  uint64_t bytes_written;
  CompactionJobStats stats;
};

// CompactionServiceCompactionJob is a read-only compaction job: it takes its
// input from `compaction_service_input`, puts result information in
// `compaction_service_result`, and generates the SST files under
// `output_path`.
class CompactionServiceCompactionJob : private CompactionJob {
 public:
  CompactionServiceCompactionJob(
      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
      const FileOptions& file_options, VersionSet* versions,
      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
      FSDirectory* output_directory, Statistics* stats,
      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
      std::vector<SequenceNumber> existing_snapshots,
      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
      const std::string& db_id, const std::string& db_session_id,
      const std::string& output_path,
      const CompactionServiceInput& compaction_service_input,
      CompactionServiceResult* compaction_service_result);

  // Run the compaction in current thread and return the result
  Status Run();

  void CleanupCompaction();

  IOStatus io_status() const { return CompactionJob::io_status(); }

 private:
  // Get table file name in output_path
  std::string GetTableFileName(uint64_t file_number) override;
  // Specifies the compaction output path; otherwise the default DB path is
  // used
  const std::string output_path_;

  // Compaction job input
  const CompactionServiceInput& compaction_input_;

  // Compaction job result
  CompactionServiceResult* compaction_result_;
};

}  // namespace ROCKSDB_NAMESPACE

@ -1004,6 +1004,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
  // any currently-existing files.
  for (auto file_num : *input_files) {
    bool found = false;
    int input_file_level = -1;
    for (const auto& level_meta : cf_meta.levels) {
      for (const auto& file_meta : level_meta.files) {
        if (file_num == TableFileNameToNumber(file_meta.name)) {
@ -1013,6 +1014,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
                " is already being compacted.");
          }
          found = true;
          input_file_level = level_meta.level;
          break;
        }
      }
@ -1025,6 +1027,13 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
          "Specified compaction input file " + MakeTableFileName("", file_num) +
          " does not exist in column family " + cf_meta.name + ".");
    }
    if (input_file_level > output_level) {
      return Status::InvalidArgument(
          "Cannot compact file to up level, input file: " +
          MakeTableFileName("", file_num) + " level " +
          ToString(input_file_level) + " > output level " +
          ToString(output_level));
    }
  }

  return Status::OK();
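In other words, CompactFiles may only move data toward higher-numbered (lower) levels of the LSM tree; a request that would move a file up now fails with an explicit InvalidArgument. A short sketch in the spirit of CompactFilesTest.MultipleLevel earlier (db is assumed open with files in level 5):

  rocksdb::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(&meta);
  std::vector<std::string> inputs;
  for (const auto& f : meta.levels[5].files) {
    inputs.push_back(f.db_path + "/" + f.name);
  }
  rocksdb::Status s = db->CompactFiles(rocksdb::CompactionOptions(), inputs,
                                       /*output_level=*/1);
  assert(s.IsInvalidArgument());  // "Cannot compact file to up level"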

@ -650,7 +650,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {

TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
  // The case where universal periodic compaction couldn't form
  // a compaction that inlcudes any file marked for periodic compaction.
  // a compaction that includes any file marked for periodic compaction.
  // Right now we form the compaction anyway if it is more than one
  // sorted run. Just put the case here to validate that it doesn't
  // crash.
@ -800,7 +800,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
  Add(2, 6U, "150", "175",
      60000000U);  // Overlaps with file 26, 27, total size 521M
  Add(2, 7U, "176", "200", 60000000U);  // Overlaps with file 27, 28, total size
                                        // 520M, the smalelst overlapping
                                        // 520M, the smallest overlapping
  Add(2, 8U, "201", "300",
      60000000U);  // Overlaps with file 28, 29, total size 521M

@ -1228,7 +1228,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
  Add(0, 33U, "001", "400", 1000000000U, 0, 0);

  // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1.
  // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
  file_map_[4u].first->being_compacted = true;
  Add(1, 5U, "301", "350", 1000000000U, 0, 0);
@ -1261,7 +1261,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
  Add(0, 32U, "001", "400", 1000000000U, 0, 0);
  Add(0, 33U, "001", "400", 1000000000U, 0, 0);

  // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1.
  // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
  Add(1, 4U, "050", "300", 1000000000U, 0, 0);
  Add(1, 5U, "301", "350", 1000000000U, 0, 0);

@ -733,7 +733,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
  }

  // Look at overall size amplification. If size amplification
  // exceeeds the configured value, then do a compaction
  // exceeds the configured value, then do a compaction
  // of the candidate files all the way upto the earliest
  // base file (overrides configured values of file-size ratios,
  // min_merge_width and max_merge_width).
@ -7,6 +7,9 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <iomanip>
#include <sstream>

#include "db/db_test_util.h"
#include "options/options_helper.h"
#include "port/stack_trace.h"
@ -807,6 +810,7 @@ class TestingContextCustomFilterPolicy
TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
  for (bool fifo : {true, false}) {
    Options options = CurrentOptions();
    options.max_open_files = fifo ? -1 : options.max_open_files;
    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    options.compaction_style =
        fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
@ -817,6 +821,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
    table_options.format_version = 5;
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));

    TryReopen(options);
    CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);

    const int maxKey = 10000;
@ -2117,6 +2122,54 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
  }
}

TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
  Options options = CurrentOptions();
  constexpr size_t kNumKeys = 10000;
  static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
  options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeys + 10));
  options.create_if_missing = true;
  constexpr size_t kPrefixLength = 4;
  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
  options.compression = kNoCompression;
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(50));
  bbto.index_shortening =
      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
  bbto.block_size = 128;
  bbto.metadata_block_size = 128;
  bbto.partition_filters = true;
  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  const std::string value(64, '\0');

  WriteOptions write_opts;
  write_opts.disableWAL = true;
  for (size_t i = 0; i < kNumKeys; ++i) {
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(4) << std::fixed << i;
    ASSERT_OK(db_->Put(write_opts, oss.str(), value));
  }
  ASSERT_OK(Flush());

  ReadOptions read_opts;
  // Use legacy, implicit prefix seek
  read_opts.total_order_seek = false;
  read_opts.auto_prefix_mode = false;
  std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
  for (size_t i = 0; i < kNumKeys; ++i) {
    // Seek with a key after each one added but with same prefix. One will
    // surely cross a partition boundary.
    std::ostringstream oss;
    oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
    it->SeekForPrev(oss.str());
    ASSERT_OK(it->status());
    ASSERT_TRUE(it->Valid());
  }
  it.reset();
}

#endif  // ROCKSDB_LITE

}  // namespace ROCKSDB_NAMESPACE
@ -166,6 +166,66 @@ TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
  ASSERT_EQ(1, num_compactions);
}

// Test that when a flush job is submitted to the low priority thread pool
// and the DB is closed in the meanwhile, CloseHelper doesn't hang.
TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) {
  Options options = CurrentOptions();
  options.max_background_flushes = 1;
  options.max_total_wal_size = 8192;

  DestroyAndReopen(options);
  CreateColumnFamilies({"cf1", "cf2"}, options);

  env_->SetBackgroundThreads(0, Env::HIGH);
  env_->SetBackgroundThreads(1, Env::LOW);
  test::SleepingBackgroundTask sleeping_task_low;
  int num_flushes = 0;

  SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush",
                                        [&](void* /*arg*/) { ++num_flushes; });

  int num_low_flush_unscheduled = 0;
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) {
        num_low_flush_unscheduled++;
        // There should be one flush job in low pool that needs to be
        // unscheduled
        ASSERT_EQ(num_low_flush_unscheduled, 1);
      });

  int num_high_flush_unscheduled = 0;
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) {
        num_high_flush_unscheduled++;
        // There should be no flush job in high pool
        ASSERT_EQ(num_high_flush_unscheduled, 0);
      });

  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(Put(0, "key1", DummyString(8192)));
  // Block the thread so that the flush cannot run and can be removed from
  // the queue when Unschedule is called.
  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
                 Env::Priority::LOW);
  sleeping_task_low.WaitUntilSleeping();

  // Trigger flush and flush job will be scheduled to LOW priority thread.
  ASSERT_OK(Put(0, "key2", DummyString(8192)));

  // Close DB and flush job in low priority queue will be removed without
  // running.
  Close();
  sleeping_task_low.WakeUp();
  sleeping_task_low.WaitUntilDone();
  ASSERT_EQ(0, num_flushes);

  TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
  ASSERT_OK(Put(0, "key3", DummyString(8192)));
  ASSERT_OK(Flush(0));
  ASSERT_EQ(1, num_flushes);
}

TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
  Options options = CurrentOptions();
  options.write_buffer_size = 100;
@ -4,7 +4,7 @@
// (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE
#include "db/compacted_db_impl.h"
#include "db/db_impl/compacted_db_impl.h"

#include "db/db_impl/db_impl.h"
#include "db/version_set.h"
@ -17,11 +17,13 @@ extern void MarkKeyMayExist(void* arg);
extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
                      const Slice& v, bool hit_and_return);

CompactedDBImpl::CompactedDBImpl(
    const DBOptions& options, const std::string& dbname)
    : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr),
      user_comparator_(nullptr) {
}
CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
                                 const std::string& dbname)
    : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
             /*read_only*/ true),
      cfd_(nullptr),
      version_(nullptr),
      user_comparator_(nullptr) {}

CompactedDBImpl::~CompactedDBImpl() {
}
@ -78,6 +80,7 @@ std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
                     nullptr, nullptr, nullptr, true, nullptr, nullptr);
    LookupKey lkey(keys[idx], kMaxSequenceNumber);
    Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
    assert(static_cast<size_t>(idx) < statuses.size());
    if (!s.ok() && !s.IsNotFound()) {
      statuses[idx] = s;
    } else {
@ -18,7 +18,7 @@ class CompactedDBImpl : public DBImpl {
|
||||
CompactedDBImpl(const CompactedDBImpl&) = delete;
|
||||
void operator=(const CompactedDBImpl&) = delete;
|
||||
|
||||
virtual ~CompactedDBImpl();
|
||||
~CompactedDBImpl() override;
|
||||
|
||||
static Status Open(const Options& options, const std::string& dbname,
|
||||
DB** dbptr);
|
@ -146,10 +146,11 @@ void DumpSupportInfo(Logger* logger) {
|
||||
} // namespace
|
||||
|
||||
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
const bool seq_per_batch, const bool batch_per_txn)
|
||||
const bool seq_per_batch, const bool batch_per_txn,
|
||||
bool read_only)
|
||||
: dbname_(dbname),
|
||||
own_info_log_(options.info_log == nullptr),
|
||||
initial_db_options_(SanitizeOptions(dbname, options)),
|
||||
initial_db_options_(SanitizeOptions(dbname, options, read_only)),
|
||||
env_(initial_db_options_.env),
|
||||
io_tracer_(std::make_shared<IOTracer>()),
|
||||
immutable_db_options_(initial_db_options_),
|
||||
@ -159,14 +160,17 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
|
||||
immutable_db_options_.use_adaptive_mutex),
|
||||
default_cf_handle_(nullptr),
|
||||
error_handler_(this, immutable_db_options_, &mutex_),
|
||||
event_logger_(immutable_db_options_.info_log.get()),
|
||||
max_total_in_memory_state_(0),
|
||||
file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
|
||||
file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
|
||||
file_options_, immutable_db_options_)),
|
||||
seq_per_batch_(seq_per_batch),
|
||||
batch_per_txn_(batch_per_txn),
|
||||
db_lock_(nullptr),
|
||||
next_job_id_(1),
|
||||
shutting_down_(false),
|
||||
db_lock_(nullptr),
|
||||
manual_compaction_paused_(false),
|
||||
bg_cv_(&mutex_),
|
||||
logfile_number_(0),
|
||||
@ -193,7 +197,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
pending_purge_obsolete_files_(0),
|
||||
delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
|
||||
last_stats_dump_time_microsec_(0),
|
||||
next_job_id_(1),
|
||||
has_unpersisted_data_(false),
|
||||
unable_to_release_oldest_log_(false),
|
||||
num_running_ingest_file_(0),
|
||||
@ -201,7 +204,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
wal_manager_(immutable_db_options_, file_options_, io_tracer_,
|
||||
seq_per_batch),
|
||||
#endif // ROCKSDB_LITE
|
||||
event_logger_(immutable_db_options_.info_log.get()),
|
||||
bg_work_paused_(0),
|
||||
bg_compaction_paused_(0),
|
||||
refitting_level_(false),
|
||||
@ -230,7 +232,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
own_sfm_(options.sst_file_manager == nullptr),
|
||||
preserve_deletes_(options.preserve_deletes),
|
||||
closed_(false),
|
||||
error_handler_(this, immutable_db_options_, &mutex_),
|
||||
atomic_flush_install_cv_(&mutex_),
|
||||
blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
|
||||
&error_handler_) {
|
||||
@ -269,6 +270,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
// we won't drop any deletion markers until SetPreserveDeletesSequenceNumber()
|
||||
// is called by client and this seqnum is advanced.
|
||||
preserve_deletes_seqnum_.store(0);
|
||||
|
||||
if (write_buffer_manager_) {
|
||||
wbm_stall_.reset(new WBMStallInterface());
|
||||
}
|
||||
}
|
||||
|
||||
Status DBImpl::Resume() {
|
||||
@ -522,15 +527,11 @@ Status DBImpl::CloseHelper() {
|
||||
// marker. After this we do a variant of the waiting and unschedule work
|
||||
// (to consider: moving all the waiting into CancelAllBackgroundWork(true))
|
||||
CancelAllBackgroundWork(false);
|
||||
int bottom_compactions_unscheduled =
|
||||
env_->UnSchedule(this, Env::Priority::BOTTOM);
|
||||
int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
|
||||
int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
|
||||
Status ret = Status::OK();
|
||||
mutex_.Lock();
|
||||
bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled;
|
||||
bg_compaction_scheduled_ -= compactions_unscheduled;
|
||||
bg_flush_scheduled_ -= flushes_unscheduled;
|
||||
env_->UnSchedule(this, Env::Priority::BOTTOM);
|
||||
env_->UnSchedule(this, Env::Priority::LOW);
|
||||
env_->UnSchedule(this, Env::Priority::HIGH);
|
||||
Status ret = Status::OK();
|
||||
|
||||
// Wait for background work to finish
|
||||
while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
|
||||
@ -663,6 +664,10 @@ Status DBImpl::CloseHelper() {
|
||||
}
|
||||
}
|
||||
|
||||
if (write_buffer_manager_ && wbm_stall_) {
|
||||
write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get());
|
||||
}
|
||||
|
||||
if (ret.IsAborted()) {
|
||||
// Reserve IsAborted() error for those where users didn't release
|
||||
// certain resource and they can release them and come back and
|
||||
@ -692,8 +697,8 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
|
||||
}
|
||||
|
||||
const Status DBImpl::CreateArchivalDirectory() {
|
||||
if (immutable_db_options_.wal_ttl_seconds > 0 ||
|
||||
immutable_db_options_.wal_size_limit_mb > 0) {
|
||||
if (immutable_db_options_.WAL_ttl_seconds > 0 ||
|
||||
immutable_db_options_.WAL_size_limit_MB > 0) {
|
||||
std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir);
|
||||
return env_->CreateDirIfMissing(archivalPath);
|
||||
}
|
||||
@ -3149,7 +3154,7 @@ SystemClock* DBImpl::GetSystemClock() const {
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
Status DBImpl::StartIOTrace(Env* /*env*/, const TraceOptions& trace_options,
|
||||
Status DBImpl::StartIOTrace(const TraceOptions& trace_options,
|
||||
std::unique_ptr<TraceWriter>&& trace_writer) {
|
||||
assert(trace_writer != nullptr);
|
||||
return io_tracer_->StartIOTrace(GetSystemClock(), trace_options,
|
||||
|
@ -129,7 +129,8 @@ class Directories {
|
||||
class DBImpl : public DB {
|
||||
public:
|
||||
DBImpl(const DBOptions& options, const std::string& dbname,
|
||||
const bool seq_per_batch = false, const bool batch_per_txn = true);
|
||||
const bool seq_per_batch = false, const bool batch_per_txn = true,
|
||||
bool read_only = false);
|
||||
// No copying allowed
|
||||
DBImpl(const DBImpl&) = delete;
|
||||
void operator=(const DBImpl&) = delete;
|
||||
@ -469,7 +470,7 @@ class DBImpl : public DB {
|
||||
Status EndBlockCacheTrace() override;
|
||||
|
||||
using DB::StartIOTrace;
|
||||
Status StartIOTrace(Env* env, const TraceOptions& options,
|
||||
Status StartIOTrace(const TraceOptions& options,
|
||||
std::unique_ptr<TraceWriter>&& trace_writer) override;
|
||||
|
||||
using DB::EndIOTrace;
|
||||
@ -1047,6 +1048,56 @@ class DBImpl : public DB {
|
||||
// flush LOG out of application buffer
|
||||
void FlushInfoLog();
|
||||
|
||||
// Interface to block and signal the DB in case of stalling writes by
|
||||
// WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
|
||||
// When DB needs to be blocked or signalled by WriteBufferManager,
|
||||
// state_ is changed accordingly.
|
||||
class WBMStallInterface : public StallInterface {
|
||||
public:
|
||||
enum State {
|
||||
BLOCKED = 0,
|
||||
RUNNING,
|
||||
};
|
||||
|
||||
WBMStallInterface() : state_cv_(&state_mutex_) {
|
||||
MutexLock lock(&state_mutex_);
|
||||
state_ = State::RUNNING;
|
||||
}
|
||||
|
||||
void SetState(State state) {
|
||||
MutexLock lock(&state_mutex_);
|
||||
state_ = state;
|
||||
}
|
||||
|
||||
// Change the state_ to State::BLOCKED and wait until its state is
|
||||
// changed by WriteBufferManager. When stall is cleared, Signal() is
|
||||
// called to change the state and unblock the DB.
|
||||
void Block() override {
|
||||
MutexLock lock(&state_mutex_);
|
||||
while (state_ == State::BLOCKED) {
|
||||
TEST_SYNC_POINT("WBMStallInterface::BlockDB");
|
||||
state_cv_.Wait();
|
||||
}
|
||||
}
|
||||
|
||||
// Called from WriteBufferManager. This function changes the state_
|
||||
// to State::RUNNING indicating the stall is cleared and DB can proceed.
|
||||
void Signal() override {
|
||||
MutexLock lock(&state_mutex_);
|
||||
state_ = State::RUNNING;
|
||||
state_cv_.Signal();
|
||||
}
|
||||
|
||||
private:
|
||||
// Conditional variable and mutex to block and
|
||||
// signal the DB during stalling process.
|
||||
port::Mutex state_mutex_;
|
||||
port::CondVar state_cv_;
|
||||
// state represting whether DB is running or blocked because of stall by
|
||||
// WriteBufferManager.
|
||||
State state_;
|
||||
};
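
  // A hypothetical manager-side counterpart to the interface above
  // (illustrative only; the real logic lives in WriteBufferManager). Once
  // memory usage drops back below the stall threshold, every queued DB is
  // woken through the same StallInterface:
  //
  //   void EndWriteStall(std::list<StallInterface*>* queue) {
  //     for (StallInterface* stalled : *queue) {
  //       stalled->Signal();  // state_ -> RUNNING; wakes the blocked DB
  //     }
  //     queue->clear();
  //   }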

 protected:
  const std::string dbname_;
  std::string db_id_;
@ -1079,6 +1130,14 @@ class DBImpl : public DB {
  ColumnFamilyHandleImpl* default_cf_handle_;
  InternalStats* default_cf_internal_stats_;

  // table_cache_ provides its own synchronization
  std::shared_ptr<Cache> table_cache_;

  ErrorHandler error_handler_;

  // Unified interface for logging events
  EventLogger event_logger_;

  // only used for dynamically adjusting max_total_wal_size. it is a sum of
  // [write_buffer_size * max_write_buffer_number] over all column families
  uint64_t max_total_in_memory_state_;
@ -1109,6 +1168,12 @@ class DBImpl : public DB {
  // Default: true
  const bool batch_per_txn_;

  // Each flush or compaction gets its own job id. this counter makes sure
  // they're unique
  std::atomic<int> next_job_id_;

  std::atomic<bool> shutting_down_;

  // Except in DB::Open(), WriteOptionsFile can only be called when:
  // Persist options to options file.
  // If need_mutex_lock = false, the method will lock DB mutex.
@ -1236,7 +1301,7 @@ class DBImpl : public DB {
  virtual bool OwnTablesAndLogs() const { return true; }

  // Set DB identity file, and write DB ID to manifest if necessary.
  Status SetDBId();
  Status SetDBId(bool read_only);

  // REQUIRES: db mutex held when calling this function, but the db mutex can
  // be released and re-acquired. Db mutex will be held when the function
@ -1308,6 +1373,7 @@ class DBImpl : public DB {

  struct LogFileNumberSize {
    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
    LogFileNumberSize() {}
    void AddSize(uint64_t new_size) { size += new_size; }
    uint64_t number;
    uint64_t size = 0;
@ -1413,6 +1479,7 @@ class DBImpl : public DB {
    DBImpl* db;
    // background compaction takes ownership of `prepicked_compaction`.
    PrepickedCompaction* prepicked_compaction;
    Env::Priority compaction_pri_;
  };

  // Initialize the built-in column family for persistent stats. Depending on
@ -1507,6 +1574,12 @@ class DBImpl : public DB {
  Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                     MemTable* mem, VersionEdit* edit);

  // Get the size of a log file and, if truncate is true, truncate the
  // log file to its actual size, thereby freeing preallocated space.
  // Returns success even if truncation fails.
  Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
                                    LogFileNumberSize* log);

  // Restore alive_log_files_ and total_log_size_ after recovery.
  // It needs to run only when there's no flush during recovery
  // (e.g. avoid_flush_during_recovery=true). May also trigger flush
@ -1517,6 +1590,10 @@ class DBImpl : public DB {
  // `num_bytes` going through.
  Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);

  // Begin stalling writes when memory usage increases beyond a certain
  // threshold.
  void WriteBufferManagerStallWrites();

  Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
                                      WriteBatch* my_batch);

@ -1596,7 +1673,7 @@ class DBImpl : public DB {
  Status SwitchWAL(WriteContext* write_context);

  // REQUIRES: mutex locked and in write thread.
  Status HandleWriteBufferFull(WriteContext* write_context);
  Status HandleWriteBufferManagerFlush(WriteContext* write_context);

  // REQUIRES: mutex locked
  Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
@ -1877,9 +1954,6 @@ class DBImpl : public DB {

  Status IncreaseFullHistoryTsLow(ColumnFamilyData* cfd, std::string ts_low);

  // table_cache_ provides its own synchronization
  std::shared_ptr<Cache> table_cache_;

  // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
  FileLock* db_lock_;

@ -1893,8 +1967,6 @@ class DBImpl : public DB {
  // mutex_, the order should be first mutex_ and then log_write_mutex_.
  InstrumentedMutex log_write_mutex_;

  std::atomic<bool> shutting_down_;

  // If zero, manual compactions are allowed to proceed. If non-zero, manual
  // compactions may still be running, but will quickly fail with
  // `Status::Incomplete`. The value indicates how many threads have paused
@ -2103,10 +2175,6 @@ class DBImpl : public DB {
  // Number of threads intending to write to memtable
  std::atomic<size_t> pending_memtable_writes_ = {};

  // Each flush or compaction gets its own job id. this counter makes sure
  // they're unique
  std::atomic<int> next_job_id_;

  // A flag indicating whether the current rocksdb database has any
  // data that is not yet persisted into either WAL or SST file.
  // Used when disableWAL is true.
@ -2135,9 +2203,6 @@ class DBImpl : public DB {
  WalManager wal_manager_;
#endif  // ROCKSDB_LITE

  // Unified interface for logging events
  EventLogger event_logger_;

  // A value of > 0 temporarily disables scheduling of background work
  int bg_work_paused_;

@ -2205,8 +2270,6 @@ class DBImpl : public DB {
  // Flag to check whether Close() has been called on this DB
  bool closed_;

  ErrorHandler error_handler_;

  // Conditional variable to coordinate installation of atomic flush results.
  // With atomic flush, each bg thread installs the result of flushing multiple
  // column families, and different threads can flush different column
@ -2221,11 +2284,16 @@ class DBImpl : public DB {
  bool wal_in_db_path_;

  BlobFileCompletionCallback blob_callback_;

  // Pointer to the WriteBufferManager stalling interface.
  std::unique_ptr<StallInterface> wbm_stall_;
};

extern Options SanitizeOptions(const std::string& db, const Options& src);
extern Options SanitizeOptions(const std::string& db, const Options& src,
                               bool read_only = false);

extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
                                 bool read_only = false);

extern CompressionType GetCompressionFlush(
    const ImmutableCFOptions& ioptions,

@ -260,18 +260,16 @@ Status DBImpl::FlushMemTableToOutputFile(
    // be pessimistic and try write to a new MANIFEST.
    // TODO: distinguish between MANIFEST write and CURRENT renaming
    if (!versions_->io_status().ok()) {
      if (total_log_size_ > 0) {
        // If the WAL is empty, we use different error reason
        error_handler_.SetBGError(io_s,
                                  BackgroundErrorReason::kManifestWrite);
      } else {
        error_handler_.SetBGError(io_s,
                                  BackgroundErrorReason::kManifestWriteNoWAL);
      }
    } else if (total_log_size_ > 0 || !log_io_s.ok()) {
      error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
      // If the WAL sync is successful (either the WAL size is 0 or there is
      // no IO error), all the Manifest writes will be mapped to soft errors.
      // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
      // needed.
      error_handler_.SetBGError(io_s,
                                BackgroundErrorReason::kManifestWriteNoWAL);
    } else {
      // If the WAL is empty, we use different error reason
      // If the WAL sync is successful (either the WAL size is 0 or there is
      // no IO error), all the other SST file write errors will be set as
      // kFlushNoWAL.
      error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL);
    }
  } else {
@ -687,18 +685,16 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
    // be pessimistic and try write to a new MANIFEST.
    // TODO: distinguish between MANIFEST write and CURRENT renaming
    if (!versions_->io_status().ok()) {
      if (total_log_size_ > 0) {
        // If the WAL is empty, we use different error reason
        error_handler_.SetBGError(io_s,
                                  BackgroundErrorReason::kManifestWrite);
      } else {
        error_handler_.SetBGError(io_s,
                                  BackgroundErrorReason::kManifestWriteNoWAL);
      }
    } else if (total_log_size_ > 0) {
      error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
      // If the WAL sync is successful (either the WAL size is 0 or there is
      // no IO error), all the Manifest writes will be mapped to soft errors.
      // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor
      // is needed.
      error_handler_.SetBGError(io_s,
                                BackgroundErrorReason::kManifestWriteNoWAL);
    } else {
      // If the WAL is empty, we use different error reason
      // If the WAL sync is successful (either the WAL size is 0 or there is
      // no IO error), all the other SST file write errors will be set as
      // kFlushNoWAL.
      error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL);
    }
  } else {
@ -807,6 +803,10 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
                            ColumnFamilyHandle* column_family,
                            const Slice* begin_without_ts,
                            const Slice* end_without_ts) {
  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
  }

  const Comparator* const ucmp = column_family->GetComparator();
  assert(ucmp);
  size_t ts_sz = ucmp->timestamp_size();
@ -1745,6 +1745,7 @@ Status DBImpl::RunManualCompaction(
    }
    ca = new CompactionArg;
    ca->db = this;
    ca->compaction_pri_ = Env::Priority::LOW;
    ca->prepicked_compaction = new PrepickedCompaction;
    ca->prepicked_compaction->manual_compaction_state = &manual;
    ca->prepicked_compaction->compaction = compaction;
@ -2276,6 +2277,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
         unscheduled_compactions_ > 0) {
    CompactionArg* ca = new CompactionArg;
    ca->db = this;
    ca->compaction_pri_ = Env::Priority::LOW;
    ca->prepicked_compaction = nullptr;
    bg_compaction_scheduled_++;
    unscheduled_compactions_--;
@ -2463,7 +2465,16 @@ void DBImpl::BGWorkPurge(void* db) {
}

void DBImpl::UnscheduleCompactionCallback(void* arg) {
  CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
  CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
  Env::Priority compaction_pri = ca_ptr->compaction_pri_;
  if (Env::Priority::BOTTOM == compaction_pri) {
    // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
    ca_ptr->db->bg_bottom_compaction_scheduled_--;
  } else if (Env::Priority::LOW == compaction_pri) {
    // Decrement bg_compaction_scheduled_ if priority is LOW
    ca_ptr->db->bg_compaction_scheduled_--;
  }
  CompactionArg ca = *(ca_ptr);
  delete reinterpret_cast<CompactionArg*>(arg);
  if (ca.prepicked_compaction != nullptr) {
    if (ca.prepicked_compaction->compaction != nullptr) {
@ -2475,6 +2486,14 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) {
}

void DBImpl::UnscheduleFlushCallback(void* arg) {
  // Decrement bg_flush_scheduled_ in the flush callback
  reinterpret_cast<FlushThreadArg*>(arg)->db_->bg_flush_scheduled_--;
  Env::Priority flush_pri = reinterpret_cast<FlushThreadArg*>(arg)->thread_pri_;
  if (Env::Priority::LOW == flush_pri) {
    TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback");
  } else if (Env::Priority::HIGH == flush_pri) {
    TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback");
  }
  delete reinterpret_cast<FlushThreadArg*>(arg);
  TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
}
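
// Assumed shape of the argument these flush callbacks receive (illustrative
// sketch only; the real FlushThreadArg is file-local to db_impl.cc, and the
// field names match the accesses above):
struct FlushThreadArgSketch {
  DBImpl* db_;
  Env::Priority thread_pri_;
};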

@ -2567,6 +2586,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {

  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                       immutable_db_options_.info_log.get());
  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
  {
    InstrumentedMutexLock l(&mutex_);
    assert(bg_flush_scheduled_);
@ -3077,6 +3098,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
      TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
      CompactionArg* ca = new CompactionArg;
      ca->db = this;
      ca->compaction_pri_ = Env::Priority::BOTTOM;
      ca->prepicked_compaction = new PrepickedCompaction;
      ca->prepicked_compaction->compaction = c.release();
      ca->prepicked_compaction->manual_compaction_state = nullptr;

@ -558,8 +558,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
    }

#ifndef ROCKSDB_LITE
    if (type == kWalFile && (immutable_db_options_.wal_ttl_seconds > 0 ||
                             immutable_db_options_.wal_size_limit_mb > 0)) {
    if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
                             immutable_db_options_.WAL_size_limit_MB > 0)) {
      wal_manager_.ArchiveWALFile(fname, number);
      continue;
    }
@ -854,7 +854,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
  return min_log_number_to_keep;
}

Status DBImpl::SetDBId() {
Status DBImpl::SetDBId(bool read_only) {
  Status s;
  // Happens when immutable_db_options_.write_dbid_to_manifest is set to true
  // the very first time.
@ -865,9 +865,15 @@ Status DBImpl::SetDBId() {
    // it is no longer available then at this point DB ID is not in Identity
    // file or Manifest.
    if (s.IsNotFound()) {
      s = SetIdentityFile(env_, dbname_);
      if (!s.ok()) {
        return s;
      // Create a new DB ID, saving to file only if allowed
      if (read_only) {
        db_id_ = env_->GenerateUniqueId();
        return Status::OK();
      } else {
        s = SetIdentityFile(env_, dbname_);
        if (!s.ok()) {
          return s;
        }
      }
    } else if (!s.ok()) {
      assert(s.IsIOError());
@ -884,7 +890,7 @@ Status DBImpl::SetDBId() {
                             mutable_cf_options, &edit, &mutex_, nullptr,
                             /* new_descriptor_log */ false);
    }
  } else {
  } else if (!read_only) {
    s = SetIdentityFile(env_, dbname_, db_id_);
  }
  return s;
@ -937,7 +943,7 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
    return s;
  }

  if (largest_file_number > next_file_number) {
  if (largest_file_number >= next_file_number) {
    versions_->next_file_number_.store(largest_file_number + 1);
  }

@ -24,15 +24,17 @@
#include "util/rate_limiter.h"

namespace ROCKSDB_NAMESPACE {
Options SanitizeOptions(const std::string& dbname, const Options& src) {
  auto db_options = SanitizeOptions(dbname, DBOptions(src));
Options SanitizeOptions(const std::string& dbname, const Options& src,
                        bool read_only) {
  auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only);
  ImmutableDBOptions immutable_db_options(db_options);
  auto cf_options =
      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
  return Options(db_options, cf_options);
}

DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
                          bool read_only) {
  DBOptions result(src);

  if (result.env == nullptr) {
@ -50,7 +52,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
                   &result.max_open_files);
  }

  if (result.info_log == nullptr) {
  if (result.info_log == nullptr && !read_only) {
    Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
    if (!s.ok()) {
      // No place suitable for logging
@ -283,6 +285,9 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
  const std::string manifest = DescriptorFileName(dbname_, 1);
  {
    if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
      fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
    }
    std::unique_ptr<FSWritableFile> file;
    FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
    s = NewWritableFile(fs_.get(), manifest, &file, file_options);
@ -312,7 +317,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
                       manifest.substr(manifest.find_last_of("/\\") + 1));
    }
  } else {
    fs_->DeleteFile(manifest, IOOptions(), nullptr);
    fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
  }
  return s;
}
@ -488,7 +493,7 @@ Status DBImpl::Recover(
    if (!s.ok()) {
      return s;
    }
    s = SetDBId();
    s = SetDBId(read_only);
    if (s.ok() && !read_only) {
      s = DeleteUnreferencedSstFiles();
    }
@ -1132,11 +1137,29 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
       immutable_db_options_.wal_recovery_mode ==
           WALRecoveryMode::kTolerateCorruptedTailRecords)) {
    for (auto cfd : *versions_->GetColumnFamilySet()) {
      if (cfd->GetLogNumber() > corrupted_wal_number) {
      // One special case causes cfd->GetLogNumber() > corrupted_wal_number
      // while the CF is still consistent: if a new column family is created
      // during the flush and the WAL sync fails at the same time, the new CF
      // points to the new WAL but the old WAL is corrupted. Since the new CF
      // is empty, it is still consistent. We add the check of CF SST file
      // size to avoid the false positive alert.

      // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
      // ignoring a very rare inconsistency case caused by data cancellation:
      // one CF is empty due to KV deletion, but those operations are in the
      // WAL. If the WAL is corrupted, the status of this CF might not be
      // consistent with the others. However, the consistency check will be
      // bypassed due to the empty CF.
      // TODO: a better and complete implementation is needed to ensure a
      // strict consistency check in WAL recovery, including handling the
      // tailing issues.
      if (cfd->GetLogNumber() > corrupted_wal_number &&
          cfd->GetLiveSstFilesSize() > 0) {
        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                        "Column family inconsistency: SST file contains data"
                        " beyond the point of corruption.");
        return Status::Corruption("SST file is ahead of WALs");
        return Status::Corruption("SST file is ahead of WALs in CF " +
                                  cfd->GetName());
      }
    }
  }
@ -1229,8 +1252,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
    }
  }

  if (status.ok() && data_seen && !flushed) {
    status = RestoreAliveLogFiles(wal_numbers);
  if (status.ok()) {
    if (data_seen && !flushed) {
      status = RestoreAliveLogFiles(wal_numbers);
    } else {
      // If there's no data in the WAL, or we flushed all the data, still
      // truncate the log file. If the process goes into a crash loop before
      // the file is deleted, the preallocated space will never get freed.
      GetLogSizeAndMaybeTruncate(wal_numbers.back(), true, nullptr)
          .PermitUncheckedError();
    }
  }

  event_logger_.Log() << "job" << job_id << "event"
@ -1239,6 +1270,40 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
  return status;
}

Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
                                          LogFileNumberSize* log_ptr) {
  LogFileNumberSize log(wal_number);
  std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number);
  Status s;
  // This gets the apparent size of the WALs, not including preallocated space.
  s = env_->GetFileSize(fname, &log.size);
  if (s.ok() && truncate) {
    std::unique_ptr<FSWritableFile> last_log;
    Status truncate_status = fs_->ReopenWritableFile(
        fname,
        fs_->OptimizeForLogWrite(
            file_options_,
            BuildDBOptions(immutable_db_options_, mutable_db_options_)),
        &last_log, nullptr);
    if (truncate_status.ok()) {
      truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
    }
    if (truncate_status.ok()) {
      truncate_status = last_log->Close(IOOptions(), nullptr);
    }
    // Not a critical error if we fail to truncate.
    if (!truncate_status.ok()) {
      ROCKS_LOG_WARN(immutable_db_options_.info_log,
                     "Failed to truncate log #%" PRIu64 ": %s", wal_number,
                     truncate_status.ToString().c_str());
    }
  }
  if (log_ptr) {
    *log_ptr = log;
  }
  return s;
}

Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
  if (wal_numbers.empty()) {
    return Status::OK();
@ -1254,39 +1319,17 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
  total_log_size_ = 0;
  log_empty_ = false;
  for (auto wal_number : wal_numbers) {
    LogFileNumberSize log(wal_number);
    std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number);
    // This gets the apparent size of the WALs, not including preallocated
    // space.
    s = env_->GetFileSize(fname, &log.size);
    // We preallocate space for WALs, but after a crash and restart that
    // preallocated space is not needed anymore. It is likely only the last
    // log has such preallocated space, so we only truncate for the last log.
    LogFileNumberSize log;
    s = GetLogSizeAndMaybeTruncate(
        wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
    if (!s.ok()) {
      break;
    }
    total_log_size_ += log.size;
    alive_log_files_.push_back(log);
    // We preallocate space for WALs, but after a crash and restart that
    // preallocated space is not needed anymore. It is likely only the last
    // log has such preallocated space, so we only truncate for the last log.
    if (wal_number == wal_numbers.back()) {
      std::unique_ptr<FSWritableFile> last_log;
      Status truncate_status = fs_->ReopenWritableFile(
          fname,
          fs_->OptimizeForLogWrite(
              file_options_,
              BuildDBOptions(immutable_db_options_, mutable_db_options_)),
          &last_log, nullptr);
      if (truncate_status.ok()) {
        truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
      }
      if (truncate_status.ok()) {
        truncate_status = last_log->Close(IOOptions(), nullptr);
      }
      // Not a critical error if we fail to truncate.
      if (!truncate_status.ok()) {
        ROCKS_LOG_WARN(immutable_db_options_.info_log,
                       "Failed to truncate log #%" PRIu64 ": %s", wal_number,
                       truncate_status.ToString().c_str());
      }
    }
  }
  if (two_write_queues_) {
    log_write_mutex_.Unlock();
@ -1358,11 +1401,10 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
        cfd->GetID(), cfd->GetName(), snapshot_seqs,
        earliest_write_conflict_snapshot, snapshot_checker,
        GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
        mutable_cf_options.sample_for_compression,
        mutable_cf_options.compression_opts, paranoid_file_checks,
        cfd->internal_stats(), TableFileCreationReason::kRecovery, &io_s,
        io_tracer_, &event_logger_, job_id, Env::IO_HIGH,
        nullptr /* table_properties */, -1 /* level */, current_time,
        nullptr /* table_properties */, 0 /* level */, current_time,
        0 /* oldest_key_time */, write_hint, 0 /* file_creation_time */,
        db_id_, db_session_id_, nullptr /*full_history_ts_low*/,
        &blob_callback_);

@ -6,7 +6,7 @@
#include "db/db_impl/db_impl_readonly.h"

#include "db/arena_wrapped_db_iter.h"
#include "db/compacted_db_impl.h"
#include "db/db_impl/compacted_db_impl.h"
#include "db/db_impl/db_impl.h"
#include "db/db_iter.h"
#include "db/merge_context.h"
@ -19,7 +19,8 @@ namespace ROCKSDB_NAMESPACE {

DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
                               const std::string& dbname)
    : DBImpl(db_options, dbname) {
    : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
             /*batch_per_txn*/ true, /*read_only*/ true) {
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Opening the db in read only mode");
  LogFlush(immutable_db_options_.info_log);
@ -131,8 +132,8 @@ Status DBImplReadOnly::NewIterators(
}

namespace {
// Return OK if dbname exists in the file system
// or create_if_missing is false
// Return OK if dbname exists in the file system, or create it if
// create_if_missing is set
Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
                                     const std::string& dbname) {
  Status s;
@ -143,6 +144,9 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
    uint64_t manifest_file_number;
    s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
                                           &manifest_file_number);
  } else {
    // Historic behavior that doesn't necessarily make sense
    s = db_options.env->CreateDirIfMissing(dbname);
  }
  return s;
}
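
// Illustrative caller of the existence check above, via the public API
// (sketch only; the path and function name below are hypothetical): opening
// read-only now fails fast when the directory has no CURRENT/MANIFEST instead
// of silently creating an empty one.
static Status TryOpenReadOnlySketch(const std::string& path) {
  DB* db = nullptr;
  Status s = DB::OpenForReadOnly(Options(), path, &db);
  // s is non-OK when `path` did not contain a DB; nothing was created.
  delete db;
  return s;
}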
@ -150,7 +154,6 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,

Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
                           DB** dbptr, bool /*error_if_wal_file_exists*/) {
  // If dbname does not exist in the file system, we should not do anything
  Status s = OpenForReadOnlyCheckExistence(options, dbname);
  if (!s.ok()) {
    return s;

@ -17,8 +17,10 @@ namespace ROCKSDB_NAMESPACE {

#ifndef ROCKSDB_LITE
DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
                                 const std::string& dbname)
    : DBImpl(db_options, dbname) {
                                 const std::string& dbname,
                                 std::string secondary_path)
    : DBImpl(db_options, dbname, false, true, true),
      secondary_path_(std::move(secondary_path)) {
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Opening the db in secondary mode");
  LogFlush(immutable_db_options_.info_log);
@ -617,7 +619,7 @@ Status DB::OpenAsSecondary(
  }

  handles->clear();
  DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname);
  DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
  impl->versions_.reset(new ReactiveVersionSet(
      dbname, &impl->immutable_db_options_, impl->file_options_,
      impl->table_cache_.get(), impl->write_buffer_manager_,
@ -663,6 +665,86 @@ Status DB::OpenAsSecondary(
  }
  return s;
}

Status DBImplSecondary::CompactWithoutInstallation(
    ColumnFamilyHandle* cfh, const CompactionServiceInput& input,
    CompactionServiceResult* result) {
  InstrumentedMutexLock l(&mutex_);
  auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
  if (!cfd) {
    return Status::InvalidArgument("Cannot find column family " +
                                   cfh->GetName());
  }

  std::unordered_set<uint64_t> input_set;
  for (const auto& file_name : input.input_files) {
    input_set.insert(TableFileNameToNumber(file_name));
  }

  auto* version = cfd->current();

  ColumnFamilyMetaData cf_meta;
  version->GetColumnFamilyMetaData(&cf_meta);

  const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions();
  ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions();
  VersionStorageInfo* vstorage = version->storage_info();

  // Use comp_options to reuse some CompactFiles functions
  CompactionOptions comp_options;
  comp_options.compression = kDisableCompressionOption;
  comp_options.output_file_size_limit = MaxFileSizeForLevel(
      *mutable_cf_options, input.output_level, cf_options.compaction_style,
      vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes);

  std::vector<CompactionInputFiles> input_files;
  Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
      &input_files, &input_set, vstorage, comp_options);
  if (!s.ok()) {
    return s;
  }

  std::unique_ptr<Compaction> c;
  assert(cfd->compaction_picker());
  c.reset(cfd->compaction_picker()->CompactFiles(
      comp_options, input_files, input.output_level, vstorage,
      *mutable_cf_options, mutable_db_options_, 0));
  assert(c != nullptr);

  c->SetInputVersion(version);

  // Create the output directory if it does not exist yet
  std::unique_ptr<FSDirectory> output_dir;
  s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
  if (!s.ok()) {
    return s;
  }

  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                       immutable_db_options_.info_log.get());

  const int job_id = next_job_id_.fetch_add(1);

  CompactionServiceCompactionJob compaction_job(
      job_id, c.get(), immutable_db_options_, file_options_for_compaction_,
      versions_.get(), &shutting_down_, &log_buffer, output_dir.get(), stats_,
      &mutex_, &error_handler_, input.snapshots, table_cache_, &event_logger_,
      dbname_, io_tracer_, db_id_, db_session_id_, secondary_path_, input,
      result);

  mutex_.Unlock();
  s = compaction_job.Run();
  mutex_.Lock();

  // clean up
  compaction_job.io_status().PermitUncheckedError();
  compaction_job.CleanupCompaction();
  c->ReleaseCompactionFiles(s);
  c.reset();

  return s;
}

#else  // !ROCKSDB_LITE

Status DB::OpenAsSecondary(const Options& /*options*/,

@ -71,7 +71,8 @@ class LogReaderContainer {
// effort attempts to catch up with the primary.
class DBImplSecondary : public DBImpl {
 public:
  DBImplSecondary(const DBOptions& options, const std::string& dbname);
  DBImplSecondary(const DBOptions& options, const std::string& dbname,
                  std::string secondary_path);
  ~DBImplSecondary() override;

  // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
@ -222,6 +223,14 @@ class DBImplSecondary : public DBImpl {
  // not flag the missing file as an inconsistency.
  Status CheckConsistency() override;

#ifndef NDEBUG
  Status TEST_CompactWithoutInstallation(ColumnFamilyHandle* cfh,
                                         const CompactionServiceInput& input,
                                         CompactionServiceResult* result) {
    return CompactWithoutInstallation(cfh, input, result);
  }
#endif  // NDEBUG

 protected:
  // ColumnFamilyCollector is a write batch handler which does nothing
  // except recording unique column family IDs
@ -316,6 +325,13 @@ class DBImplSecondary : public DBImpl {
                           std::unordered_set<ColumnFamilyData*>* cfds_changed,
                           JobContext* job_context);

  // Run a compaction without installation; the output files will be placed in
  // the secondary DB path. The LSM tree won't be changed; the secondary DB is
  // still in read-only mode.
  Status CompactWithoutInstallation(ColumnFamilyHandle* cfh,
                                    const CompactionServiceInput& input,
                                    CompactionServiceResult* result);

  std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
  std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
  std::unique_ptr<Status> manifest_reader_status_;
@ -326,6 +342,8 @@ class DBImplSecondary : public DBImpl {

  // Current WAL number replayed for each column family.
  std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;

  const std::string secondary_path_;
};

}  // namespace ROCKSDB_NAMESPACE

@ -937,7 +937,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
    // be flushed. We may end up flushing many more DBs than needed. It's
    // suboptimal but still correct.
    WaitForPendingWrites();
    status = HandleWriteBufferFull(write_context);
    status = HandleWriteBufferManagerFlush(write_context);
  }

  if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
@ -964,6 +964,20 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
    PERF_TIMER_START(write_pre_and_post_process_time);
  }

  // If memory usage has exceeded a certain threshold,
  // write_buffer_manager_->ShouldStall() returns true to all threads writing
  // to all DBs, and the writers will be stalled.
  // It does a soft check because WriteBufferManager::buffer_limit_ has
  // already been exceeded at this point, so no new write (including the
  // current one) will go through until memory usage is decreased.
  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
    if (write_options.no_slowdown) {
      status = Status::Incomplete("Write stall");
    } else {
      WriteBufferManagerStallWrites();
    }
  }
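
  // Caller-side sketch of the no_slowdown escape hatch above (illustrative
  // only; WriteOptions::no_slowdown is public API): a writer that prefers
  // failing fast over blocking on a write-buffer-manager stall.
  //
  //   WriteOptions wo;
  //   wo.no_slowdown = true;
  //   Status s = db->Put(wo, "key", "value");
  //   if (s.IsIncomplete()) {
  //     // stalled; shed load or retry later instead of blocking
  //   }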
|
||||
|
||||
if (status.ok() && *need_log_sync) {
|
||||
// Wait until the parallel syncs are finished. Any sync process has to sync
|
||||
// the front log too so it is enough to check the status of front()
|
||||
@ -1348,20 +1362,20 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
||||
if (!immutable_db_options_.atomic_flush) {
|
||||
FlushRequest flush_req;
|
||||
GenerateFlushRequest({cfd}, &flush_req);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWalFull);
|
||||
}
|
||||
}
|
||||
if (immutable_db_options_.atomic_flush) {
|
||||
FlushRequest flush_req;
|
||||
GenerateFlushRequest(cfds, &flush_req);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWalFull);
|
||||
}
|
||||
MaybeScheduleFlushOrCompaction();
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
|
||||
Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
|
||||
mutex_.AssertHeld();
|
||||
assert(write_context != nullptr);
|
||||
Status status;
|
||||
@ -1373,7 +1387,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
|
||||
// suboptimal but still correct.
|
||||
ROCKS_LOG_INFO(
|
||||
immutable_db_options_.info_log,
|
||||
"Flushing column family with oldest memtable entry. Write buffer is "
|
||||
"Flushing column family with oldest memtable entry. Write buffers are "
|
||||
"using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".",
|
||||
write_buffer_manager_->memory_usage(),
|
||||
write_buffer_manager_->buffer_size());
|
||||
@ -1434,13 +1448,13 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
|
||||
if (!immutable_db_options_.atomic_flush) {
|
||||
FlushRequest flush_req;
|
||||
GenerateFlushRequest({cfd}, &flush_req);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
||||
}
|
||||
}
|
||||
if (immutable_db_options_.atomic_flush) {
|
||||
FlushRequest flush_req;
|
||||
GenerateFlushRequest(cfds, &flush_req);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
|
||||
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
|
||||
}
|
||||
MaybeScheduleFlushOrCompaction();
|
||||
}
|
||||
@ -1536,6 +1550,29 @@ Status DBImpl::DelayWrite(uint64_t num_bytes,
|
||||
return s;
|
||||
}
|
||||
|
||||
// REQUIRES: mutex_ is held
|
||||
// REQUIRES: this thread is currently at the front of the writer queue
|
||||
void DBImpl::WriteBufferManagerStallWrites() {
|
||||
mutex_.AssertHeld();
|
||||
// First block future writer threads who want to add themselves to the queue
|
||||
// of WriteThread.
|
||||
write_thread_.BeginWriteStall();
|
||||
mutex_.Unlock();
|
||||
|
||||
// Change the state to State::Blocked.
|
||||
static_cast<WBMStallInterface*>(wbm_stall_.get())
|
||||
->SetState(WBMStallInterface::State::BLOCKED);
|
||||
// Then WriteBufferManager will add DB instance to its queue
|
||||
// and block this thread by calling WBMStallInterface::Block().
|
||||
write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
|
||||
wbm_stall_->Block();
|
||||
|
||||
mutex_.Lock();
|
||||
// Stall has ended. Signal writer threads so that they can add
|
||||
// themselves to the WriteThread queue for writes.
|
||||
write_thread_.EndWriteStall();
|
||||
}
|
||||
|
||||
Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
|
||||
WriteBatch* my_batch) {
|
||||
assert(write_options.low_pri);
|
||||
|
@ -45,10 +45,10 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
|
||||
ColumnFamilyData* cfd, bool expose_blob_index)
|
||||
: prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
|
||||
env_(_env),
|
||||
clock_(_env->GetSystemClock().get()),
|
||||
clock_(cf_options.clock),
|
||||
logger_(cf_options.info_log),
|
||||
user_comparator_(cmp),
|
||||
merge_operator_(cf_options.merge_operator),
|
||||
merge_operator_(cf_options.merge_operator.get()),
|
||||
iter_(iter),
|
||||
version_(version),
|
||||
read_callback_(read_callback),
|
||||
@ -1343,7 +1343,7 @@ void DBIter::Seek(const Slice& target) {
|
||||
// we need to find out the next key that is visible to the user.
|
||||
ClearSavedValue();
|
||||
if (prefix_same_as_start_) {
|
||||
// The case where the iterator needs to be invalidated if it has exausted
|
||||
// The case where the iterator needs to be invalidated if it has exhausted
|
||||
// keys within the same prefix of the seek key.
|
||||
assert(prefix_extractor_ != nullptr);
|
||||
Slice target_prefix = prefix_extractor_->Transform(target);
|
||||
@ -1418,7 +1418,7 @@ void DBIter::SeekForPrev(const Slice& target) {
|
||||
// backward direction.
|
||||
ClearSavedValue();
|
||||
if (prefix_same_as_start_) {
|
||||
// The case where the iterator needs to be invalidated if it has exausted
|
||||
// The case where the iterator needs to be invalidated if it has exhausted
|
||||
// keys within the same prefix of the seek key.
|
||||
assert(prefix_extractor_ != nullptr);
|
||||
Slice target_prefix = prefix_extractor_->Transform(target);
|
||||
|
@ -235,7 +235,7 @@ class DBIter final : public Iterator {
|
||||
// If `skipping_saved_key` is true, the function will keep iterating until it
|
||||
// finds a user key that is larger than `saved_key_`.
|
||||
// If `prefix` is not null, the iterator needs to stop when all keys for the
|
||||
// prefix are exhausted and the interator is set to invalid.
|
||||
// prefix are exhausted and the iterator is set to invalid.
|
||||
bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
|
||||
// Internal implementation of FindNextUserEntry().
|
||||
bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
|
||||
|
@ -1175,6 +1175,61 @@ class CountingDeleteTabPropCollectorFactory
|
||||
}
|
||||
};
|
||||
|
||||
class BlockCountingTablePropertiesCollector : public TablePropertiesCollector {
|
||||
public:
|
||||
static const std::string kNumSampledBlocksPropertyName;
|
||||
|
||||
const char* Name() const override {
|
||||
return "BlockCountingTablePropertiesCollector";
|
||||
}
|
||||
|
||||
Status Finish(UserCollectedProperties* properties) override {
|
||||
(*properties)[kNumSampledBlocksPropertyName] =
|
||||
ToString(num_sampled_blocks_);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
|
||||
EntryType /*type*/, SequenceNumber /*seq*/,
|
||||
uint64_t /*file_size*/) override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void BlockAdd(uint64_t /* block_raw_bytes */,
|
||||
uint64_t block_compressed_bytes_fast,
|
||||
uint64_t block_compressed_bytes_slow) override {
|
||||
if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) {
|
||||
num_sampled_blocks_++;
|
||||
}
|
||||
}
|
||||
|
||||
UserCollectedProperties GetReadableProperties() const override {
|
||||
return UserCollectedProperties{
|
||||
{kNumSampledBlocksPropertyName, ToString(num_sampled_blocks_)},
|
||||
};
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t num_sampled_blocks_ = 0;
|
||||
};
|
||||
|
||||
const std::string
|
||||
BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName =
|
||||
"NumSampledBlocks";
|
||||
|
||||
class BlockCountingTablePropertiesCollectorFactory
|
||||
: public TablePropertiesCollectorFactory {
|
||||
public:
|
||||
const char* Name() const override {
|
||||
return "BlockCountingTablePropertiesCollectorFactory";
|
||||
}
|
||||
|
||||
TablePropertiesCollector* CreateTablePropertiesCollector(
|
||||
TablePropertiesCollectorFactory::Context /* context */) override {
|
||||
return new BlockCountingTablePropertiesCollector();
|
||||
}
|
||||
};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
|
||||
Options options = CurrentOptions();
|
||||
@ -1413,6 +1468,132 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
|
||||
}
|
||||
}
|
||||
|
||||
// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
|
||||
TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
|
||||
// Sampled compression requires at least one of the following four types.
|
||||
if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
|
||||
!ZSTD_Supported()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Options options = CurrentOptions();
|
||||
options.disable_auto_compactions = true;
|
||||
options.table_properties_collector_factories.emplace_back(
|
||||
std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
|
||||
|
||||
for (bool sample_for_compression : {false, true}) {
|
||||
// For simplicity/determinism, sample 100% when enabled, or 0% when disabled
|
||||
options.sample_for_compression = sample_for_compression ? 1 : 0;
|
||||
|
||||
DestroyAndReopen(options);
|
||||
|
||||
  // Set up the following LSM:
  //
  // L0_0 ["a", "b"]
  // L1_0 ["a", "b"]
  //
  // L0_0 was created by flush. L1_0 was created by compaction. Each file
  // contains one data block.
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put("a", "val"));
    ASSERT_OK(Put("b", "val"));
    ASSERT_OK(Flush());
    if (i == 1) {
      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
    }
  }

  // A `BlockAdd()` should have been seen for files generated by flush or
  // compaction when `sample_for_compression` is enabled.
  TablePropertiesCollection file_to_props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
  ASSERT_EQ(2, file_to_props.size());
  for (const auto& file_and_props : file_to_props) {
    auto& user_props = file_and_props.second->user_collected_properties;
    ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
                                    kNumSampledBlocksPropertyName) !=
                user_props.end());
    ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
                                kNumSampledBlocksPropertyName),
              ToString(sample_for_compression ? 1 : 0));
    }
  }
}

class CompressionSamplingDBPropertiesTest
    : public DBPropertiesTest,
      public ::testing::WithParamInterface<bool> {
 public:
  CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {}

 protected:
  const bool fast_;
};

INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest,
                        CompressionSamplingDBPropertiesTest,
                        ::testing::Bool());

// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
TEST_P(CompressionSamplingDBPropertiesTest,
       EstimateDataSizeWithCompressionSampling) {
  Options options = CurrentOptions();
  if (fast_) {
    // One of the following light compression libraries must be present.
    if (LZ4_Supported()) {
      options.compression = kLZ4Compression;
    } else if (Snappy_Supported()) {
      options.compression = kSnappyCompression;
    } else {
      return;
    }
  } else {
    // One of the following heavy compression libraries must be present.
    if (ZSTD_Supported()) {
      options.compression = kZSTD;
    } else if (Zlib_Supported()) {
      options.compression = kZlibCompression;
    } else {
      return;
    }
  }
  options.disable_auto_compactions = true;
  // For simplicity/determinism, sample 100%.
  options.sample_for_compression = 1;
  Reopen(options);

  // Set up the following LSM:
  //
  // L0_0 ["a", "b"]
  // L1_0 ["a", "b"]
  //
  // L0_0 was created by flush. L1_0 was created by compaction. Each file
  // contains one data block. The value consists of compressible data so the
  // data block should be stored compressed.
  std::string val(1024, 'a');
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put("a", val));
    ASSERT_OK(Put("b", val));
    ASSERT_OK(Flush());
    if (i == 1) {
      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
    }
  }

  TablePropertiesCollection file_to_props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
  ASSERT_EQ(2, file_to_props.size());
  for (const auto& file_and_props : file_to_props) {
    ASSERT_GT(file_and_props.second->data_size, 0);
    if (fast_) {
      ASSERT_EQ(file_and_props.second->data_size,
                file_and_props.second->fast_compression_estimated_data_size);
    } else {
      ASSERT_EQ(file_and_props.second->data_size,
                file_and_props.second->slow_compression_estimated_data_size);
    }
  }
}
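
Both tests above are driven by a single knob, Options::sample_for_compression, plus the estimated-size entries in TableProperties. A minimal standalone sketch of enabling it outside the test harness (the DB path and the choice of ZSTD are illustrative assumptions, not part of this diff):

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  options.compression = kZSTD;         // assumes ZSTD support is compiled in
  options.sample_for_compression = 1;  // sample 1 out of every 1 blocks

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/compression_sampling_demo", &db);
  assert(s.ok());
  // After flushes/compactions, per-file estimates are readable through
  // DB::GetPropertiesOfAllTables() as
  // TableProperties::{fast,slow}_compression_estimated_data_size.
  delete db;
  return 0;
}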

TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
  Options options = CurrentOptions();
  Reopen(options);
@ -1445,6 +1626,7 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {

  options.compaction_style = kCompactionStyleFIFO;
  options.ttl = 300;
  options.max_open_files = -1;
  options.compaction_options_fifo.allow_compaction = false;
  DestroyAndReopen(options);

@ -73,6 +73,15 @@ TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
  } while (ChangeOptions(kRangeDelSkipConfigs));
}

TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
  Options opts = CurrentOptions();
  opts.compression_opts.max_dict_bytes = 16384;
  Reopen(opts);
  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
                             "dr2"));
  ASSERT_OK(db_->Flush(FlushOptions()));
}

TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
  do {
    Options opts = CurrentOptions();

@ -147,6 +147,206 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) {
  ASSERT_EQ(2, count);
}

TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
  Options options;
  options.env = env_;
  Reopen(options);
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
    ASSERT_OK(Flush());
  }
  CompactionServiceInput input;

  ColumnFamilyMetaData meta;
  db_->GetColumnFamilyMetaData(&meta);
  for (auto& file : meta.levels[0].files) {
    ASSERT_EQ(0, meta.levels[0].level);
    input.input_files.push_back(file.name);
  }
  ASSERT_EQ(input.input_files.size(), 3);

  input.output_level = 1;
  Close();

  options.max_open_files = -1;
  OpenSecondary(options);
  auto cfh = db_secondary_->DefaultColumnFamily();

  CompactionServiceResult result;
  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input,
                                                                 &result));

  ASSERT_EQ(result.output_files.size(), 1);
  InternalKey smallest, largest;
  smallest.DecodeFrom(result.output_files[0].smallest_internal_key);
  largest.DecodeFrom(result.output_files[0].largest_internal_key);
  ASSERT_EQ(smallest.user_key().ToString(), "bar");
  ASSERT_EQ(largest.user_key().ToString(), "foo");
  ASSERT_EQ(result.output_level, 1);
  ASSERT_EQ(result.output_path, this->secondary_path_);
  ASSERT_EQ(result.num_output_records, 2);
  ASSERT_GT(result.bytes_written, 0);
}
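
TEST_CompactWithoutInstallation() is an internal test-only hook, but the secondary instance it runs on comes from the public DB::OpenAsSecondary() API. A minimal sketch of that mode (both paths are hypothetical):

#include <cassert>
#include <string>
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

void RunFollower() {
  Options options;
  options.max_open_files = -1;  // currently required for secondary instances
  DB* secondary = nullptr;
  // First path: the primary's DB directory. Second path: a private directory
  // for the secondary's own info log.
  Status s = DB::OpenAsSecondary(options, "/primary/db", "/secondary/db",
                                 &secondary);
  if (s.ok()) {
    std::string value;
    // Read-only access; returns NotFound if the key is absent.
    Status get_status = secondary->Get(ReadOptions(), "foo", &value);
    assert(get_status.ok() || get_status.IsNotFound());
    // Replays the primary's newer MANIFEST/WAL state on demand.
    Status catch_up_status = secondary->TryCatchUpWithPrimary();
    assert(catch_up_status.ok());
    delete secondary;
  }
}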

TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
  Options options;
  options.env = env_;
  options.disable_auto_compactions = true;
  Reopen(options);
  const int kRangeL2 = 10;
  const int kRangeL1 = 30;
  for (int i = 0; i < 10; i++) {
    ASSERT_OK(Put(Key(i * kRangeL2), "value" + ToString(i)));
    ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + ToString(i)));
    ASSERT_OK(Flush());
  }
  MoveFilesToLevel(2);
  for (int i = 0; i < 5; i++) {
    ASSERT_OK(Put(Key(i * kRangeL1), "value" + ToString(i)));
    ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + ToString(i)));
    ASSERT_OK(Flush());
  }
  MoveFilesToLevel(1);
  for (int i = 0; i < 4; i++) {
    ASSERT_OK(Put(Key(i * 30), "value" + ToString(i)));
    ASSERT_OK(Put(Key(i * 30 + 50), "value" + ToString(i)));
    ASSERT_OK(Flush());
  }

  ColumnFamilyMetaData meta;
  db_->GetColumnFamilyMetaData(&meta);

  // Pick 2 files on level 0 for compaction; they overlap 3 files on L1.
  CompactionServiceInput input1;
  input1.input_files.push_back(meta.levels[0].files[2].name);
  input1.input_files.push_back(meta.levels[0].files[3].name);
  input1.input_files.push_back(meta.levels[1].files[0].name);
  input1.input_files.push_back(meta.levels[1].files[1].name);
  input1.input_files.push_back(meta.levels[1].files[2].name);

  input1.output_level = 1;

  options.max_open_files = -1;
  Close();

  OpenSecondary(options);
  auto cfh = db_secondary_->DefaultColumnFamily();
  CompactionServiceResult result;
  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input1,
                                                                 &result));

  // Pick 2 files on level 1 for compaction; they overlap 6 files on L2.
  CompactionServiceInput input2;
  input2.input_files.push_back(meta.levels[1].files[1].name);
  input2.input_files.push_back(meta.levels[1].files[2].name);
  for (int i = 3; i < 9; i++) {
    input2.input_files.push_back(meta.levels[2].files[i].name);
  }

  input2.output_level = 2;
  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2,
                                                                 &result));

  CloseSecondary();

  // Delete all L2 files without updating the manifest.
  for (auto& file : meta.levels[2].files) {
    ASSERT_OK(env_->DeleteFile(dbname_ + file.name));
  }
  OpenSecondary(options);
  cfh = db_secondary_->DefaultColumnFamily();
  Status s = db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2,
                                                                  &result);
  ASSERT_TRUE(s.IsInvalidArgument());

  // TODO: L0 -> L1 compaction should succeed; currently the version is not
  // built if files are missing.
  // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh,
  // input1, &result));
}

TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
  Options options;
  options.env = env_;
  options.level0_file_num_compaction_trigger = 4;
  Reopen(options);
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
    ASSERT_OK(Flush());
  }
  CompactionServiceInput input;

  ColumnFamilyMetaData meta;
  db_->GetColumnFamilyMetaData(&meta);
  for (auto& file : meta.levels[0].files) {
    ASSERT_EQ(0, meta.levels[0].level);
    input.input_files.push_back(file.name);
  }
  ASSERT_EQ(input.input_files.size(), 3);

  input.output_level = 1;

  // Trigger a compaction so the input files are already compacted away before
  // the secondary instance tries to compact them.
  ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
  ASSERT_OK(Put("bar", "bar_value" + std::to_string(3)));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  Close();

  options.max_open_files = -1;
  OpenSecondary(options);
  auto cfh = db_secondary_->DefaultColumnFamily();

  CompactionServiceResult result;
  Status s =
      db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result);
  ASSERT_TRUE(s.IsInvalidArgument());
}

TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
  Options options;
  options.env = env_;
  options.level0_file_num_compaction_trigger = 4;
  Reopen(options);
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
    ASSERT_OK(Flush());
  }
  CompactionServiceInput input;

  ColumnFamilyMetaData meta;
  db_->GetColumnFamilyMetaData(&meta);
  for (auto& file : meta.levels[0].files) {
    ASSERT_EQ(0, meta.levels[0].level);
    input.input_files.push_back(file.name);
  }
  ASSERT_EQ(input.input_files.size(), 3);

  input.output_level = 1;

  Close();

  ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0]));

  options.max_open_files = -1;
  OpenSecondary(options);
  auto cfh = db_secondary_->DefaultColumnFamily();

  CompactionServiceResult result;
  Status s =
      db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result);
  ASSERT_TRUE(s.IsInvalidArgument());

  input.input_files.erase(input.input_files.begin());

  ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input,
                                                                 &result));
}

TEST_F(DBSecondaryTest, OpenAsSecondary) {
  Options options;
  options.env = env_;

@ -561,6 +561,7 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
  constexpr Slice* end = nullptr;

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
  sfm->WaitForEmptyTrash();

  ASSERT_EQ(Get(first_key), first_value);
  ASSERT_EQ(Get(second_key), second_value);
@ -593,6 +594,7 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {

  Close();
  ASSERT_OK(DestroyDB(dbname_, options));
  sfm->WaitForEmptyTrash();
  ASSERT_EQ(files_deleted, 5);
  ASSERT_EQ(files_scheduled_to_delete, 5);

@ -751,10 +753,11 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) {
}

class DBWALTestWithParam
    : public DBSSTTest,
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<std::string, bool>> {
 public:
  DBWALTestWithParam() {
  explicit DBWALTestWithParam()
      : DBTestBase("/db_wal_test_with_params", /*env_do_fsync=*/true) {
    wal_dir_ = std::get<0>(GetParam());
    wal_dir_same_as_dbname_ = std::get<1>(GetParam());
  }
@ -1088,6 +1091,12 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "BuildTable::AfterDeleteFile",
      [&](void* /*arg*/) { delete_blob_file = true; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {
          "BuildTable::AfterDeleteFile",
          "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1",
      },
  });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

@ -1095,6 +1104,8 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) {
  // This flush will fail
  ASSERT_NOK(Flush());
  ASSERT_TRUE(max_allowed_space_reached);

  TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1");
  ASSERT_TRUE(delete_blob_file);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
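
The max-space hunks above hinge on SstFileManager's space cap; a minimal sketch of that setup (the byte values are arbitrary assumptions for illustration):

#include <memory>
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

using namespace ROCKSDB_NAMESPACE;

void CapDbSpace(Env* env, Options& options) {
  // The manager tracks live SST (and blob) file sizes for the DB.
  std::shared_ptr<SstFileManager> sfm(NewSstFileManager(env));
  // Once a flush or compaction would push usage past this cap, it fails,
  // which is what the ASSERT_NOK(Flush()) above is waiting for.
  sfm->SetMaxAllowedSpaceUsage(100 * 1024);  // bytes
  options.sst_file_manager = sfm;
}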

@ -908,6 +908,9 @@ TEST_F(DBTest, FlushSchedule) {
      static_cast<int64_t>(options.write_buffer_size);
  options.max_write_buffer_number = 2;
  options.write_buffer_size = 120 * 1024;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull;
  options.listeners.push_back(flush_listener);
  CreateAndReopenWithCF({"pikachu"}, options);
  std::vector<port::Thread> threads;

@ -3504,17 +3507,21 @@ TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
}

// Check that FIFO-with-TTL is not supported with max_open_files != -1.
// Github issue #8014
TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
  Options options;
  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleFIFO;
  options.create_if_missing = true;
  options.ttl = 600;  // seconds

  // TTL is now supported with max_open_files != -1.
  options.max_open_files = 100;
  options = CurrentOptions(options);
  ASSERT_OK(TryReopen(options));
  // TTL is not supported with max_open_files != -1.
  options.max_open_files = 0;
  ASSERT_TRUE(TryReopen(options).IsNotSupported());

  options.max_open_files = 100;
  ASSERT_TRUE(TryReopen(options).IsNotSupported());

  // TTL is supported with unlimited max_open_files
  options.max_open_files = -1;
  ASSERT_OK(TryReopen(options));
}
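
This hunk tracks a behavior change: per the added lines, FIFO-with-TTL now opens fine with a bounded max_open_files, where it previously required -1 (see Github issue #8014). A minimal configuration sketch under that reading (values are illustrative):

#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

Options MakeFifoTtlOptions() {
  Options options;
  options.create_if_missing = true;
  options.compaction_style = kCompactionStyleFIFO;
  options.ttl = 600;  // seconds; SST files older than this become deletable
  options.max_open_files = 100;  // a bounded table cache is now accepted
  return options;
}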
@ -6694,20 +6701,19 @@ TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
  Reopen(options);
  Random rnd(301);
  bool memory_limit_exceeded = false;
  uint64_t size_all_mem_table = 0;
  uint64_t cur_active_mem = 0;

  ColumnFamilyData* cfd =
      static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();

  for (int i = 0; i < 1000; i++) {
    std::string value = rnd.RandomString(1000);
    ASSERT_OK(Put("keykey_" + std::to_string(i), value));

    dbfull()->TEST_WaitForFlushMemTable();

    ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(),
                                    DB::Properties::kSizeAllMemTables,
                                    &size_all_mem_table));
    ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(),
                                    DB::Properties::kCurSizeActiveMemTable,
                                    &cur_active_mem));
    const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
    const uint64_t size_all_mem_table =
        cur_active_mem + cfd->imm()->ApproximateMemoryUsage();

    // Errors out if memory usage keeps on increasing beyond the limit.
    // Once the memory limit is exceeded, memory_limit_exceeded is set and if

db/db_test2.cc
@ -344,6 +344,10 @@ class DBTestSharedWriteBufferAcrossCFs
TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;

  // Avoid nondeterministic values from malloc_usable_size();
  // Force arena block size to 1
@ -387,6 +391,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {

  // Create some data and flush "default" and "nikitich" so that they
  // are the more recently created CFs.
  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  Flush(3);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
@ -397,6 +402,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
            static_cast<uint64_t>(1));

  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
@ -521,6 +527,10 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;
  // Avoid nondeterministic values from malloc_usable_size();
  // Force arena block size to 1
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
@ -558,6 +568,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
  };

  // Trigger a flush on cf2
  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
@ -4116,7 +4127,7 @@ TEST_F(DBTest2, TraceWithFilter) {

  // Open another db, replay, and verify the data
  std::string value;
  std::string dbname2 = test::TmpDir(env_) + "/db_replay";
  std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
@ -4167,7 +4178,7 @@ TEST_F(DBTest2, TraceWithFilter) {
  ASSERT_OK(DestroyDB(dbname2, options));

  // Set up a new db.
  std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read";
  std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
  ASSERT_OK(DestroyDB(dbname3, options));

  DB* db3_init = nullptr;
@ -4584,7 +4595,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
  Options options = CurrentOptions();
  std::vector<std::string> dbnames;
  for (int i = 0; i < kNumDbs; ++i) {
    dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i));
    dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i)));
    ASSERT_OK(DestroyDB(dbnames.back(), options));
  }

@ -5428,6 +5439,98 @@ TEST_F(DBTest2, AutoPrefixMode1) {
    ASSERT_EQ("a1", iterator->key().ToString());
  }
}

class RenameCurrentTest : public DBTestBase,
                          public testing::WithParamInterface<std::string> {
 public:
  RenameCurrentTest()
      : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
        sync_point_(GetParam()) {}

  ~RenameCurrentTest() override {}

  void SetUp() override {
    env_->no_file_overwrite_.store(true, std::memory_order_release);
  }

  void TearDown() override {
    env_->no_file_overwrite_.store(false, std::memory_order_release);
  }

  void SetupSyncPoints() {
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
      Status* s = reinterpret_cast<Status*>(arg);
      assert(s);
      *s = Status::IOError("Injected IO error.");
    });
  }

  const std::string sync_point_;
};

INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
                        ::testing::Values("SetCurrentFile:BeforeRename",
                                          "SetCurrentFile:AfterRename"));

TEST_P(RenameCurrentTest, Open) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.create_if_missing = true;
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  Status s = TryReopen(options);
  ASSERT_NOK(s);

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
}

TEST_P(RenameCurrentTest, Flush) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("key", "value"));
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(Flush());

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("value", Get("key"));
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}

TEST_P(RenameCurrentTest, Compaction) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("a", "a_value"));
  ASSERT_OK(Put("c", "c_value"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("b", "b_value"));
  ASSERT_OK(Put("d", "d_value"));
  ASSERT_OK(Flush());

  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
                               /*end=*/nullptr));

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("NOT_FOUND", Get("foo"));
  ASSERT_EQ("d_value", Get("d"));
}
#endif  // ROCKSDB_LITE

// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
@ -5455,6 +5558,35 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
  Status s = TryReopen(options);
  ASSERT_TRUE(s.IsIOError());
}

TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BackgroundCallFlush:Start:1",
        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
        "DBImpl::BackgroundCallFlush:Start:2"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  CreateColumnFamilies({"test1"}, Options());
  ASSERT_OK(Put("foo", "bar"));

  // Create a CF while a flush is in progress; the log is synced but the
  // closed log file is not synced and gets corrupted.
  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
  CreateColumnFamilies({"test2"}, Options());
  env_->corrupt_in_sync_ = true;
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
  flush_thread.join();
  env_->corrupt_in_sync_ = false;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  // Reopening the DB should not corrupt anything
  Options options = CurrentOptions();
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
}

}  // namespace ROCKSDB_NAMESPACE

#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

@ -44,6 +44,7 @@ SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
  manifest_sync_error_.store(false, std::memory_order_release);
  manifest_write_error_.store(false, std::memory_order_release);
  log_write_error_.store(false, std::memory_order_release);
  no_file_overwrite_.store(false, std::memory_order_release);
  random_file_open_counter_.store(0, std::memory_order_relaxed);
  delete_count_.store(0, std::memory_order_relaxed);
  num_open_wal_file_.store(0);
@ -487,6 +488,7 @@ Options DBTestBase::GetOptions(
    }
    case kFIFOCompaction: {
      options.compaction_style = kCompactionStyleFIFO;
      options.max_open_files = -1;
      break;
    }
    case kBlockBasedTableWithPrefixHashIndex: {

@ -378,14 +378,20 @@ class SpecialEnv : public EnvWrapper {
      return Append(data);
    }
    Status Truncate(uint64_t size) override { return base_->Truncate(size); }
    void PrepareWrite(size_t offset, size_t len) override {
      base_->PrepareWrite(offset, len);
    }
    void SetPreallocationBlockSize(size_t size) override {
      base_->SetPreallocationBlockSize(size);
    }
    Status Close() override {
      // SyncPoint is not supported in Released Windows Mode.
#if !(defined NDEBUG) || !defined(OS_WIN)
      // Check preallocation size
      // preallocation size is never passed to base file.
      size_t preallocation_size = preallocation_block_size();
      size_t block_size, last_allocated_block;
      base_->GetPreallocationStatus(&block_size, &last_allocated_block);
      TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
                               &preallocation_size);
                               &block_size);
#endif  // !(defined NDEBUG) || !defined(OS_WIN)

      return base_->Close();
@ -393,6 +399,10 @@ class SpecialEnv : public EnvWrapper {
    Status Flush() override { return base_->Flush(); }
    Status Sync() override {
      ++env_->sync_counter_;
      if (env_->corrupt_in_sync_) {
        Append(std::string(33000, ' '));
        return Status::IOError("Ingested Sync Failure");
      }
      if (env_->skip_fsync_) {
        return Status::OK();
      } else {
@ -440,6 +450,11 @@ class SpecialEnv : public EnvWrapper {
    std::unique_ptr<WritableFile> base_;
  };

  if (no_file_overwrite_.load(std::memory_order_acquire) &&
      target()->FileExists(f).ok()) {
    return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true.");
  }

  if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
    uint32_t random_number;
    {
@ -687,6 +702,9 @@ class SpecialEnv : public EnvWrapper {
  // Slow down every log write, in micro-seconds.
  std::atomic<int> log_write_slowdown_;

  // If true, returns Status::NotSupported for file overwrites.
  std::atomic<bool> no_file_overwrite_;

  // Number of WAL files that are still open for write.
  std::atomic<int> num_open_wal_file_;

@ -709,6 +727,9 @@ class SpecialEnv : public EnvWrapper {
  // If true, all fsyncs to files and directories are skipped.
  bool skip_fsync_ = false;

  // If true, corruption is injected into the file during sync.
  bool corrupt_in_sync_ = false;

  std::atomic<uint32_t> non_writeable_rate_;

  std::atomic<uint32_t> new_writable_count_;
@ -761,6 +782,17 @@ class OnFileDeletionListener : public EventListener {
  size_t matched_count_;
  std::string expected_file_name_;
};

class FlushCounterListener : public EventListener {
 public:
  std::atomic<int> count{0};
  std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};

  void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
    count++;
    ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason);
  }
};
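
FlushCounterListener is the shared replacement for the ad-hoc flush listeners in the test hunks above, and each of them wires it up the same way. A condensed sketch of the pattern (using DBTestBase fixture helpers such as CurrentOptions() and Reopen()):

auto flush_listener = std::make_shared<FlushCounterListener>();
flush_listener->expected_flush_reason = FlushReason::kManualFlush;

Options options = CurrentOptions();
options.listeners.push_back(flush_listener);
Reopen(options);

ASSERT_OK(Put("k", "v"));
ASSERT_OK(Flush());  // fires OnFlushBegin with FlushReason::kManualFlush
ASSERT_EQ(1, flush_listener->count.load());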
#endif

// A test merge operator mimics put but also fails if one of merge operands is

@ -24,13 +24,37 @@ class DBWALTestBase : public DBTestBase {

#if defined(ROCKSDB_PLATFORM_POSIX)
 public:
#if defined(ROCKSDB_FALLOCATE_PRESENT)
  bool IsFallocateSupported() {
    // Test fallocate support of the running file system.
    // Skip this test if fallocate is not supported.
    std::string fname_test_fallocate = dbname_ + "/preallocate_testfile";
    int fd = -1;
    do {
      fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
    } while (fd < 0 && errno == EINTR);
    assert(fd > 0);
    int alloc_status = fallocate(fd, 0, 0, 1);
    int err_number = errno;
    close(fd);
    assert(env_->DeleteFile(fname_test_fallocate) == Status::OK());
    if (err_number == ENOSYS || err_number == EOPNOTSUPP) {
      fprintf(stderr, "Skipped preallocated space check: %s\n",
              errnoStr(err_number).c_str());
      return false;
    }
    assert(alloc_status == 0);
    return true;
  }
#endif  // ROCKSDB_FALLOCATE_PRESENT

  uint64_t GetAllocatedFileSize(std::string file_name) {
    struct stat sbuf;
    int err = stat(file_name.c_str(), &sbuf);
    assert(err == 0);
    return sbuf.st_blocks * 512;
  }
#endif
#endif  // ROCKSDB_PLATFORM_POSIX
};
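
GetAllocatedFileSize() measures physical allocation (POSIX defines st_blocks in 512-byte units), whereas the Env reports logical length; the truncation tests below compare the two. A small usage sketch inside a test body (wal_path is a hypothetical file path, not from this diff):

uint64_t logical = 0;
ASSERT_OK(env_->GetFileSize(wal_path, &logical));     // bytes written so far
uint64_t allocated = GetAllocatedFileSize(wal_path);  // st_blocks * 512
// While fallocate'd preallocation is still attached, physical >= logical.
ASSERT_GE(allocated, logical);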

class DBWALTest : public DBWALTestBase {
@ -1777,19 +1801,8 @@ TEST_P(DBWALTestWithParamsVaryingRecoveryMode,
// avoid_flush_during_recovery=true.
// Flush should trigger if max_total_wal_size is reached.
TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
  class TestFlushListener : public EventListener {
   public:
    std::atomic<int> count{0};

    TestFlushListener() = default;

    void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
      count++;
      ASSERT_EQ(FlushReason::kWriteBufferManager, flush_job_info.flush_reason);
    }
  };
  std::shared_ptr<TestFlushListener> test_listener =
      std::make_shared<TestFlushListener>();
  auto test_listener = std::make_shared<FlushCounterListener>();
  test_listener->expected_flush_reason = FlushReason::kWalFull;

  constexpr size_t kKB = 1024;
  constexpr size_t kMB = 1024 * 1024;
@ -1849,23 +1862,9 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
    return;
  }
  // Test fallocate support of the running file system.
  // Skip this test if fallocate is not supported.
  std::string fname_test_fallocate = dbname_ + "/preallocate_testfile";
  int fd = -1;
  do {
    fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
  } while (fd < 0 && errno == EINTR);
  ASSERT_GT(fd, 0);
  int alloc_status = fallocate(fd, 0, 0, 1);
  int err_number = errno;
  close(fd);
  ASSERT_OK(options.env->DeleteFile(fname_test_fallocate));
  if (err_number == ENOSYS || err_number == EOPNOTSUPP) {
    fprintf(stderr, "Skipped preallocated space check: %s\n", strerror(err_number));
  if (!IsFallocateSupported()) {
    return;
  }
  ASSERT_EQ(0, alloc_status);

  DestroyAndReopen(options);
  size_t preallocated_size =
@ -1888,6 +1887,120 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
  ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
            preallocated_size);
}
// Tests that we will truncate the preallocated space of the last log from
// the previous run.
TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) {
  constexpr size_t kKB = 1024;
  Options options = CurrentOptions();
  options.env = env_;
  options.avoid_flush_during_recovery = false;
  options.avoid_flush_during_shutdown = true;
  if (mem_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
    return;
  }
  if (!IsFallocateSupported()) {
    return;
  }

  DestroyAndReopen(options);
  size_t preallocated_size =
      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
  ASSERT_OK(Put("foo", "v1"));
  VectorLogPtr log_files_before;
  ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
  ASSERT_EQ(1, log_files_before.size());
  auto& file_before = log_files_before[0];
  ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
  ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
            preallocated_size);
  // The log file has preallocated space.
  Close();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::PurgeObsoleteFiles:Begin",
        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  port::Thread reopen_thread([&]() { Reopen(options); });

  TEST_SYNC_POINT(
      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
  // After the flush during Open, the log file should get deleted. However,
  // if the process is in a crash loop, the log file may not get
  // deleted and the preallocated space will keep accumulating. So we need
  // to ensure it gets truncated.
  EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
            preallocated_size);
  TEST_SYNC_POINT(
      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
  reopen_thread.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) {
  Options options = CurrentOptions();
  options.env = env_;
  options.avoid_flush_during_recovery = false;
  if (mem_env_ || encrypted_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment");
    return;
  }
  if (!IsFallocateSupported()) {
    return;
  }

  DestroyAndReopen(options);
  size_t preallocated_size =
      dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
  Close();
  std::vector<std::string> filenames;
  std::string last_log;
  uint64_t last_log_num = 0;
  ASSERT_OK(env_->GetChildren(dbname_, &filenames));
  for (auto fname : filenames) {
    uint64_t number;
    FileType type;
    if (ParseFileName(fname, &number, &type, nullptr)) {
      if (type == kWalFile && number > last_log_num) {
        last_log = fname;
      }
    }
  }
  ASSERT_NE(last_log, "");
  last_log = dbname_ + '/' + last_log;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::PurgeObsoleteFiles:Begin",
        "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
       {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
        "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "PosixWritableFile::Close",
      [](void* arg) { *(reinterpret_cast<size_t*>(arg)) = 0; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // Preallocate space for the empty log file. This could happen if WAL data
  // was buffered in memory and the process crashed.
  std::unique_ptr<WritableFile> log_file;
  ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions()));
  log_file->SetPreallocationBlockSize(preallocated_size);
  log_file->PrepareWrite(0, 4096);
  log_file.reset();

  ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size);

  port::Thread reopen_thread([&]() { Reopen(options); });

  TEST_SYNC_POINT(
      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
  // The preallocated space should be truncated.
  EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size);
  TEST_SYNC_POINT(
      "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
  reopen_thread.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif  // ROCKSDB_FALLOCATE_PRESENT
#endif  // ROCKSDB_PLATFORM_POSIX

db/db_write_buffer_manager_test.cc (new file)
@ -0,0 +1,801 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/db_test_util.h"
#include "db/write_thread.h"
#include "port/stack_trace.h"

namespace ROCKSDB_NAMESPACE {

class DBWriteBufferManagerTest : public DBTestBase,
                                 public testing::WithParamInterface<bool> {
 public:
  DBWriteBufferManagerTest()
      : DBTestBase("/db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
  bool cost_cache_;
};
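
Every test below builds its WriteBufferManager with the third constructor argument (allow_stall) set to true, which is the stalling mode this new file exercises. A minimal sharing sketch (the sizes are illustrative):

// One manager means one shared memory budget for every DB referencing it.
auto wbm = std::make_shared<WriteBufferManager>(
    100000 /* buffer_size in bytes */, nullptr /* no block cache charging */,
    true /* allow_stall */);

Options opts1 = CurrentOptions();
Options opts2 = CurrentOptions();
opts1.write_buffer_manager = wbm;
opts2.write_buffer_manager = wbm;  // same instance => shared accounting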

TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }

  WriteOptions wo;
  wo.disableWAL = true;

  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  Flush(3);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  Flush(0);

  // Write to "Default", "cf2" and "cf3".
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));

  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write completed.

  // This makes sure the write will go through and, if a stall was in effect,
  // it will end.
  ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
}

// Test that a single DB's multiple writer threads get blocked when
// WriteBufferManager exceeds buffer_size_ and a flush is waiting to be
// finished.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  Flush(3);
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  Flush(0);

  // Write to "Default", "cf2" and "cf3". No flush will be triggered.
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));

  ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write completed.

  std::unordered_set<WriteThread::Writer*> w_set;
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  int num_writers = 4;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::atomic<int> thread_num(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        InstrumentedMutexLock lock(&mutex);
        wait_count_db++;
        cv.SignalAll();
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        InstrumentedMutexLock lock(&mutex);
        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
        w_set.insert(w);
        // Allow the flush to continue if all writer threads are blocked.
        if (w_set.size() == (unsigned long)num_writers) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s = true;

  std::function<void(int)> writer = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    Status tmp = Put(cf, Slice(key), DummyString(1), wo);
    InstrumentedMutexLock lock(&mutex);
    s = s && tmp.ok();
  };

  // Flow:
  // The main_writer thread will write but will be blocked (as the flush is on
  // hold and buffer_size_ has been exceeded, which puts a stall in effect).
  //          |
  //          |
  // Multiple writer threads will be created to write across multiple columns
  // and they will be blocked.
  //          |
  //          |
  // The last writer thread will write, and once it is blocked it will signal
  // the flush to continue, which clears the stall.

  threads.emplace_back(writer, 1);
  // Wait until the first thread (main_writer) writing to the DB is blocked,
  // and then create the multiple writers, which will be blocked from getting
  // added to the queue because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }
  for (int i = 0; i < num_writers; i++) {
    threads.emplace_back(writer, i % 4);
  }
  for (auto& t : threads) {
    t.join();
  }

  ASSERT_TRUE(s);

  // Number of DBs blocked.
  ASSERT_EQ(wait_count_db, 1);
  // Number of Writer threads blocked.
  ASSERT_EQ(w_set.size(), num_writers);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test that multiple DBs get blocked when the WriteBufferManager limit is
// exceeded and a flush is waiting to finish while the DBs try to write.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 3;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));

  // WriteBufferManager limit exceeded.
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          cv.Signal();
          // Since this is the last DB, signal the flush to continue.
          if (wait_count_db == num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s = true;

  // Write to DB.
  std::function<void(DB*)> write_db = [&](DB* db) {
    Status tmp = db->Put(wo, Key(3), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s = s && tmp.ok();
  };

  // Flow:
  // db_ will write and will be blocked (as the flush is on hold, which puts a
  // stall in effect).
  //          |
  // Multiple DB writers will be created to write to the other DBs and they
  // will be blocked.
  //          |
  //          |
  // The last writer will write, and once it is blocked it will signal the
  // flush to continue, which clears the stall.

  threads.emplace_back(write_db, db_);
  // Wait until the first DB is blocked and then create the multiple writers
  // for different DBs, which will be blocked from getting added to the queue
  // because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }
  for (int i = 0; i < num_dbs; i++) {
    threads.emplace_back(write_db, dbs[i]);
  }
  for (auto& t : threads) {
    t.join();
  }

  ASSERT_TRUE(s);
  ASSERT_EQ(num_dbs + 1, wait_count_db);
  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// Test that multiple threads writing across multiple DBs and multiple columns
// get blocked when a stall by WriteBufferManager is in effect.
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 3;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));

  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write to dbs[0] completed.
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::unordered_set<WriteThread::Writer*> w_set;
  std::vector<port::Thread> writer_threads;
  std::atomic<int> thread_num(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          thread_num.fetch_add(1);
          cv.Signal();
          // Allow the flush to continue if all writer threads are blocked.
          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
        {
          InstrumentedMutexLock lock(&mutex);
          w_set.insert(w);
          thread_num.fetch_add(1);
          // Allow the flush to continue if all writer threads are blocked.
          if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;
  // Write to multiple columns of db_.
  std::function<void(int)> write_cf = [&](int cf) {
    Status tmp = Put(cf, Key(3), DummyString(1), wo);
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };
  // Write to multiple DBs.
  std::function<void(DB*)> write_db = [&](DB* db) {
    Status tmp = db->Put(wo, Key(3), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s2 = s2 && tmp.ok();
  };

  // Flow:
  // A thread will write to db_ and will be blocked (as the flush is on hold
  // and buffer_size_ has been exceeded, which puts a stall in effect).
  //          |
  //          |
  // Multiple writer threads writing to different DBs and to db_ across
  // multiple columns will be created, and they will be blocked due to the
  // stall.
  //          |
  //          |
  // The last writer thread will write, and once it is blocked it will signal
  // the flush to continue, which clears the stall.
  threads.emplace_back(write_db, db_);
  // Wait until the first thread is blocked and then create the multiple
  // writer threads.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_dbs; i++) {
    // Write to multiple columns of db_.
    writer_threads.emplace_back(write_cf, i % 3);
    // Write to different dbs.
    threads.emplace_back(write_db, dbs[i]);
  }
  for (auto& t : threads) {
    t.join();
  }
  for (auto& t : writer_threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);

  // Number of DBs blocked.
  ASSERT_EQ(num_dbs + 1, wait_count_db);
  // Number of Writer threads blocked.
  ASSERT_EQ(w_set.size(), num_dbs);
  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
// Test multiple threads writing across multiple columns of db_ by passing
|
||||
// different values to WriteOption.no_slown_down.
|
||||
TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
|
||||
Options options = CurrentOptions();
|
||||
options.arena_block_size = 4096;
|
||||
options.write_buffer_size = 500000; // this is never hit
|
||||
std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
|
||||
ASSERT_LT(cache->GetUsage(), 256 * 1024);
|
||||
cost_cache_ = GetParam();
|
||||
|
||||
if (cost_cache_) {
|
||||
options.write_buffer_manager.reset(
|
||||
new WriteBufferManager(100000, cache, true));
|
||||
} else {
|
||||
options.write_buffer_manager.reset(
|
||||
new WriteBufferManager(100000, nullptr, true));
|
||||
}
|
||||
WriteOptions wo;
|
||||
wo.disableWAL = true;
|
||||
|
||||
CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
|
||||
|
||||
ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
|
||||
Flush(3);
|
||||
ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
|
||||
ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
|
||||
Flush(0);
|
||||
|
||||
// Write to "Default", "cf2" and "cf3". No flush will be triggered.
|
||||
ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
|
||||
ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
|
||||
ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
|
||||
ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
|
||||
|
||||
// WriteBufferManager::buffer_size_ has exceeded after the previous write to
|
||||
// db_ is completed.
|
||||
|
||||
std::unordered_set<WriteThread::Writer*> w_slowdown_set;
|
||||
std::vector<port::Thread> threads;
|
||||
int wait_count_db = 0;
|
||||
int num_writers = 4;
|
||||
InstrumentedMutex mutex;
|
||||
  InstrumentedCondVar cv(&mutex);
  std::atomic<int> thread_num(0);
  std::atomic<int> w_no_slowdown(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        {
          InstrumentedMutexLock lock(&mutex);
          wait_count_db++;
          cv.SignalAll();
        }
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        {
          InstrumentedMutexLock lock(&mutex);
          WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
          w_slowdown_set.insert(w);
          // Allow the flush to continue once all writer threads are blocked.
          if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load(
                                          std::memory_order_relaxed) ==
              (unsigned long)num_writers) {
            TEST_SYNC_POINT(
                "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
          }
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;

  std::function<void(int)> write_slow_down = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = false;
    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };

  std::function<void(int)> write_no_slow_down = [&](int cf) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = true;
    Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
    {
      InstrumentedMutexLock lock(&mutex);
      s2 = s2 && !tmp.ok();
      w_no_slowdown.fetch_add(1);
      // Allow the flush to continue once all writer threads are blocked.
      if (w_slowdown_set.size() +
              (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) ==
          (unsigned long)num_writers) {
        TEST_SYNC_POINT(
            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
      }
    }
  };

  // Flow:
  // The main_writer thread will write but will be blocked (Flush is on hold
  // and buffer_size_ has been exceeded, so a stall is in effect).
  //        |
  //        |
  // Multiple writer threads will be created to write across multiple column
  // families with different values of WriteOptions.no_slowdown. Some of them
  // will be blocked and some of them will return with an Incomplete status.
  //        |
  //        |
  // The last writer thread will write, and once it is blocked/returns it will
  // signal Flush to continue and clear the stall.
  // (A standalone sketch of this sync-point handshake follows this test.)
  threads.emplace_back(write_slow_down, 1);
  // Wait until the first thread (main_writer) writing to the DB is blocked,
  // then create the multiple writers, which will be blocked from getting added
  // to the queue because the stall is in effect.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_writers; i += 2) {
    threads.emplace_back(write_no_slow_down, (i) % 4);
    threads.emplace_back(write_slow_down, (i + 1) % 4);
  }
  for (auto& t : threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);
  // Number of DBs blocked.
  ASSERT_EQ(wait_count_db, 1);
  // Number of writer threads blocked.
  ASSERT_EQ(w_slowdown_set.size(), num_writers / 2);
  // Number of writer threads with WriteOptions.no_slowdown = true.
  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
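
The stall tests above coordinate three parties (the flush thread, the stalled writers, and the test body) entirely through RocksDB's SyncPoint facility. As a minimal sketch of that handshake, using only the SyncPoint calls already exercised above, here is the pattern in isolation; the point name "Test:AllWritersBlocked" is a hypothetical name for this sketch, while "DBImpl::BackgroundCallFlush:start" and "WriteThread::WriteStall::Wait" are the real sync points the tests use:

// Minimal sketch of the SyncPoint handshake pattern used by these tests.
// Effective only in builds where sync points are compiled in.
#include <atomic>

#include "test_util/sync_point.h"

using ROCKSDB_NAMESPACE::SyncPoint;

void WaitForWritersBeforeFlush(int num_writers) {
  std::atomic<int> blocked{0};
  // Flush may not pass "BackgroundCallFlush:start" until the test fires
  // "Test:AllWritersBlocked".
  SyncPoint::GetInstance()->LoadDependency(
      {{"Test:AllWritersBlocked", "DBImpl::BackgroundCallFlush:start"}});
  // Count writers as they hit the stall; release the flush on the last one.
  SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* /*arg*/) {
        if (blocked.fetch_add(1) + 1 == num_writers) {
          TEST_SYNC_POINT("Test:AllWritersBlocked");
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
  // ... run writer threads and join them here ...
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
}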

// Test multiple threads writing across multiple column families of db_ and
// different DBs by passing different values of WriteOptions.no_slowdown.
// (A configuration sketch of the shared WriteBufferManager follows this test.)
TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
  std::vector<std::string> dbnames;
  std::vector<DB*> dbs;
  int num_dbs = 4;

  for (int i = 0; i < num_dbs; i++) {
    dbs.push_back(nullptr);
    dbnames.push_back(
        test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
  }

  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  options.write_buffer_size = 500000;  // this is never hit
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);
  cost_cache_ = GetParam();

  if (cost_cache_) {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, cache, true));
  } else {
    options.write_buffer_manager.reset(
        new WriteBufferManager(100000, nullptr, true));
  }
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(DestroyDB(dbnames[i], options));
    ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
  }
  WriteOptions wo;
  wo.disableWAL = true;

  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
  }
  // Insert to db_.
  ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));

  // WriteBufferManager::buffer_size_ has been exceeded after the previous
  // write to dbs[0] is completed.
  std::vector<port::Thread> threads;
  int wait_count_db = 0;
  InstrumentedMutex mutex;
  InstrumentedCondVar cv(&mutex);
  std::unordered_set<WriteThread::Writer*> w_slowdown_set;
  std::vector<port::Thread> writer_threads;
  std::atomic<int> thread_num(0);
  std::atomic<int> w_no_slowdown(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
        "DBImpl::BackgroundCallFlush:start"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WBMStallInterface::BlockDB", [&](void*) {
        InstrumentedMutexLock lock(&mutex);
        wait_count_db++;
        cv.Signal();
        // Allow the flush to continue once all writer threads are blocked.
        if (w_slowdown_set.size() +
                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                                wait_count_db) ==
            (unsigned long)(2 * num_dbs + 1)) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "WriteThread::WriteStall::Wait", [&](void* arg) {
        WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
        InstrumentedMutexLock lock(&mutex);
        w_slowdown_set.insert(w);
        // Allow the flush to continue once all writer threads are blocked.
        if (w_slowdown_set.size() +
                (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                                wait_count_db) ==
            (unsigned long)(2 * num_dbs + 1)) {
          TEST_SYNC_POINT(
              "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  bool s1 = true, s2 = true;
  std::function<void(DB*)> write_slow_down = [&](DB* db) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = false;
    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
    InstrumentedMutexLock lock(&mutex);
    s1 = s1 && tmp.ok();
  };

  std::function<void(DB*)> write_no_slow_down = [&](DB* db) {
    int a = thread_num.fetch_add(1);
    std::string key = "foo" + std::to_string(a);
    WriteOptions write_op;
    write_op.no_slowdown = true;
    Status tmp = db->Put(write_op, Slice(key), DummyString(1));
    {
      InstrumentedMutexLock lock(&mutex);
      s2 = s2 && !tmp.ok();
      w_no_slowdown.fetch_add(1);
      if (w_slowdown_set.size() +
              (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
                              wait_count_db) ==
          (unsigned long)(2 * num_dbs + 1)) {
        TEST_SYNC_POINT(
            "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
      }
    }
  };

  // Flow:
  // The first thread will write but will be blocked (Flush is on hold and
  // buffer_size_ has been exceeded, so a stall is in effect).
  //        |
  //        |
  // Multiple writer threads will be created to write across multiple column
  // families of db_ and different DBs with different values of
  // WriteOptions.no_slowdown. Some of them will be blocked and some of them
  // will return with an Incomplete status.
  //        |
  //        |
  // The last writer thread will write, and once it is blocked/returns it will
  // signal Flush to continue and clear the stall.
  threads.emplace_back(write_slow_down, db_);
  // Wait until the first thread writing to the DB is blocked and then
  // create the multiple writers.
  {
    InstrumentedMutexLock lock(&mutex);
    while (wait_count_db != 1) {
      cv.Wait();
    }
  }

  for (int i = 0; i < num_dbs; i += 2) {
    // Write to multiple column families of db_.
    writer_threads.emplace_back(write_slow_down, db_);
    writer_threads.emplace_back(write_no_slow_down, db_);
    // Write to different DBs.
    threads.emplace_back(write_slow_down, dbs[i]);
    threads.emplace_back(write_no_slow_down, dbs[i + 1]);
  }

  for (auto& t : threads) {
    t.join();
  }

  for (auto& t : writer_threads) {
    t.join();
  }

  ASSERT_TRUE(s1);
  ASSERT_TRUE(s2);
  // Number of DBs blocked.
  ASSERT_EQ((num_dbs / 2) + 1, wait_count_db);
  // Number of writer threads writing to db_ blocked from getting added to the
  // queue.
  ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2);
  // Number of threads with WriteOptions.no_slowdown = true.
  ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs);

  // Clean up DBs.
  for (int i = 0; i < num_dbs; i++) {
    ASSERT_OK(dbs[i]->Close());
    ASSERT_OK(DestroyDB(dbnames[i], options));
    delete dbs[i];
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
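
Both tests rely on a single WriteBufferManager shared by several DB instances, so that one manager-wide memory budget stalls writers everywhere. As a minimal configuration sketch mirroring the constructor used above (the third argument enables stalling, charging the block cache is optional; the paths here are hypothetical and error handling is elided):

// Minimal sketch: one WriteBufferManager shared across two DBs, as in the
// tests above. Constructor arguments follow the
// (buffer_size, cache, allow_stall) form exercised by these tests.
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/write_buffer_manager.h"

using namespace ROCKSDB_NAMESPACE;

void OpenTwoDbsWithSharedBudget(DB** db1, DB** db2) {
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024);
  Options options;
  options.create_if_missing = true;
  // 100 KB shared across all memtables of both DBs; memtable memory is also
  // charged against the block cache, and writers may stall when full.
  options.write_buffer_manager.reset(
      new WriteBufferManager(100000, cache, true /* allow stalls */));
  Status s1 = DB::Open(options, "/tmp/wbm_db1", db1);  // hypothetical path
  Status s2 = DB::Open(options, "/tmp/wbm_db2", db2);  // hypothetical path
  (void)s1;
  (void)s2;
  // A writer that must never block can opt out; as the tests assert, such a
  // write returns a non-OK (Incomplete) status while the stall is in effect.
  WriteOptions wo;
  wo.no_slowdown = true;
}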

INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
                        testing::Bool());

}  // namespace ROCKSDB_NAMESPACE

#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
extern "C" {
void RegisterCustomObjects(int argc, char** argv);
}
#else
void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
#endif  // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}

@ -616,7 +616,7 @@ class IterKey {
  void EnlargeBuffer(size_t key_size);
};

// Convert from a SliceTranform of user keys, to a SliceTransform of
// Convert from a SliceTransform of user keys, to a SliceTransform of
// user keys.
class InternalKeySliceTransform : public SliceTransform {
 public:

@ -103,7 +103,7 @@ class ErrorHandler {
  bool auto_recovery_;
  bool recovery_in_prog_;
  // A flag to indicate that for the soft error, we should not allow any
  // backrgound work execpt the work is from recovery.
  // background work except the work is from recovery.
  bool soft_error_no_bg_work_;

  // Used to store the context for recover, such as flush reason.

@ -216,7 +216,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -242,7 +242,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -256,7 +256,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -292,7 +292,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -306,7 +306,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -320,7 +320,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -340,7 +340,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  s = dbfull()->Resume();
@ -649,7 +649,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
@ -695,7 +695,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
@ -1698,7 +1698,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
// to soft error and trigger auto resume. During auto resume, SwitchMemtable
// is disabled to avoid small SST tables. Write can still be applied before
// the bg error is cleaned unless the memtable is full.
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
  // Activate the FS before the first resume
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
@ -1744,14 +1744,14 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_COUNT));
  ASSERT_EQ(2, options.statistics->getAndResetTickerCount(
  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
  ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
  HistogramData autoresume_retry;
  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
                                    &autoresume_retry);
  ASSERT_EQ(autoresume_retry.max, 2);
  ASSERT_GE(autoresume_retry.max, 0);
  ASSERT_OK(Put(Key(2), "val2", wo));
  s = Flush();
  // Since auto resume fails, the bg error is not cleaned, flush will
@ -1768,7 +1768,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
  // Activate the FS before the first resume
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
@ -1810,14 +1810,14 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
                   ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_COUNT));
  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
  ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
  ASSERT_LE(0, options.statistics->getAndResetTickerCount(
                   ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
  HistogramData autoresume_retry;
  options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
                                    &autoresume_retry);
  ASSERT_EQ(autoresume_retry.max, 1);
  ASSERT_GE(autoresume_retry.max, 0);
  ASSERT_OK(Put(Key(2), "val2", wo));
  s = Flush();
  // Since auto resume is successful, the bg error is cleaned, flush will
@ -1827,56 +1827,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
  Destroy(options);
}
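
The auto-recovery tests above and below are driven by two options that appear throughout this diff. As a minimal sketch of enabling bounded auto-resume after a retryable background error (the option names are the ones exercised by the tests; the values are illustrative only):

// Minimal sketch: bound automatic recovery from retryable background errors.
#include "rocksdb/options.h"

using ROCKSDB_NAMESPACE::Options;

Options MakeAutoResumeOptions() {
  Options options;
  options.create_if_missing = true;
  // Retry recovery at most twice after a retryable background IO error...
  options.max_bgerror_resume_count = 2;
  // ...waiting 100000 microseconds (0.1 second) between attempts.
  options.bgerror_resume_retry_interval = 100000;
  return options;
}

If every retry fails, the background error stays set and writes keep failing until the application calls DB::Resume() itself, which is the behavior the "fail all the resumes and let the user resume" test below checks.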

TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover1) {
  // Fail the first resume and make the second resume successful
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
  Options options = GetDefaultOptions();
  options.env = fault_env_.get();
  options.create_if_missing = true;
  options.listeners.emplace_back(listener);
  options.max_bgerror_resume_count = 2;
  options.bgerror_resume_retry_interval = 100000;  // 0.1 second
  Status s;

  listener->EnableAutoRecovery(false);
  DestroyAndReopen(options);

  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
  error_msg.SetRetryable(true);

  ASSERT_OK(Put(Key(1), "val1"));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"RecoverFromRetryableBGIOError:BeforeWait0",
        "FLushWritRetryableeErrorAutoRecover1:0"},
       {"FLushWritRetryableeErrorAutoRecover1:1",
        "RecoverFromRetryableBGIOError:BeforeWait1"},
       {"RecoverFromRetryableBGIOError:RecoverSuccess",
        "FLushWritRetryableeErrorAutoRecover1:2"}});
  SyncPoint::GetInstance()->SetCallBack(
      "BuildTable:BeforeFinishBuildTable",
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:0");
  fault_fs_->SetFilesystemActive(true);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:1");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:2");
  SyncPoint::GetInstance()->DisableProcessing();

  ASSERT_EQ("val1", Get(Key(1)));
  Reopen(options);
  ASSERT_EQ("val1", Get(Key(1)));
  ASSERT_OK(Put(Key(2), "val2"));
  ASSERT_OK(Flush());
  ASSERT_EQ("val2", Get(Key(2)));

  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) {
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
  // Activate the FS before the first resume
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
@ -1901,7 +1852,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) {

  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  ASSERT_EQ(listener->WaitForRecovery(5000000), true);
@ -1916,7 +1867,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) {
  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) {
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
  // Fail all the resumes and let the user resume
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
@ -1936,18 +1887,18 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) {

  ASSERT_OK(Put(Key(1), "val1"));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"FLushWritRetryableeErrorAutoRecover3:0",
      {{"FLushWritRetryableeErrorAutoRecover2:0",
        "RecoverFromRetryableBGIOError:BeforeStart"},
       {"RecoverFromRetryableBGIOError:LoopOut",
        "FLushWritRetryableeErrorAutoRecover3:1"}});
        "FLushWritRetryableeErrorAutoRecover2:1"}});
  SyncPoint::GetInstance()->SetCallBack(
      "BuildTable:BeforeFinishBuildTable",
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:0");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:1");
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1");
  fault_fs_->SetFilesystemActive(true);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
@ -1965,173 +1916,6 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) {
  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover4) {
  // Fail the first resume and do not resume a second time because
  // the IO error severity is Fatal Error and not Retryable.
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
  Options options = GetDefaultOptions();
  options.env = fault_env_.get();
  options.create_if_missing = true;
  options.listeners.emplace_back(listener);
  options.max_bgerror_resume_count = 2;
  options.bgerror_resume_retry_interval = 10;  // 0.1 second
  Status s;

  listener->EnableAutoRecovery(false);
  DestroyAndReopen(options);

  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
  error_msg.SetRetryable(true);
  IOStatus nr_msg = IOStatus::IOError("No Retryable Fatal IO Error");
  nr_msg.SetRetryable(false);

  ASSERT_OK(Put(Key(1), "val1"));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"RecoverFromRetryableBGIOError:BeforeStart",
        "FLushWritRetryableeErrorAutoRecover4:0"},
       {"FLushWritRetryableeErrorAutoRecover4:2",
        "RecoverFromRetryableBGIOError:RecoverFail0"}});
  SyncPoint::GetInstance()->SetCallBack(
      "BuildTable:BeforeFinishBuildTable",
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->SetCallBack(
      "RecoverFromRetryableBGIOError:BeforeResume1",
      [&](void*) { fault_fs_->SetFilesystemActive(false, nr_msg); });

  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:0");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:2");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);
  // Even though the FS is recovered, due to the Fatal Error in bg_error_ the
  // resume and flush will all fail.
  ASSERT_EQ("val1", Get(Key(1)));
  ASSERT_NOK(dbfull()->Resume());
  ASSERT_EQ("val1", Get(Key(1)));
  ASSERT_OK(Put(Key(2), "val2"));
  ASSERT_NOK(Flush());
  ASSERT_EQ("NOT_FOUND", Get(Key(2)));

  Reopen(options);
  ASSERT_EQ("val1", Get(Key(1)));
  ASSERT_OK(Put(Key(2), "val2"));
  ASSERT_OK(Flush());
  ASSERT_EQ("val2", Get(Key(2)));

  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover5) {
  // During the resume, call DB->Close, and make sure the resume thread exits
  // before close continues. Due to the shutdown, the resume is not successful
  // and the FS does not become active, so close status is still IO error
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
  Options options = GetDefaultOptions();
  options.env = fault_env_.get();
  options.create_if_missing = true;
  options.listeners.emplace_back(listener);
  options.max_bgerror_resume_count = 2;
  options.bgerror_resume_retry_interval = 10;  // 0.1 second
  Status s;

  listener->EnableAutoRecovery(false);
  DestroyAndReopen(options);

  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
  error_msg.SetRetryable(true);

  ASSERT_OK(Put(Key(1), "val1"));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"RecoverFromRetryableBGIOError:BeforeStart",
        "FLushWritRetryableeErrorAutoRecover5:0"}});
  SyncPoint::GetInstance()->SetCallBack(
      "BuildTable:BeforeFinishBuildTable",
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover5:0");
  // The first resume will cause recovery_error and its severity is the
  // Fatal error
  s = dbfull()->Close();
  ASSERT_NOK(s);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  SyncPoint::GetInstance()->DisableProcessing();
  fault_fs_->SetFilesystemActive(true);

  Reopen(options);
  ASSERT_NE("val1", Get(Key(1)));
  ASSERT_OK(Put(Key(2), "val2"));
  s = Flush();
  ASSERT_OK(s);
  ASSERT_EQ("val2", Get(Key(2)));

  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover6) {
  // During the resume, call DB->Close, and make sure the resume thread exits
  // before close continues. Due to the shutdown, the resume is not successful
  // and the FS does not become active, so close status is still IO error
  std::shared_ptr<ErrorHandlerFSListener> listener(
      new ErrorHandlerFSListener());
  Options options = GetDefaultOptions();
  options.env = fault_env_.get();
  options.create_if_missing = true;
  options.listeners.emplace_back(listener);
  options.max_bgerror_resume_count = 2;
  options.bgerror_resume_retry_interval = 10;  // 0.1 second
  Status s;

  listener->EnableAutoRecovery(false);
  DestroyAndReopen(options);

  IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
  error_msg.SetRetryable(true);

  ASSERT_OK(Put(Key(1), "val1"));
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"FLushWritRetryableeErrorAutoRecover6:0",
        "RecoverFromRetryableBGIOError:BeforeStart"},
       {"RecoverFromRetryableBGIOError:BeforeWait0",
        "FLushWritRetryableeErrorAutoRecover6:1"},
       {"FLushWritRetryableeErrorAutoRecover6:2",
        "RecoverFromRetryableBGIOError:BeforeWait1"},
       {"RecoverFromRetryableBGIOError:AfterWait0",
        "FLushWritRetryableeErrorAutoRecover6:3"}});
  SyncPoint::GetInstance()->SetCallBack(
      "BuildTable:BeforeFinishBuildTable",
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:0");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:1");
  fault_fs_->SetFilesystemActive(true);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:2");
  TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:3");
  // The first resume will cause recovery_error and its severity is the
  // Fatal error
  s = dbfull()->Close();
  ASSERT_OK(s);
  SyncPoint::GetInstance()->DisableProcessing();

  Reopen(options);
  ASSERT_EQ("val1", Get(Key(1)));
  ASSERT_OK(Put(Key(2), "val2"));
  s = Flush();
  ASSERT_OK(s);
  ASSERT_EQ("val2", Get(Key(2)));

  Destroy(options);
}

TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
  // Fail the first resume and let the second resume be successful
  std::shared_ptr<ErrorHandlerFSListener> listener(
@ -2168,7 +1952,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
      [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
  SyncPoint::GetInstance()->EnableProcessing();
  s = Flush();
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
  TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0");
  fault_fs_->SetFilesystemActive(true);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();

@ -125,8 +125,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
        << table_properties.compression_options << "creation_time"
        << table_properties.creation_time << "oldest_key_time"
        << table_properties.oldest_key_time << "file_creation_time"
        << table_properties.file_creation_time << "db_id"
        << table_properties.db_id << "db_session_id"
        << table_properties.file_creation_time
        << "slow_compression_estimated_data_size"
        << table_properties.slow_compression_estimated_data_size
        << "fast_compression_estimated_data_size"
        << table_properties.fast_compression_estimated_data_size
        << "db_id" << table_properties.db_id << "db_session_id"
        << table_properties.db_session_id;

  // user collected properties
@ -1542,6 +1542,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
  ASSERT_EQ(2, NumTableFilesAtLevel(0));
}

TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) {
  // Repro https://github.com/facebook/rocksdb/issues/6245.
  // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction
  // via trivial move. The bug happened when L1 files were incorrectly sorted,
  // resulting in an old value for "k" returned by `Get()`.
  Options options = CurrentOptions();

  ASSERT_OK(Put("k", "a"));
  Flush();
  ASSERT_OK(Put("k", "a"));
  Flush();
  ASSERT_OK(Put("k", "a"));
  Flush();
  SstFileWriter sst_file_writer(EnvOptions(), options);

  // Current file size should be 0 after sst_file_writer init and before
  // opening a file.
  ASSERT_EQ(sst_file_writer.FileSize(), 0);

  std::string file1 = sst_files_dir_ + "file1.sst";
  ASSERT_OK(sst_file_writer.Open(file1));
  ASSERT_OK(sst_file_writer.Put("k", "b"));

  ExternalSstFileInfo file1_info;
  Status s = sst_file_writer.Finish(&file1_info);
  ASSERT_OK(s) << s.ToString();

  // Current file size should be non-zero after a successful write.
  ASSERT_GT(sst_file_writer.FileSize(), 0);

  IngestExternalFileOptions ifo;
  s = db_->IngestExternalFile({file1}, ifo);
  ASSERT_OK(s);
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_EQ(Get("k"), "b");
}

INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
                        testing::Values(std::make_tuple(true, true),
                                        std::make_tuple(true, false),
@ -40,16 +40,25 @@ Status ExternalSstFileIngestionJob::Prepare(
    if (!status.ok()) {
      return status;
    }
    files_to_ingest_.push_back(file_to_ingest);
  }

  for (const IngestedFileInfo& f : files_to_ingest_) {
    if (f.cf_id !=
    if (file_to_ingest.cf_id !=
            TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
        f.cf_id != cfd_->GetID()) {
        file_to_ingest.cf_id != cfd_->GetID()) {
      return Status::InvalidArgument(
          "External file column family id don't match");
    }

    if (file_to_ingest.num_entries == 0 &&
        file_to_ingest.num_range_deletions == 0) {
      return Status::InvalidArgument("File contain no entries");
    }

    if (!file_to_ingest.smallest_internal_key.Valid() ||
        !file_to_ingest.largest_internal_key.Valid()) {
      return Status::Corruption("Generated table have corrupted keys");
    }

    files_to_ingest_.emplace_back(std::move(file_to_ingest));
  }

  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
@ -83,16 +92,6 @@ Status ExternalSstFileIngestionJob::Prepare(
    return Status::NotSupported("Files have overlapping ranges");
  }

  for (IngestedFileInfo& f : files_to_ingest_) {
    if (f.num_entries == 0 && f.num_range_deletions == 0) {
      return Status::InvalidArgument("File contain no entries");
    }

    if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
      return Status::Corruption("Generated table have corrupted keys");
    }
  }

  // Copy/Move external files into DB
  std::unordered_set<size_t> ingestion_path_ids;
  for (IngestedFileInfo& f : files_to_ingest_) {
@ -368,9 +367,32 @@ Status ExternalSstFileIngestionJob::Run() {
        super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
        last_seqno, &f, &assigned_seqno);
  }

  // Modify the smallest/largest internal key to include the sequence number
  // that we just learned. Only overwrite sequence number zero. There could
  // be a nonzero sequence number already to indicate a range tombstone's
  // exclusive endpoint.
  ParsedInternalKey smallest_parsed, largest_parsed;
  if (status.ok()) {
    status = ParseInternalKey(*f.smallest_internal_key.rep(),
                              &smallest_parsed, false /* log_err_key */);
  }
  if (status.ok()) {
    status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
                              false /* log_err_key */);
  }
  if (!status.ok()) {
    return status;
  }
  if (smallest_parsed.sequence == 0) {
    UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
                      smallest_parsed.type);
  }
  if (largest_parsed.sequence == 0) {
    UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
                      largest_parsed.type);
  }

  status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
  TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
                           &assigned_seqno);
@ -16,6 +16,7 @@
#include "rocksdb/sst_file_writer.h"
#include "test_util/testutil.h"
#include "util/random.h"
#include "util/thread_guard.h"
#include "utilities/fault_injection_env.h"

namespace ROCKSDB_NAMESPACE {
@ -1305,38 +1306,38 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) {

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // While writing the MANIFEST start a thread that will ask for compaction
  Status bg_compact_status;
  ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() {
    bg_compact_status =
        db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
  });
  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");

  // Start a thread that will ingest a new file
  Status bg_addfile_status;
  ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() {
    file_keys = {1, 2, 3};
    bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
  });

  // Wait for AddFile to start picking levels and writing MANIFEST
  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
  {
    // While writing the MANIFEST start a thread that will ask for compaction
    ThreadGuard bg_compact(port::Thread([&]() {
      bg_compact_status =
          db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
    }));
    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");

  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
    // Start a thread that will ingest a new file
    ThreadGuard bg_addfile(port::Thread([&]() {
      file_keys = {1, 2, 3};
      bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
    }));

  // We need to verify that no compactions can run while AddFile is
  // ingesting the files into the levels it finds suitable. So we will
  // wait for 2 seconds to give a chance for compactions to run during
  // this period, and then make sure that no compactions were able to run
  env_->SleepForMicroseconds(1000000 * 2);
  ASSERT_FALSE(bg_compact_started.load());
    // Wait for AddFile to start picking levels and writing MANIFEST
    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");

  // Hold AddFile from finishing writing the MANIFEST
  TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");

  bg_addfile.join();
  bg_compact.join();
    // We need to verify that no compactions can run while AddFile is
    // ingesting the files into the levels it finds suitable. So we will
    // wait for 2 seconds to give a chance for compactions to run during
    // this period, and then make sure that no compactions were able to run
    env_->SleepForMicroseconds(1000000 * 2);
    ASSERT_FALSE(bg_compact_started.load());

    // Hold AddFile from finishing writing the MANIFEST
    TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
  }

  ASSERT_OK(bg_addfile_status);
  ASSERT_OK(bg_compact_status);
@ -75,6 +75,8 @@ const char* GetFlushReasonString (FlushReason flush_reason) {
      return "Manual Flush";
    case FlushReason::kErrorRecovery:
      return "Error Recovery";
    case FlushReason::kWalFull:
      return "WAL Full";
    default:
      return "Invalid";
  }
@ -411,8 +413,7 @@ Status FlushJob::WriteLevel0Table() {
        cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(),
        cfd_->GetID(), cfd_->GetName(), existing_snapshots_,
        earliest_write_conflict_snapshot_, snapshot_checker_,
        output_compression_, mutable_cf_options_.sample_for_compression,
        mutable_cf_options_.compression_opts,
        output_compression_, mutable_cf_options_.compression_opts,
        mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
        TableFileCreationReason::kFlush, &io_s, io_tracer_, event_logger_,
        job_context_->job_id, Env::IO_HIGH, &table_properties_, 0 /* level */,

@ -426,7 +426,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
    if (seek_to_first) {
      l0_iters_[i]->SeekToFirst();
    } else {
      // If the target key passes over the larget key, we are sure Next()
      // If the target key passes over the largest key, we are sure Next()
      // won't go over this file.
      if (user_comparator_->Compare(target_user_key,
                                    l0[i]->largest.user_key()) > 0) {
@ -751,21 +751,24 @@ bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
                                                Version* /*version*/) {
  // Current size of the active memtable
  *value = cfd_->mem()->ApproximateMemoryUsage();
  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
  *value = cfd_->mem()->ApproximateMemoryUsageFast();
  return true;
}

bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
                                              Version* /*version*/) {
  // Current size of the active memtable + immutable memtables
  *value = cfd_->mem()->ApproximateMemoryUsage() +
  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
           cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
  return true;
}

bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
                                           Version* /*version*/) {
  *value = cfd_->mem()->ApproximateMemoryUsage() +
  // Using ApproximateMemoryUsageFast to avoid the need for synchronization
  *value = cfd_->mem()->ApproximateMemoryUsageFast() +
           cfd_->imm()->ApproximateMemoryUsage();
  return true;
}
@ -958,7 +961,7 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,

bool InternalStats::HandleBlockCacheStat(Cache** block_cache) {
  assert(block_cache != nullptr);
  auto* table_factory = cfd_->ioptions()->table_factory;
  auto* table_factory = cfd_->ioptions()->table_factory.get();
  assert(table_factory != nullptr);
  *block_cache =
      table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());

@ -59,7 +59,7 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
      inplace_callback(ioptions.inplace_callback),
      max_successive_merges(mutable_cf_options.max_successive_merges),
      statistics(ioptions.statistics),
      merge_operator(ioptions.merge_operator),
      merge_operator(ioptions.merge_operator.get()),
      info_log(ioptions.info_log),
      allow_data_in_errors(ioptions.allow_data_in_errors) {}

@ -106,7 +106,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
      flush_state_(FLUSH_NOT_REQUESTED),
      clock_(ioptions.clock),
      insert_with_hint_prefix_extractor_(
          ioptions.memtable_insert_with_hint_prefix_extractor),
          ioptions.memtable_insert_with_hint_prefix_extractor.get()),
      oldest_key_time_(std::numeric_limits<uint64_t>::max()),
      atomic_flush_seqno_(kMaxSequenceNumber),
      approximate_memory_usage_(0) {

@ -72,7 +72,7 @@ using MultiGetRange = MultiGetContext::Range;
// Note: Many of the methods in this class have comments indicating that
// external synchronization is required as these methods are not thread-safe.
// It is up to higher layers of code to decide how to prevent concurrent
// invokation of these methods. This is usually done by acquiring either
// invocation of these methods. This is usually done by acquiring either
// the db mutex or the single writer thread.
//
// Some of these methods are documented to only require external
@ -139,7 +139,7 @@ class MemTable {
  // operations on the same MemTable (unless this Memtable is immutable).
  size_t ApproximateMemoryUsage();

  // As a cheap version of `ApproximateMemoryUsage()`, this function doens't
  // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
  // require external synchronization. The value may be less accurate though
  size_t ApproximateMemoryUsageFast() const {
    return approximate_memory_usage_.load(std::memory_order_relaxed);
@ -533,7 +533,7 @@ class MemTable {
  SequenceNumber atomic_flush_seqno_;

  // keep track of memory usage in table_, arena_, and range_del_table_.
  // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
  // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
  std::atomic<uint64_t> approximate_memory_usage_;

#ifndef ROCKSDB_LITE
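
The pattern behind ApproximateMemoryUsageFast above is worth calling out: the expensive, externally synchronized computation periodically publishes its result into a relaxed atomic, and cheap readers load it without locking, accepting a slightly stale value. A minimal sketch of the same idea (class and member names here are hypothetical):

// Minimal sketch of the cached-size pattern used by ApproximateMemoryUsageFast:
// the slow path refreshes a relaxed atomic; the fast path reads it lock-free.
#include <atomic>
#include <cstdint>

class SizeTracker {
 public:
  // Slow path: recompute under whatever synchronization the caller provides,
  // then publish the result for lock-free readers.
  uint64_t RecomputeMemoryUsage() {
    uint64_t usage = ComputeExactUsage();
    cached_usage_.store(usage, std::memory_order_relaxed);
    return usage;
  }
  // Fast path: no synchronization required; may lag behind the true value.
  uint64_t MemoryUsageFast() const {
    return cached_usage_.load(std::memory_order_relaxed);
  }

 private:
  uint64_t ComputeExactUsage() {
    // Placeholder for the expensive walk over arenas/tables that the real
    // ApproximateMemoryUsage() performs under external synchronization.
    return exact_usage_;
  }
  uint64_t exact_usage_ = 0;
  std::atomic<uint64_t> cached_usage_{0};
};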
@ -521,7 +521,7 @@ void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
  InstallNewVersion();
  // this method is used to move mutable memtable into an immutable list.
  // since mutable memtable is already refcounted by the DBImpl,
  // and when moving to the imutable list we don't unref it,
  // and when moving to the immutable list we don't unref it,
  // we don't have to ref the memtable here. we just take over the
  // reference from the DBImpl.
  current_->Add(m, to_delete);

@ -33,11 +33,13 @@ class OutputValidator {
    return GetHash() == other_validator.GetHash();
  }

 private:
  // Not (yet) intended to be persisted, so subject to change
  // without notice between releases.
  uint64_t GetHash() const { return paranoid_hash_; }

  void SetHash(uint64_t hash) { paranoid_hash_ = hash; }

 private:
  const InternalKeyComparator& icmp_;
  std::string prev_key_;
  uint64_t paranoid_hash_ = 0;
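
OutputValidator's check reduces an entire key-value stream to a single running hash, so two passes over the same data can be compared in O(1) space. A sketch of the general technique follows; the mixing function below is illustrative only and is not RocksDB's actual paranoid_hash_ computation:

// Minimal sketch of stream-hash validation: fold every (key, value) pair
// into one running hash, then compare the hashes of two passes.
#include <cstdint>
#include <functional>
#include <string>

class StreamValidator {
 public:
  void Add(const std::string& key, const std::string& value) {
    std::hash<std::string> h;
    // Combine the previous state with both key and value hashes (FNV-style
    // mixing, chosen arbitrarily for this sketch).
    hash_ = hash_ * 0x100000001b3ULL ^ h(key);
    hash_ = hash_ * 0x100000001b3ULL ^ h(value);
  }
  bool CompareValidator(const StreamValidator& other) const {
    return hash_ == other.hash_;
  }

 private:
  uint64_t hash_ = 0xcbf29ce484222325ULL;  // arbitrary nonzero seed
};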
@ -43,12 +43,12 @@ class TruncatedRangeDelIterator {

  void InternalNext();

  // Seeks to the tombstone with the highest viisble sequence number that covers
  // Seeks to the tombstone with the highest visible sequence number that covers
  // target (a user key). If no such tombstone exists, the position will be at
  // the earliest tombstone that ends after target.
  void Seek(const Slice& target);

  // Seeks to the tombstone with the highest viisble sequence number that covers
  // Seeks to the tombstone with the highest visible sequence number that covers
  // target (a user key). If no such tombstone exists, the position will be at
  // the latest tombstone that starts before target.
  void SeekForPrev(const Slice& target);

db/repair.cc
@ -447,13 +447,12 @@ class Repairer {
        nullptr /* blob_file_additions */, cfd->internal_comparator(),
        cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
        {}, kMaxSequenceNumber, snapshot_checker, kNoCompression,
        0 /* sample_for_compression */, CompressionOptions(), false,
        nullptr /* internal_stats */, TableFileCreationReason::kRecovery,
        &io_s, nullptr /*IOTracer*/, nullptr /* event_logger */,
        0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
        -1 /* level */, current_time, 0 /* oldest_key_time */, write_hint,
        0 /* file_creation_time */, "DB Repairer" /* db_id */,
        db_session_id_);
        CompressionOptions(), false, nullptr /* internal_stats */,
        TableFileCreationReason::kRecovery, &io_s, nullptr /*IOTracer*/,
        nullptr /* event_logger */, 0 /* job_id */, Env::IO_HIGH,
        nullptr /* table_properties */, -1 /* level */, current_time,
        0 /* oldest_key_time */, write_hint, 0 /* file_creation_time */,
        "DB Repairer" /* db_id */, db_session_id_);
    ROCKS_LOG_INFO(db_options_.info_log,
                   "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
                   log, counter, meta.fd.GetNumber(),

@ -23,7 +23,7 @@ class SnapshotImpl : public Snapshot {
  SequenceNumber number_;  // const after creation
  // It indicates the smallest uncommitted data at the time the snapshot was
  // taken. This is currently used by WritePrepared transactions to limit the
  // scope of queries to IsInSnpashot.
  // scope of queries to IsInSnapshot.
  SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;

  virtual SequenceNumber GetSequenceNumber() const override { return number_; }

@ -130,7 +130,7 @@ Status TableCache::GetTableReader(
        new RandomAccessFileReader(
            std::move(file), fname, ioptions_.clock, io_tracer_,
            record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS,
            file_read_hist, ioptions_.rate_limiter, ioptions_.listeners));
            file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners));
    s = ioptions_.table_factory->NewTableReader(
        ro,
        TableReaderOptions(ioptions_, prefix_extractor, file_options,

@ -183,7 +183,7 @@ class TableCache {

  Cache* get_cache() const { return cache_; }

  // Capacity of the backing Cache that indicates inifinite TableCache capacity.
  // Capacity of the backing Cache that indicates infinite TableCache capacity.
  // For example when max_open_files is -1 we set the backing Cache to this.
  static const int kInfiniteCapacity = 0x400000;
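
kInfiniteCapacity only matters when the table cache is effectively unbounded, which applications opt into through max_open_files. A minimal sketch of the two configurations (a hedged illustration of this one knob, not the full tuning story):

// Minimal sketch: max_open_files controls the table cache bound described
// above. -1 keeps every table reader open (the "infinite capacity" case);
// a positive value caps how many table readers are cached at once.
#include "rocksdb/options.h"

using ROCKSDB_NAMESPACE::Options;

Options MakeTableCacheOptions(bool unbounded) {
  Options options;
  if (unbounded) {
    options.max_open_files = -1;    // backing Cache set to kInfiniteCapacity
  } else {
    options.max_open_files = 5000;  // evict table readers beyond this count
  }
  return options;
}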
@ -43,10 +43,10 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
|
||||
}
|
||||
|
||||
void UserKeyTablePropertiesCollector::BlockAdd(
|
||||
uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast,
|
||||
uint64_t blockCompressedBytesSlow) {
|
||||
return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast,
|
||||
blockCompressedBytesSlow);
|
||||
uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast,
|
||||
uint64_t block_compressed_bytes_slow) {
|
||||
return collector_->BlockAdd(block_raw_bytes, block_compressed_bytes_fast,
|
||||
block_compressed_bytes_slow);
|
||||
}
|
||||
|
||||
Status UserKeyTablePropertiesCollector::Finish(
|
||||
|
@ -27,9 +27,9 @@ class IntTblPropCollector {
|
||||
virtual Status InternalAdd(const Slice& key, const Slice& value,
|
||||
uint64_t file_size) = 0;
|
||||
|
||||
virtual void BlockAdd(uint64_t blockRawBytes,
|
||||
uint64_t blockCompressedBytesFast,
|
||||
uint64_t blockCompressedBytesSlow) = 0;
|
||||
virtual void BlockAdd(uint64_t block_raw_bytes,
|
||||
uint64_t block_compressed_bytes_fast,
|
||||
uint64_t block_compressed_bytes_slow) = 0;
|
||||
|
||||
virtual UserCollectedProperties GetReadableProperties() const = 0;
|
||||
|
||||
@ -64,9 +64,9 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector {
|
||||
virtual Status InternalAdd(const Slice& key, const Slice& value,
|
||||
uint64_t file_size) override;
|
||||
|
||||
virtual void BlockAdd(uint64_t blockRawBytes,
|
||||
uint64_t blockCompressedBytesFast,
|
||||
uint64_t blockCompressedBytesSlow) override;
|
||||
virtual void BlockAdd(uint64_t block_raw_bytes,
|
||||
uint64_t block_compressed_bytes_fast,
|
||||
uint64_t block_compressed_bytes_slow) override;
|
||||
|
||||
virtual Status Finish(UserCollectedProperties* properties) override;
|
||||
|
||||
|
@ -55,8 +55,7 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
|
||||
builder->reset(NewTableBuilder(
|
||||
ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
|
||||
kTestColumnFamilyId, kTestColumnFamilyName, writable->get(),
|
||||
options.compression, options.sample_for_compression,
|
||||
options.compression_opts, unknown_level));
|
||||
options.compression, options.compression_opts, unknown_level));
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@ -176,9 +175,9 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void BlockAdd(uint64_t /* blockRawBytes */,
|
||||
uint64_t /* blockCompressedBytesFast */,
|
||||
uint64_t /* blockCompressedBytesSlow */) override {
|
||||
void BlockAdd(uint64_t /* block_raw_bytes */,
|
||||
uint64_t /* block_compressed_bytes_fast */,
|
||||
uint64_t /* block_compressed_bytes_slow */) override {
|
||||
// Nothing to do.
|
||||
return;
|
||||
}
|
||||
|
@ -517,6 +517,28 @@ class VersionBuilder::Rep {
|
||||
return meta->oldest_blob_file_number;
|
||||
}
|
||||
|
||||
uint64_t GetMinOldestBlobFileNumber() const {
|
||||
uint64_t min_oldest_blob_file_num = std::numeric_limits<uint64_t>::max();
|
||||
for (int level = 0; level < num_levels_; ++level) {
|
||||
const auto& base_files = base_vstorage_->LevelFiles(level);
|
||||
for (const auto* fmeta : base_files) {
|
||||
assert(fmeta);
|
||||
min_oldest_blob_file_num =
|
||||
std::min(min_oldest_blob_file_num, fmeta->oldest_blob_file_number);
|
||||
}
|
||||
const auto& added_files = levels_[level].added_files;
|
||||
for (const auto& elem : added_files) {
|
||||
assert(elem.second);
|
||||
min_oldest_blob_file_num = std::min(
|
||||
min_oldest_blob_file_num, elem.second->oldest_blob_file_number);
|
||||
}
|
||||
}
|
||||
if (min_oldest_blob_file_num == std::numeric_limits<uint64_t>::max()) {
|
||||
min_oldest_blob_file_num = kInvalidBlobFileNumber;
|
||||
}
|
||||
return min_oldest_blob_file_num;
|
||||
}
|
||||
|
||||
Status ApplyFileDeletion(int level, uint64_t file_number) {
|
||||
assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
|
||||
|
||||
@ -834,7 +856,7 @@ class VersionBuilder::Rep {
|
||||
}
|
||||
}
|
||||
|
||||
// Save the current state in *v.
|
||||
// Save the current state in *vstorage.
|
||||
Status SaveTo(VersionStorageInfo* vstorage) {
|
||||
Status s = CheckConsistency(base_vstorage_);
|
||||
if (!s.ok()) {
|
||||
@ -1052,6 +1074,10 @@ Status VersionBuilder::LoadTableHandlers(
|
||||
is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
|
||||
}
|
||||
|
||||
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
|
||||
return rep_->GetMinOldestBlobFileNumber();
|
||||
}
|
||||
|
||||
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
|
||||
ColumnFamilyData* cfd)
|
||||
: version_builder_(new VersionBuilder(
|
||||
|
@ -44,6 +44,7 @@ class VersionBuilder {
|
||||
bool is_initial_load,
|
||||
const SliceTransform* prefix_extractor,
|
||||
size_t max_file_size_for_l0_meta_pin);
|
||||
uint64_t GetMinOldestBlobFileNumber() const;
|
||||
|
||||
private:
|
||||
class Rep;
|
||||
|
@ -74,7 +74,7 @@ enum NewFileCustomTag : uint32_t {
|
||||
kNeedCompaction = 2,
|
||||
// Since Manifest is not entirely forward-compatible, we currently encode
|
||||
// kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
|
||||
// when manifest becomes forward-comptabile.
|
||||
// when manifest becomes forward-compatible.
|
||||
kMinLogNumberToKeepHack = 3,
|
||||
kOldestBlobFileNumber = 4,
|
||||
kOldestAncesterTime = 5,
|
||||
@ -195,7 +195,7 @@ struct FileMetaData {

// The file could be the compaction output from other SST files, which could
// in turn be outputs for compact older SST files. We track the memtable
// flush timestamp for the oldest SST file that eventaully contribute data
// flush timestamp for the oldest SST file that eventually contribute data
// to this file. 0 means the information is not available.
uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;

@ -11,6 +11,8 @@

#include <cinttypes>

#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_reader.h"
#include "monitoring/persistent_stats_history.h"

namespace ROCKSDB_NAMESPACE {
@ -129,14 +131,14 @@ Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
VersionEditHandler::VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_table_files_missing,
const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files)
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
bool skip_load_table_files)
: VersionEditHandlerBase(),
read_only_(read_only),
column_families_(std::move(column_families)),
version_set_(version_set),
track_missing_files_(track_missing_files),
no_error_if_table_files_missing_(no_error_if_table_files_missing),
no_error_if_files_missing_(no_error_if_files_missing),
io_tracer_(io_tracer),
skip_load_table_files_(skip_load_table_files),
initialized_(false) {
@ -301,6 +303,14 @@ bool VersionEditHandler::HasMissingFiles() const {
break;
}
}
if (!ret) {
for (const auto& elem : cf_to_missing_blob_files_high_) {
if (elem.second != kInvalidBlobFileNumber) {
ret = true;
break;
}
}
}
return ret;
}

@ -437,6 +447,8 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
if (track_missing_files_) {
cf_to_missing_files_.emplace(edit.column_family_,
std::unordered_set<uint64_t>());
cf_to_missing_blob_files_high_.emplace(edit.column_family_,
kInvalidBlobFileNumber);
}
return cfd;
}
@ -450,6 +462,12 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
assert(missing_files_iter != cf_to_missing_files_.end());
cf_to_missing_files_.erase(missing_files_iter);

auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(edit.column_family_);
assert(missing_blob_files_high_iter !=
cf_to_missing_blob_files_high_.end());
cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
}
ColumnFamilyData* ret =
version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
@ -505,8 +523,7 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
prefetch_index_and_filter_in_cache, is_initial_load,
cfd->GetLatestMutableCFOptions()->prefix_extractor.get(),
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
if ((s.IsPathNotFound() || s.IsCorruption()) &&
no_error_if_table_files_missing_) {
if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
s = Status::OK();
}
if (!s.ok() && !version_set_->db_options_->paranoid_checks) {
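The rename above also widens the semantics: with no_error_if_files_missing_ set, a PathNotFound or Corruption result from loading table files is converted to OK and the file is merely recorded as missing. A tiny standalone model of that masking pattern, with a hypothetical status enum standing in for rocksdb::Status:

// Hypothetical codes standing in for rocksdb::Status.
enum class Code { kOk, kPathNotFound, kCorruption, kIOError };

// Swallow "file absent or unreadable" when the caller opted in, e.g. a
// point-in-time reader inspecting a partially copied DB; other errors
// still propagate unchanged.
Code MaybeMaskMissingFileError(Code s, bool no_error_if_files_missing) {
  if ((s == Code::kPathNotFound || s == Code::kCorruption) &&
      no_error_if_files_missing) {
    return Code::kOk;  // caller records the file as missing instead of failing
  }
  return s;
}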
@ -536,9 +553,13 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
}
if (edit.has_comparator_ &&
edit.comparator_ != cfd->user_comparator()->Name()) {
s = Status::InvalidArgument(
cfd->user_comparator()->Name(),
"does not match existing comparator " + edit.comparator_);
if (!cf_to_cmp_names_) {
s = Status::InvalidArgument(
cfd->user_comparator()->Name(),
"does not match existing comparator " + edit.comparator_);
} else {
cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_);
}
}
if (edit.HasFullHistoryTsLow()) {
const std::string& new_ts = edit.GetFullHistoryTsLow();
@ -576,7 +597,7 @@ VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
: VersionEditHandler(read_only, column_families, version_set,
/*track_missing_files=*/true,
/*no_error_if_table_files_missing=*/true, io_tracer) {}
/*no_error_if_files_missing=*/true, io_tracer) {}

VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
for (const auto& elem : versions_) {
@ -626,7 +647,29 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
assert(missing_files_iter != cf_to_missing_files_.end());
std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
const bool prev_has_missing_files = !missing_files.empty();

auto missing_blob_files_high_iter =
cf_to_missing_blob_files_high_.find(cfd->GetID());
assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
const uint64_t prev_missing_blob_file_high =
missing_blob_files_high_iter->second;

VersionBuilder* builder = nullptr;

if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder != nullptr);
}

// At this point, we have not yet applied the new version edits read from the
// MANIFEST. We check whether we have any missing table and blob files.
const bool prev_has_missing_files =
!missing_files.empty() ||
(prev_missing_blob_file_high != kInvalidBlobFileNumber &&
prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());

for (const auto& file : edit.GetDeletedFiles()) {
uint64_t file_num = file.second;
auto fiter = missing_files.find(file_num);
@ -634,6 +677,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
missing_files.erase(fiter);
}
}

assert(!cfd->ioptions()->cf_paths.empty());
Status s;
for (const auto& elem : edit.GetNewFiles()) {
const FileMetaData& meta = elem.second;
@ -649,17 +694,60 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
break;
}
}

uint64_t missing_blob_file_num = prev_missing_blob_file_high;
for (const auto& elem : edit.GetBlobFileAdditions()) {
uint64_t file_num = elem.GetBlobFileNumber();
s = VerifyBlobFile(cfd, file_num, elem);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_blob_file_num = std::max(missing_blob_file_num, file_num);
s = Status::OK();
} else if (!s.ok()) {
break;
}
}

bool has_missing_blob_files = false;
if (missing_blob_file_num != kInvalidBlobFileNumber &&
missing_blob_file_num >= prev_missing_blob_file_high) {
missing_blob_files_high_iter->second = missing_blob_file_num;
has_missing_blob_files = true;
} else if (missing_blob_file_num < prev_missing_blob_file_high) {
assert(false);
}

// We still have not applied the new version edit, but have tried to add new
// table and blob files after verifying their presence and consistency.
// Therefore, we know whether we will see new missing table and blob files
// later after actually applying the version edit. We perform the check here
// and record the result.
const bool has_missing_files =
!missing_files.empty() || has_missing_blob_files;

bool missing_info = !version_edit_params_.has_log_number_ ||
!version_edit_params_.has_next_file_number_ ||
!version_edit_params_.has_last_sequence_;

// Create version before apply edit
// Create version before apply edit. The version will represent the state
// before applying the version edit.
// A new version will be created if:
// 1) no error has occurred so far, and
// 2) log_number_, next_file_number_ and last_sequence_ are known, and
// 3) any of the following:
// a) no missing file before, but will have missing file(s) after applying
// this version edit.
// b) no missing file after applying the version edit, and the caller
// explicitly requests that a new version be created.
if (s.ok() && !missing_info &&
((!missing_files.empty() && !prev_has_missing_files) ||
(missing_files.empty() && force_create_version))) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
auto* builder = builder_iter->second->version_builder();
((has_missing_files && !prev_has_missing_files) ||
(!has_missing_files && force_create_version))) {
if (!builder) {
auto builder_iter = builders_.find(cfd->GetID());
assert(builder_iter != builders_.end());
builder = builder_iter->second->version_builder();
assert(builder);
}

auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_,
version_set_->current_version_number_++);
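The hunk above tracks, per column family, the highest blob file number seen missing so far, and a missing blob file only matters while it is not older than the oldest blob file any live table still references. A compact standalone model of that decision; the map and sentinel are stand-ins for the members above, not RocksDB API (kInvalidBlobFileNumber is 0 in RocksDB):

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

constexpr uint64_t kInvalidBlobFileNumber = 0;  // mirrors the sentinel above

// Per-CF high watermark of blob files we failed to find.
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high;

// After probing this edit's blob additions, decide whether the CF still has
// missing blob files that matter, i.e. ones not older than the oldest blob
// file referenced by live SSTs (min_oldest_referenced).
bool HasMissingBlobFiles(uint32_t cf_id,
                         const std::vector<uint64_t>& missing_in_edit,
                         uint64_t min_oldest_referenced) {
  uint64_t& high = cf_to_missing_blob_files_high[cf_id];
  for (uint64_t file_num : missing_in_edit) {
    high = std::max(high, file_num);  // the watermark only moves up
  }
  return high != kInvalidBlobFileNumber && high >= min_oldest_referenced;
}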
@ -687,6 +775,22 @@ Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath,
return version_set_->VerifyFileMetadata(fpath, fmeta);
}

Status VersionEditHandlerPointInTime::VerifyBlobFile(
ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition) {
BlobFileCache* blob_file_cache = cfd->blob_file_cache();
assert(blob_file_cache);
CacheHandleGuard<BlobFileReader> blob_file_reader;
Status s =
blob_file_cache->GetBlobFileReader(blob_file_num, &blob_file_reader);
if (!s.ok()) {
return s;
}
// TODO: verify checksum
(void)blob_addition;
return s;
}

Status ManifestTailer::Initialize() {
if (Mode::kRecovery == mode_) {
return VersionEditHandler::Initialize();
@ -789,13 +893,21 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
fprintf(stdout, "%s\n", s->ToString().c_str());
return;
}
assert(cf_to_cmp_names_);
for (auto* cfd : *(version_set_->column_family_set_)) {
fprintf(stdout,
"--------------- Column family \"%s\"  (ID %" PRIu32
") --------------\n",
cfd->GetName().c_str(), cfd->GetID());
fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber());
fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
auto it = cf_to_cmp_names_->find(cfd->GetID());
if (it != cf_to_cmp_names_->end()) {
fprintf(stdout,
"comparator: <%s>, but the comparator object is not available.\n",
it->second.c_str());
} else {
fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
}
assert(cfd->current());
fprintf(stdout, "%s \n", cfd->current()->DebugString(hex_).c_str());
}
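A small model of the dump-side fallback above: when a MANIFEST names a comparator that is not linked into the running tool, the handler records the name instead of erroring, and the dumper prints the recorded name. Sketch with hypothetical types; the unordered_map stands in for cf_to_cmp_names_:

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

// Comparator names recorded from the MANIFEST when no matching comparator
// object exists in this process (e.g. a dump tool reading a foreign DB).
using CmpNames = std::unordered_map<uint32_t, std::string>;

void PrintComparator(const CmpNames& recorded, uint32_t cf_id,
                     const char* live_comparator_name) {
  auto it = recorded.find(cf_id);
  if (it != recorded.end()) {
    // The name came from the file; the object itself is unavailable.
    std::printf(
        "comparator: <%s>, but the comparator object is not available.\n",
        it->second.c_str());
  } else {
    std::printf("comparator: %s\n", live_comparator_name);
  }
}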
@ -97,7 +97,7 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
// 1. Create an object of VersionEditHandler or its subclasses.
//    VersionEditHandler handler(read_only, column_families, version_set,
//                               track_missing_files,
//                               no_error_if_table_files_missing);
//                               no_error_if_files_missing);
// 2. Status s = handler.Iterate(reader, &db_id);
// 3. Check s and handle possible errors.
//
@ -109,10 +109,10 @@ class VersionEditHandler : public VersionEditHandlerBase {
bool read_only,
const std::vector<ColumnFamilyDescriptor>& column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_table_files_missing,
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer)
: VersionEditHandler(read_only, column_families, version_set,
track_missing_files, no_error_if_table_files_missing,
track_missing_files, no_error_if_files_missing,
io_tracer, /*skip_load_table_files=*/false) {}

~VersionEditHandler() override {}
@ -133,7 +133,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
explicit VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_table_files_missing,
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);

Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
@ -183,10 +183,12 @@ class VersionEditHandler : public VersionEditHandlerBase {
const bool track_missing_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
cf_to_missing_files_;
bool no_error_if_table_files_missing_;
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
bool no_error_if_files_missing_;
std::shared_ptr<IOTracer> io_tracer_;
bool skip_load_table_files_;
bool initialized_;
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;

private:
Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
@ -213,6 +215,8 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
bool force_create_version) override;
virtual Status VerifyFile(const std::string& fpath,
const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition);

std::unordered_map<uint32_t, Version*> versions_;
};
@ -267,12 +271,14 @@ class DumpManifestHandler : public VersionEditHandler {
: VersionEditHandler(
/*read_only=*/true, column_families, version_set,
/*track_missing_files=*/false,
/*no_error_if_table_files_missing=*/false, io_tracer,
/*no_error_if_files_missing=*/false, io_tracer,
/*skip_load_table_files=*/true),
verbose_(verbose),
hex_(hex),
json_(json),
count_(0) {}
count_(0) {
cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
}

~DumpManifestHandler() override {}

@ -408,7 +408,7 @@ class FilePickerMultiGet {
int GetCurrentLevel() const { return curr_level_; }

// Iterates through files in the current level until it finds a file that
// contains atleast one key from the MultiGet batch
// contains at least one key from the MultiGet batch
bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
size_t* file_index, FdWithKeyRange** fd,
bool* is_last_key_in_file) {
@ -1768,8 +1768,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
: cfd_->ioptions()->statistics),
table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr),
merge_operator_((cfd_ == nullptr) ? nullptr
: cfd_->ioptions()->merge_operator),
merge_operator_(
(cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
storage_info_(
(cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
(cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
@ -2786,7 +2786,7 @@ struct Fsize {
FileMetaData* file;
};

// Compator that is used to sort files based on their size
// Comparator that is used to sort files based on their size
// In normal mode: descending size
bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
return (first.file->compensated_file_size >
@ -3206,7 +3206,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
// specified range. From that file, iterate backwards and
// forwards to find all overlapping files.
// if within_range is set, then only store the maximum clean inputs
// within range [begin, end]. "clean" means there is a boudnary
// within range [begin, end]. "clean" means there is a boundary
// between the files in "*inputs" and the surrounding files
void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
int level, const InternalKey* begin, const InternalKey* end,
@ -3517,7 +3517,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
// 1. the L0 size is larger than level size base, or
// 2. number of L0 files reaches twice the L0->L1 compaction trigger
// We don't do this otherwise to keep the LSM-tree structure stable
// unless the L0 compation is backlogged.
// unless the L0 compaction is backlogged.
base_level_size = l0_size;
if (base_level_ == num_levels_ - 1) {
level_multiplier_ = 1.0;
@ -4083,6 +4083,7 @@ Status VersionSet::ProcessManifestWrites(
uint64_t new_manifest_file_size = 0;
Status s;
IOStatus io_s;
IOStatus manifest_io_status;
{
FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
mu->Unlock();
@ -4134,6 +4135,7 @@ Status VersionSet::ProcessManifestWrites(
s = WriteCurrentStateToManifest(curr_state, wal_additions,
descriptor_log_.get(), io_s);
} else {
manifest_io_status = io_s;
s = io_s;
}
}
@ -4171,11 +4173,13 @@ Status VersionSet::ProcessManifestWrites(
io_s = descriptor_log_->AddRecord(record);
if (!io_s.ok()) {
s = io_s;
manifest_io_status = io_s;
break;
}
}
if (s.ok()) {
io_s = SyncManifest(db_options_, descriptor_log_->file());
manifest_io_status = io_s;
TEST_SYNC_POINT_CALLBACK(
"VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
}
@ -4188,6 +4192,9 @@ Status VersionSet::ProcessManifestWrites(

// If we just created a new descriptor file, install it by writing a
// new CURRENT file that points to it.
if (s.ok()) {
assert(manifest_io_status.ok());
}
if (s.ok() && new_descriptor_log) {
io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
db_directory);
@ -4303,11 +4310,41 @@ Status VersionSet::ProcessManifestWrites(
for (auto v : versions) {
delete v;
}
if (manifest_io_status.ok()) {
manifest_file_number_ = pending_manifest_file_number_;
manifest_file_size_ = new_manifest_file_size;
}
// If manifest append failed for whatever reason, the file could be
// corrupted. So we need to force the next version update to start a
// new manifest file.
descriptor_log_.reset();
if (new_descriptor_log) {
// If manifest operations failed, then we know the CURRENT file still
// points to the original MANIFEST. Therefore, we can safely delete the
// new MANIFEST.
// If manifest operations succeeded, and we are here, then it is possible
// that renaming tmp file to CURRENT failed.
//
// On local POSIX-compliant FS, the CURRENT must point to the original
// MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
// keep it. Future recovery will ignore this MANIFEST. It's also ok for the
// process not to crash and continue using the db. Any future LogAndApply()
// call will switch to a new MANIFEST and update CURRENT, still ignoring
// this one.
//
// On non-local FS, it is
// possible that the rename operation succeeded on the server (remote)
// side, but the client somehow returns a non-ok status to RocksDB. Note
// that this does not violate atomicity. Should we delete the new MANIFEST
// successfully, a subsequent recovery attempt will likely see the CURRENT
// pointing to the new MANIFEST, thus fail. We will not be able to open the
// DB again. Therefore, if manifest operations succeed, we should keep
// the new MANIFEST. If the process proceeds, any future LogAndApply() call
// will switch to a new MANIFEST and update CURRENT. If user tries to
// re-open the DB,
// a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
// b) CURRENT points to the original MANIFEST, and the original MANIFEST
// also exists.
if (new_descriptor_log && !manifest_io_status.ok()) {
ROCKS_LOG_INFO(db_options_->info_log,
"Deleting manifest %" PRIu64 " current manifest %" PRIu64
"\n",
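The long comment above reduces to a small decision rule: a newly created MANIFEST may be deleted only when the MANIFEST writes themselves failed, because then CURRENT provably still names the original file; if only the CURRENT install failed, a remote filesystem may have applied the rename anyway, so deleting the new file could strand recovery. A standalone sketch of that rule, a boolean model only, not the RocksDB code path:

// What to do with a newly created MANIFEST after LogAndApply fails.
enum class NewManifestAction { kNone, kDelete, kKeep };

NewManifestAction OnFailedLogAndApply(bool new_descriptor_log,
                                      bool manifest_io_ok) {
  if (!new_descriptor_log) {
    return NewManifestAction::kNone;  // we appended to the existing MANIFEST
  }
  // Manifest writes failed: CURRENT still names the original MANIFEST, so
  // the new one is unreachable and safe to delete.
  if (!manifest_io_ok) {
    return NewManifestAction::kDelete;
  }
  // Manifest writes succeeded but installing CURRENT reported failure. On a
  // non-local FS the rename may have happened anyway; keep the file so a
  // recovery that sees the new CURRENT can still open the DB.
  return NewManifestAction::kKeep;
}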
@ -4354,7 +4391,7 @@ Status VersionSet::ProcessManifestWrites(
return s;
}

// 'datas' is gramatically incorrect. We still use this notation to indicate
// 'datas' is grammatically incorrect. We still use this notation to indicate
// that this variable represents a collection of column_family_data.
Status VersionSet::LogAndApply(
const autovector<ColumnFamilyData*>& column_family_datas,
@ -4490,60 +4527,6 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
return builder ? builder->Apply(edit) : Status::OK();
}

Status VersionSet::ExtractInfoFromVersionEdit(
ColumnFamilyData* cfd, const VersionEdit& from_edit,
VersionEditParams* version_edit_params) {
if (cfd != nullptr) {
if (from_edit.has_db_id_) {
version_edit_params->SetDBId(from_edit.db_id_);
}
if (from_edit.has_log_number_) {
if (cfd->GetLogNumber() > from_edit.log_number_) {
ROCKS_LOG_WARN(
db_options_->info_log,
"MANIFEST corruption detected, but ignored - Log numbers in "
"records NOT monotonically increasing");
} else {
cfd->SetLogNumber(from_edit.log_number_);
version_edit_params->SetLogNumber(from_edit.log_number_);
}
}
if (from_edit.has_comparator_ &&
from_edit.comparator_ != cfd->user_comparator()->Name()) {
return Status::InvalidArgument(
cfd->user_comparator()->Name(),
"does not match existing comparator " + from_edit.comparator_);
}
if (from_edit.HasFullHistoryTsLow()) {
const std::string& new_ts = from_edit.GetFullHistoryTsLow();
cfd->SetFullHistoryTsLow(new_ts);
}
}

if (from_edit.has_prev_log_number_) {
version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_);
}

if (from_edit.has_next_file_number_) {
version_edit_params->SetNextFile(from_edit.next_file_number_);
}

if (from_edit.has_max_column_family_) {
version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_);
}

if (from_edit.has_min_log_number_to_keep_) {
version_edit_params->min_log_number_to_keep_ =
std::max(version_edit_params->min_log_number_to_keep_,
from_edit.min_log_number_to_keep_);
}

if (from_edit.has_last_sequence_) {
version_edit_params->SetLastSequence(from_edit.last_sequence_);
}
return Status::OK();
}

Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
FileSystem* fs,
std::string* manifest_path,
@ -4610,10 +4593,10 @@ Status VersionSet::Recover(
reporter.status = &log_read_status;
log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
true /* checksum */, 0 /* log_number */);
VersionEditHandler handler(
read_only, column_families, const_cast<VersionSet*>(this),
/*track_missing_files=*/false,
/*no_error_if_table_files_missing=*/false, io_tracer_);
VersionEditHandler handler(read_only, column_families,
const_cast<VersionSet*>(this),
/*track_missing_files=*/false,
/*no_error_if_files_missing=*/false, io_tracer_);
handler.Iterate(reader, &log_read_status);
s = handler.status();
if (s.ok()) {
@ -4796,7 +4779,7 @@ Status VersionSet::TryRecoverFromOneManifest(
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
const std::string& dbname,
FileSystem* fs) {
// these are just for performance reasons, not correcntes,
// these are just for performance reasons, not correctness,
// so we're fine using the defaults
FileOptions soptions;
// Read "CURRENT" file, which contains a pointer to the current manifest file
@ -4937,7 +4920,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
}

// Get the checksum information including the checksum and checksum function
// name of all SST files in VersionSet. Store the information in
// name of all SST and blob files in VersionSet. Store the information in
// FileChecksumList which contains a map from file number to its checksum info.
// If DB is not running, make sure call VersionSet::Recover() to load the file
// metadata from Manifest to VersionSet before calling this function.
@ -4954,6 +4937,7 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
if (cfd->IsDropped() || !cfd->initialized()) {
continue;
}
/* SST files */
for (int level = 0; level < cfd->NumberLevels(); level++) {
for (const auto& file :
cfd->current()->storage_info()->LevelFiles(level)) {
@ -4961,17 +4945,36 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
file->file_checksum,
file->file_checksum_func_name);
if (!s.ok()) {
break;
return s;
}
}
}

/* Blob files */
const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles();
for (const auto& pair : blob_files) {
const uint64_t blob_file_number = pair.first;
const auto& meta = pair.second;

assert(meta);
assert(blob_file_number == meta->GetBlobFileNumber());

std::string checksum_value = meta->GetChecksumValue();
std::string checksum_method = meta->GetChecksumMethod();
assert(checksum_value.empty() == checksum_method.empty());
if (meta->GetChecksumMethod().empty()) {
checksum_value = kUnknownFileChecksum;
checksum_method = kUnknownFileChecksumFuncName;
}

s = checksum_list->InsertOneFileChecksum(blob_file_number, checksum_value,
checksum_method);
if (!s.ok()) {
break;
return s;
}
}
if (!s.ok()) {
break;
}
}

return s;
}
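The loop above substitutes placeholder values when a blob file carries no checksum, and returns on the first insertion failure instead of breaking out of two nested loops. A condensed standalone model of the same flow; ChecksumList and BlobMeta here are hypothetical stand-ins for the RocksDB types, and a bool replaces Status:

#include <cstdint>
#include <map>
#include <string>
#include <utility>

// Placeholders used when a file was written without checksums enabled.
const std::string kUnknownFileChecksum = "Unknown";
const std::string kUnknownFileChecksumFuncName = "Unknown";

struct BlobMeta {
  std::string checksum_value;
  std::string checksum_method;
};

// Insert one entry per blob file, falling back to the placeholders; a real
// implementation would propagate a Status instead of a bool.
bool CollectBlobChecksums(
    const std::map<uint64_t, BlobMeta>& blob_files,
    std::map<uint64_t, std::pair<std::string, std::string>>* out) {
  for (const auto& [file_number, meta] : blob_files) {
    std::string value = meta.checksum_value;
    std::string method = meta.checksum_method;
    if (method.empty()) {  // no checksum recorded for this file
      value = kUnknownFileChecksum;
      method = kUnknownFileChecksumFuncName;
    }
    if (!out->emplace(file_number, std::make_pair(value, method)).second) {
      return false;  // duplicate file number: fail fast, as the code above does
    }
  }
  return true;
}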
@ -5499,20 +5502,6 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
"[%s] compaction output being applied to a different base version from"
" input version",
c->column_family_data()->GetName().c_str());

if (vstorage->compaction_style_ == kCompactionStyleLevel &&
c->start_level() == 0 && c->num_input_levels() > 2U) {
// We are doing a L0->base_level compaction. The assumption is if
// base level is not L1, levels from L1 to base_level - 1 is empty.
// This is ensured by having one compaction from L0 going on at the
// same time in level-based compaction. So that during the time, no
// compaction/flush can put files to those levels.
for (int l = c->start_level() + 1; l < c->output_level(); l++) {
if (vstorage->NumLevelFiles(l) != 0) {
return false;
}
}
}
}

for (size_t input = 0; input < c->num_input_levels(); ++input) {

@ -1331,10 +1331,6 @@ class VersionSet {
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
const VersionEdit* edit);

Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
const VersionEdit& from_edit,
VersionEditParams* version_edit_params);

Status VerifyFileMetadata(const std::string& fpath,
const FileMetaData& meta) const;

@ -2781,7 +2781,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
TableBuilderOptions(
immutable_cf_options_, mutable_cf_options_, *internal_comparator_,
&int_tbl_prop_collector_factories, kNoCompression,
/*_sample_for_compression=*/0, CompressionOptions(),
CompressionOptions(),
/*_skip_filters=*/false, info.column_family, info.level),
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
fwriter.get()));
@ -2793,11 +2793,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
ASSERT_OK(s);
ASSERT_NE(0, file_size);
FileMetaData meta;
meta = FileMetaData(file_num, /*file_path_id=*/0, file_size, ikey, ikey,
0, 0, false, 0, 0, 0, kUnknownFileChecksum,
kUnknownFileChecksumFuncName);
file_metas->emplace_back(meta);
file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
ikey, 0, 0, false, 0, 0, 0, kUnknownFileChecksum,
kUnknownFileChecksumFuncName);
}
}

@ -134,8 +134,8 @@ Status WalManager::GetUpdatesSince(
// b. get sorted non-empty archived logs
// c. delete what should be deleted
void WalManager::PurgeObsoleteWALFiles() {
bool const ttl_enabled = db_options_.wal_ttl_seconds > 0;
bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0;
bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0;
bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0;
if (!ttl_enabled && !size_limit_enabled) {
return;
}
@ -150,7 +150,7 @@ void WalManager::PurgeObsoleteWALFiles() {
}
uint64_t const now_seconds = static_cast<uint64_t>(current_time);
uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
? db_options_.wal_ttl_seconds / 2
? db_options_.WAL_ttl_seconds / 2
: kDefaultIntervalToDeleteObsoleteWAL;

if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
@ -185,7 +185,7 @@ void WalManager::PurgeObsoleteWALFiles() {
s.ToString().c_str());
continue;
}
if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) {
if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
/*force_fg=*/!wal_in_db_path_);
if (!s.ok()) {
@ -234,8 +234,8 @@ void WalManager::PurgeObsoleteWALFiles() {
return;
}

size_t const files_keep_num =
static_cast<size_t>(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size);
size_t const files_keep_num = static_cast<size_t>(
db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size);
if (log_files_num <= files_keep_num) {
return;
}

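The WAL purge hunks only rename options, but the retention math they touch is worth spelling out: purge passes run at most every TTL/2 seconds (or a fixed default when a size limit is involved), and under a size limit only enough newest files are kept to fit the byte budget. A standalone sketch with plain integers; the parameter names are illustrative:

#include <cstdint>

constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;  // seconds

// How long to wait between purge passes.
uint64_t TimeToCheck(uint64_t wal_ttl_seconds, bool size_limit_enabled) {
  const bool ttl_enabled = wal_ttl_seconds > 0;
  return (ttl_enabled && !size_limit_enabled)
             ? wal_ttl_seconds / 2
             : kDefaultIntervalToDeleteObsoleteWAL;
}

// Number of archived WALs to keep under a size budget, assuming a roughly
// uniform log_file_size; files older than the newest `keep` are deleted.
uint64_t FilesToKeep(uint64_t wal_size_limit_mb, uint64_t log_file_size) {
  return wal_size_limit_mb * 1024 * 1024 / log_file_size;
}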
@ -217,8 +217,8 @@ int CountRecords(TransactionLogIterator* iter) {
} // namespace

TEST_F(WalManagerTest, WALArchivalSizeLimit) {
db_options_.wal_ttl_seconds = 0;
db_options_.wal_size_limit_mb = 1000;
db_options_.WAL_ttl_seconds = 0;
db_options_.WAL_size_limit_MB = 1000;
Init();

// TEST : Create WalManager with huge size limit and no ttl.
@ -226,7 +226,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) {
// Count the archived log files that survived.
// Assert that all of them did.
// Change size limit. Re-open WalManager.
// Assert that archive is not greater than wal_size_limit_mb after
// Assert that archive is not greater than WAL_size_limit_MB after
// PurgeObsoleteWALFiles()
// Set ttl and time_to_check_ to small values. Re-open db.
// Assert that there are no archived logs left.
@ -238,14 +238,14 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) {
ListSpecificFiles(env_.get(), archive_dir, kWalFile);
ASSERT_EQ(log_files.size(), 20U);

db_options_.wal_size_limit_mb = 8;
db_options_.WAL_size_limit_MB = 8;
Reopen();
wal_manager_->PurgeObsoleteWALFiles();

uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024);
ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024);

db_options_.wal_ttl_seconds = 1;
db_options_.WAL_ttl_seconds = 1;
env_->FakeSleepForMicroseconds(2 * 1000 * 1000);
Reopen();
wal_manager_->PurgeObsoleteWALFiles();
@ -255,7 +255,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) {
}

TEST_F(WalManagerTest, WALArchivalTtl) {
db_options_.wal_ttl_seconds = 1000;
db_options_.WAL_ttl_seconds = 1000;
Init();

// TEST : Create WalManager with a ttl and no size limit.
@ -271,7 +271,7 @@ TEST_F(WalManagerTest, WALArchivalTtl) {
ListSpecificFiles(env_.get(), archive_dir, kWalFile);
ASSERT_GT(log_files.size(), 0U);

db_options_.wal_ttl_seconds = 1;
db_options_.WAL_ttl_seconds = 1;
env_->FakeSleepForMicroseconds(3 * 1000 * 1000);
Reopen();
wal_manager_->PurgeObsoleteWALFiles();

@ -241,6 +241,7 @@ bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
MutexLock lock(&stall_mu_);
writers = newest_writer->load(std::memory_order_relaxed);
if (writers == &write_stall_dummy_) {
TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
stall_cv_.Wait();
// Load newest_writers_ again since it may have changed
writers = newest_writer->load(std::memory_order_relaxed);
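The stall path above is the classic condition-variable pattern: after every wait, reload the shared state and re-test the predicate, because wakeups may be spurious or stale. A generic standalone sketch with std::condition_variable; names are unrelated to RocksDB, which uses its own port primitives:

#include <condition_variable>
#include <mutex>

std::mutex stall_mu;
std::condition_variable stall_cv;
bool write_stalled = false;  // stands in for newest_writer == &write_stall_dummy_

// Block the caller while writes are stalled; re-check after each wakeup.
void WaitWhileStalled() {
  std::unique_lock<std::mutex> lock(stall_mu);
  while (write_stalled) {   // always a loop, never a single wait
    stall_cv.wait(lock);    // may wake spuriously; predicate re-tested above
  }
}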
@ -30,7 +30,7 @@ enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e =
ROCKSDB_NAMESPACE::kCRC32c;
enum RepFactory FLAGS_rep_factory = kSkipList;
std::vector<double> sum_probs(100001);
int64_t zipf_sum_size = 100000;
constexpr int64_t zipf_sum_size = 100000;

namespace ROCKSDB_NAMESPACE {

@ -233,6 +233,15 @@ size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) {
return value_sz; // the size of the value set.
}

std::string NowNanosStr() {
uint64_t t = db_stress_env->NowNanos();
std::string ret;
PutFixed64(&ret, t);
return ret;
}

std::string GenerateTimestampForRead() { return NowNanosStr(); }

namespace {

class MyXXH64Checksum : public FileChecksumGenerator {
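NowNanosStr() packs the current nanosecond clock into an 8-byte string with PutFixed64; that fixed-width encoding is what the timestamp-aware comparator expects. A self-contained sketch of the same idea, with the fixed64 helper re-implemented for illustration and system_clock standing in for db_stress_env (an assumption, not the stress tool's env):

#include <chrono>
#include <cstdint>
#include <string>

// Little-endian fixed-width 64-bit encoding, equivalent in spirit to
// RocksDB's PutFixed64 on little-endian platforms.
void PutFixed64LE(std::string* dst, uint64_t v) {
  for (int i = 0; i < 8; ++i) {
    dst->push_back(static_cast<char>((v >> (8 * i)) & 0xff));
  }
}

std::string NowNanosStr() {
  const auto now = std::chrono::system_clock::now().time_since_epoch();
  const uint64_t nanos = static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::nanoseconds>(now).count());
  std::string ret;
  PutFixed64LE(&ret, nanos);  // always 8 bytes, matching -user_timestamp_size=8
  return ret;
}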
@ -260,9 +260,11 @@ DECLARE_bool(enable_compaction_filter);
DECLARE_bool(paranoid_file_checks);
DECLARE_uint64(batch_protection_bytes_per_key);

const long KB = 1024;
const int kRandomValueMaxFactor = 3;
const int kValueMaxLen = 100;
DECLARE_uint64(user_timestamp_size);

constexpr long KB = 1024;
constexpr int kRandomValueMaxFactor = 3;
constexpr int kValueMaxLen = 100;

// wrapped posix or hdfs environment
extern ROCKSDB_NAMESPACE::Env* db_stress_env;
@ -561,6 +563,9 @@ extern StressTest* CreateNonBatchedOpsStressTest();
extern void InitializeHotKeyGenerator(double alpha);
extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key);

extern std::string GenerateTimestampForRead();
extern std::string NowNanosStr();

std::shared_ptr<FileChecksumGenFactory> GetFileChecksumImpl(
const std::string& name);
} // namespace ROCKSDB_NAMESPACE

@ -804,4 +804,8 @@ DEFINE_string(file_checksum_impl, "none",
DEFINE_int32(write_fault_one_in, 0,
"On non-zero, enables fault injection on write");

DEFINE_uint64(user_timestamp_size, 0,
"Number of bytes for a user-defined timestamp. Currently, only "
"8-byte is supported");

#endif // GFLAGS

@ -418,6 +418,8 @@ struct ThreadState {
std::string value;
// optional state of all keys in the db
std::vector<bool>* key_vec;

std::string timestamp;
};
std::queue<std::pair<uint64_t, SnapshotState>> snapshot_queue;

@ -317,6 +317,11 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf,
}
ReadOptions ropt;
ropt.snapshot = snap_state.snapshot;
Slice ts;
if (!snap_state.timestamp.empty()) {
ts = snap_state.timestamp;
ropt.timestamp = &ts;
}
PinnableSlice exp_v(&snap_state.value);
exp_v.PinSelf();
PinnableSlice v;
@ -422,6 +427,13 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
}
} else {
if (!FLAGS_use_txn) {
std::string ts_str;
Slice ts;
if (FLAGS_user_timestamp_size > 0) {
ts_str = NowNanosStr();
ts = ts_str;
write_opts.timestamp = &ts;
}
s = db_->Put(write_opts, cfh, key, v);
} else {
#ifndef ROCKSDB_LITE
@ -564,10 +576,9 @@ void StressTest::OperateDb(ThreadState* thread) {
if (FLAGS_write_fault_one_in) {
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
std::vector<FileType> types;
types.push_back(FileType::kTableFile);
types.push_back(FileType::kDescriptorFile);
types.push_back(FileType::kCurrentFile);
std::vector<FileType> types = {FileType::kTableFile,
FileType::kDescriptorFile,
FileType::kCurrentFile};
fault_fs_guard->SetRandomWriteError(
thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, types);
}
@ -766,6 +777,20 @@ void StressTest::OperateDb(ThreadState* thread) {
}
}

// Assign timestamps if necessary.
std::string read_ts_str;
std::string write_ts_str;
Slice read_ts;
Slice write_ts;
if (ShouldAcquireMutexOnKey() && FLAGS_user_timestamp_size > 0) {
read_ts_str = GenerateTimestampForRead();
read_ts = read_ts_str;
read_opts.timestamp = &read_ts;
write_ts_str = NowNanosStr();
write_ts = write_ts_str;
write_opts.timestamp = &write_ts;
}

int prob_op = thread->rand.Uniform(100);
// Reset this in case we pick something other than a read op. We don't
// want to use a stale value when deciding at the beginning of the loop
@ -856,8 +881,16 @@ std::vector<std::string> StressTest::GetWhiteBoxKeys(ThreadState* thread,
std::vector<std::string> boundaries;
for (const LevelMetaData& lmd : cfmd.levels) {
for (const SstFileMetaData& sfmd : lmd.files) {
boundaries.push_back(sfmd.smallestkey);
boundaries.push_back(sfmd.largestkey);
// If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey
// have timestamps.
const auto& skey = sfmd.smallestkey;
const auto& lkey = sfmd.largestkey;
assert(skey.size() >= FLAGS_user_timestamp_size);
assert(lkey.size() >= FLAGS_user_timestamp_size);
boundaries.push_back(
skey.substr(0, skey.size() - FLAGS_user_timestamp_size));
boundaries.push_back(
lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size));
}
}
if (boundaries.empty()) {
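Internally, a timestamped user key is the raw key with the fixed-size timestamp appended, which is why the SST boundary keys above are trimmed by exactly FLAGS_user_timestamp_size bytes before reuse. A tiny standalone helper showing the convention (illustrative only):

#include <cassert>
#include <string>

// A timestamped user key is <raw key bytes><fixed-size timestamp bytes>.
// Strip the suffix to recover the raw key, e.g. before reusing SST boundary
// keys as seek targets that will get a fresh timestamp attached later.
std::string StripTimestamp(const std::string& key_with_ts, size_t ts_size) {
  assert(key_with_ts.size() >= ts_size);
  return key_with_ts.substr(0, key_with_ts.size() - ts_size);
}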
@ -1007,6 +1040,7 @@ Status StressTest::TestIterate(ThreadState* thread,
// iterators with the same set-up, and it doesn't hurt to check them
// to be equal.
ReadOptions cmp_ro;
cmp_ro.timestamp = readoptionscopy.timestamp;
cmp_ro.snapshot = snapshot;
cmp_ro.total_order_seek = true;
ColumnFamilyHandle* cmp_cfh =
@ -1126,21 +1160,25 @@ void StressTest::VerifyIterator(ThreadState* thread,
*diverged = true;
return;
} else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr &&
(options_.comparator->Compare(*ro.iterate_lower_bound, seek_key) >=
0 ||
(options_.comparator->CompareWithoutTimestamp(
*ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key,
/*b_has_ts=*/false) >= 0 ||
(ro.iterate_upper_bound != nullptr &&
options_.comparator->Compare(*ro.iterate_lower_bound,
*ro.iterate_upper_bound) >= 0))) {
options_.comparator->CompareWithoutTimestamp(
*ro.iterate_lower_bound, /*a_has_ts=*/false,
*ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) {
// Lower bound behavior is not well defined if it is larger than
// seek key or upper bound. Disable the check for now.
*diverged = true;
return;
} else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr &&
(options_.comparator->Compare(*ro.iterate_upper_bound, seek_key) <=
0 ||
(options_.comparator->CompareWithoutTimestamp(
*ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key,
/*b_has_ts=*/false) <= 0 ||
(ro.iterate_lower_bound != nullptr &&
options_.comparator->Compare(*ro.iterate_lower_bound,
*ro.iterate_upper_bound) >= 0))) {
options_.comparator->CompareWithoutTimestamp(
*ro.iterate_lower_bound, /*a_has_ts=*/false,
*ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) {
// Upper bound behavior is not well defined if it is smaller than
// seek key or lower bound. Disable the check for now.
*diverged = true;
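CompareWithoutTimestamp is the timestamp-aware analogue of Compare used throughout the hunk above: each operand may or may not carry a trailing timestamp, and ordering is defined on the raw key bytes only. A simplified standalone model for fixed 8-byte timestamps; this is a sketch of the semantics, not the Comparator interface itself:

#include <string>

constexpr size_t kTsSize = 8;  // matches -user_timestamp_size=8

// Compare two keys on raw-key bytes only, stripping the timestamp suffix
// from whichever operands carry one. Returns <0, 0, >0 like memcmp.
int CompareWithoutTimestamp(const std::string& a, bool a_has_ts,
                            const std::string& b, bool b_has_ts) {
  const std::string a_raw = a_has_ts ? a.substr(0, a.size() - kTsSize) : a;
  const std::string b_raw = b_has_ts ? b.substr(0, b.size() - kTsSize) : b;
  return a_raw.compare(b_raw);
}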
@ -1209,9 +1247,13 @@ void StressTest::VerifyIterator(ThreadState* thread,
if ((iter->Valid() && iter->key() != cmp_iter->key()) ||
(!iter->Valid() &&
(ro.iterate_upper_bound == nullptr ||
cmp->Compare(total_order_key, *ro.iterate_upper_bound) < 0) &&
cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false,
*ro.iterate_upper_bound,
/*b_has_ts=*/false) < 0) &&
(ro.iterate_lower_bound == nullptr ||
cmp->Compare(total_order_key, *ro.iterate_lower_bound) > 0))) {
cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false,
*ro.iterate_lower_bound,
/*b_has_ts=*/false) > 0))) {
fprintf(stderr,
"Iterator diverged from control iterator which"
" has value %s %s\n",
@ -1326,8 +1368,13 @@ Status StressTest::TestBackupRestore(
}
}
std::vector<BackupInfo> backup_info;
// If inplace_not_restore, we verify the backup by opening it as a
// read-only DB. If !inplace_not_restore, we restore it to a temporary
// directory for verification.
bool inplace_not_restore = thread->rand.OneIn(3);
if (s.ok()) {
backup_engine->GetBackupInfo(&backup_info);
backup_engine->GetBackupInfo(&backup_info,
/*include_file_details*/ inplace_not_restore);
if (backup_info.empty()) {
s = Status::NotFound("no backups found");
from = "BackupEngine::GetBackupInfo";
@ -1343,8 +1390,8 @@ Status StressTest::TestBackupRestore(
}
const bool allow_persistent = thread->tid == 0; // not too many
bool from_latest = false;
if (s.ok()) {
int count = static_cast<int>(backup_info.size());
int count = static_cast<int>(backup_info.size());
if (s.ok() && !inplace_not_restore) {
if (count > 1) {
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
@ -1362,7 +1409,9 @@ Status StressTest::TestBackupRestore(
}
}
}
if (s.ok()) {
if (s.ok() && !inplace_not_restore) {
// Purge early if restoring, to ensure the restored directory doesn't
// have some secret dependency on the backup directory.
uint32_t to_keep = 0;
if (allow_persistent) {
// allow one thread to keep up to 2 backups
@ -1390,10 +1439,21 @@ Status StressTest::TestBackupRestore(
for (auto name : column_family_names_) {
cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options));
}
s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
&restored_cf_handles, &restored_db);
if (!s.ok()) {
from = "DB::Open in backup/restore";
if (inplace_not_restore) {
BackupInfo& info = backup_info[thread->rand.Uniform(count)];
restore_options.env = info.env_for_open.get();
s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open,
cf_descriptors, &restored_cf_handles,
&restored_db);
if (!s.ok()) {
from = "DB::OpenForReadOnly in backup/restore";
}
} else {
s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
&restored_cf_handles, &restored_db);
if (!s.ok()) {
from = "DB::Open in backup/restore";
}
}
}
// Note the column families chosen by `rand_column_families` cannot be
@ -1407,8 +1467,16 @@ Status StressTest::TestBackupRestore(
std::string key_str = Key(rand_keys[0]);
Slice key = key_str;
std::string restored_value;
ReadOptions read_opts;
std::string ts_str;
Slice ts;
if (FLAGS_user_timestamp_size > 0) {
ts_str = GenerateTimestampForRead();
ts = ts_str;
read_opts.timestamp = &ts;
}
Status get_status = restored_db->Get(
ReadOptions(), restored_cf_handles[rand_column_families[i]], key,
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
@ -1426,10 +1494,6 @@ Status StressTest::TestBackupRestore(
}
}
}
if (backup_engine != nullptr) {
delete backup_engine;
backup_engine = nullptr;
}
if (restored_db != nullptr) {
for (auto* cf_handle : restored_cf_handles) {
restored_db->DestroyColumnFamilyHandle(cf_handle);
@ -1437,6 +1501,22 @@ Status StressTest::TestBackupRestore(
delete restored_db;
restored_db = nullptr;
}
if (s.ok() && inplace_not_restore) {
// Purge late if inplace open read-only
uint32_t to_keep = 0;
if (allow_persistent) {
// allow one thread to keep up to 2 backups
to_keep = thread->rand.Uniform(3);
}
s = backup_engine->PurgeOldBackups(to_keep);
if (!s.ok()) {
from = "BackupEngine::PurgeOldBackups";
}
}
if (backup_engine != nullptr) {
delete backup_engine;
backup_engine = nullptr;
}
if (s.ok()) {
// Preserve directories on failure, or allowed persistent backup
if (!allow_persistent) {
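The refactor above introduces two verification paths chosen at random: restore into a scratch directory and open normally, or open the backup in place read-only, which needs file details and postpones purging until the open is finished with the backup's files. A schematic of the control flow; the functions are stubs standing in for the real backup and DB calls, not the actual API:

#include <cstdio>

// Stubs standing in for the real backup/DB calls exercised above.
bool RestoreToScratchDir() { return true; }
bool OpenRestoredDb() { return true; }
bool OpenBackupInPlaceReadOnly() { return true; }
bool PurgeOldBackups() { return true; }

// One stress iteration picks a verification strategy at random. Restoring
// purges old backups *early* to prove the restored tree has no hidden
// dependency on the backup directory; the in-place path must purge *late*,
// after the read-only open has finished with the backup's files.
void VerifyBackup(bool inplace_not_restore) {
  bool ok = true;
  if (!inplace_not_restore) {
    ok = RestoreToScratchDir() && PurgeOldBackups() && OpenRestoredDb();
  } else {
    ok = OpenBackupInPlaceReadOnly();
    if (ok) {
      ok = PurgeOldBackups();  // late purge
    }
  }
  std::printf("backup verification %s\n", ok ? "passed" : "failed");
}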
@ -1739,6 +1819,7 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread,
const std::string& keystr, uint64_t i) {
Slice key = keystr;
ColumnFamilyHandle* column_family = column_families_[rand_column_family];
ReadOptions ropt;
#ifndef ROCKSDB_LITE
auto db_impl = static_cast_with_check<DBImpl>(db_->GetRootDB());
const bool ww_snapshot = thread->rand.OneIn(10);
@ -1748,8 +1829,19 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread,
#else
const Snapshot* snapshot = db_->GetSnapshot();
#endif // !ROCKSDB_LITE
ReadOptions ropt;
ropt.snapshot = snapshot;

// Ideally, we want snapshot taking and timestamp generation to be atomic
// here, so that the snapshot corresponds to the timestamp. However, it is
// not possible with current GetSnapshot() API.
std::string ts_str;
Slice ts;
if (FLAGS_user_timestamp_size > 0) {
ts_str = GenerateTimestampForRead();
ts = ts_str;
ropt.timestamp = &ts;
}

std::string value_at;
// When taking a snapshot, we also read a key from that snapshot. We
// will later read the same key before releasing the snapshot and
@ -1771,10 +1863,14 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread,
}
}

ThreadState::SnapshotState snap_state = {
snapshot, rand_column_family, column_family->GetName(),
keystr, status_at, value_at,
key_vec};
ThreadState::SnapshotState snap_state = {snapshot,
rand_column_family,
column_family->GetName(),
keystr,
status_at,
value_at,
key_vec,
ts_str};
uint64_t hold_for = FLAGS_snapshot_hold_ops;
if (FLAGS_long_running_snapshots) {
// Hold 10% of snapshots for 10x more
@ -1879,6 +1975,13 @@ uint32_t StressTest::GetRangeHash(ThreadState* thread, const Snapshot* snapshot,
ReadOptions ro;
ro.snapshot = snapshot;
ro.total_order_seek = true;
std::string ts_str;
Slice ts;
if (FLAGS_user_timestamp_size > 0) {
ts_str = GenerateTimestampForRead();
ts = ts_str;
ro.timestamp = &ts;
}
std::unique_ptr<Iterator> it(db_->NewIterator(ro, column_family));
for (it->Seek(start_key);
it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0;
@ -2004,6 +2107,8 @@ void StressTest::PrintEnv() const {
fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection);
fprintf(stdout, "Best efforts recovery : %d\n",
static_cast<int>(FLAGS_best_efforts_recovery));
fprintf(stdout, "User timestamp size bytes : %d\n",
static_cast<int>(FLAGS_user_timestamp_size));

fprintf(stdout, "------------------------------------------------\n");
}
@ -2223,11 +2328,10 @@ void StressTest::Open() {

if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
FLAGS_allow_setting_blob_options_dynamically) &&
(FLAGS_use_merge || FLAGS_backup_one_in > 0 ||
FLAGS_best_efforts_recovery)) {
(FLAGS_use_merge || FLAGS_best_efforts_recovery)) {
fprintf(stderr,
"Integrated BlobDB is currently incompatible with Merge, "
"backup/restore, and best-effort recovery\n");
"and best-effort recovery\n");
exit(1);
}

@ -2247,6 +2351,11 @@ void StressTest::Open() {
fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());

Status s;

if (FLAGS_user_timestamp_size > 0) {
CheckAndSetOptionsForUserTimestamp();
}

if (FLAGS_ttl == -1) {
std::vector<std::string> existing_column_families;
s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
@ -2498,5 +2607,72 @@ void StressTest::Reopen(ThreadState* thread) {
clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
Open();
}

void StressTest::CheckAndSetOptionsForUserTimestamp() {
assert(FLAGS_user_timestamp_size > 0);
const Comparator* const cmp = test::ComparatorWithU64Ts();
assert(cmp);
if (FLAGS_user_timestamp_size != cmp->timestamp_size()) {
fprintf(stderr,
"Only -user_timestamp_size=%d is supported in stress test.\n",
static_cast<int>(cmp->timestamp_size()));
exit(1);
}
if (FLAGS_nooverwritepercent > 0) {
fprintf(stderr,
"-nooverwritepercent must be 0 because SingleDelete must be "
"disabled.\n");
exit(1);
}
if (FLAGS_use_merge || FLAGS_use_full_merge_v1) {
fprintf(stderr, "Merge does not support timestamp yet.\n");
exit(1);
}
if (FLAGS_delrangepercent > 0) {
fprintf(stderr, "DeleteRange does not support timestamp yet.\n");
exit(1);
}
if (FLAGS_use_txn) {
fprintf(stderr, "TransactionDB does not support timestamp yet.\n");
exit(1);
}
if (FLAGS_read_only) {
fprintf(stderr, "When opened as read-only, timestamp not supported.\n");
exit(1);
}
if (FLAGS_test_secondary || FLAGS_secondary_catch_up_one_in > 0 ||
FLAGS_continuous_verification_interval > 0) {
fprintf(stderr, "Secondary instance does not support timestamp.\n");
exit(1);
}
if (FLAGS_checkpoint_one_in > 0) {
fprintf(stderr,
"-checkpoint_one_in=%d requires "
"DBImplReadOnly, which is not supported with timestamp\n",
FLAGS_checkpoint_one_in);
exit(1);
}
#ifndef ROCKSDB_LITE
if (FLAGS_enable_blob_files || FLAGS_use_blob_db) {
fprintf(stderr, "BlobDB not supported with timestamp.\n");
exit(1);
}
#endif // !ROCKSDB_LITE
if (FLAGS_enable_compaction_filter) {
fprintf(stderr, "CompactionFilter not supported with timestamp.\n");
exit(1);
}
if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) {
fprintf(stderr,
"Due to per-key ts-seq ordering constraint, only the (default) "
"non-batched test is supported with timestamp.\n");
exit(1);
}
if (FLAGS_ingest_external_file_one_in > 0) {
fprintf(stderr, "Bulk loading may not support timestamp yet.\n");
exit(1);
}
options_.comparator = cmp;
}
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS
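CheckAndSetOptionsForUserTimestamp is a fail-fast validator: it rejects every feature combination not yet wired up for timestamps before installing the U64-timestamp comparator. The same pattern in miniature, with plain bools instead of gflags; the incompatibility list is a subset copied from the function above, not exhaustive:

#include <cstdio>
#include <cstdlib>

struct StressFlags {
  unsigned user_timestamp_size = 8;
  bool use_merge = false;
  bool use_txn = false;
  bool read_only = false;
  bool enable_compaction_filter = false;
};

// Validate up front and abort with a specific message, mirroring the
// exit(1)-per-check style of the function above.
void ValidateTimestampFlags(const StressFlags& f) {
  if (f.user_timestamp_size != 8) {
    std::fprintf(stderr, "Only 8-byte timestamps are supported.\n");
    std::exit(1);
  }
  if (f.use_merge) {
    std::fprintf(stderr, "Merge does not support timestamp yet.\n");
    std::exit(1);
  }
  if (f.use_txn) {
    std::fprintf(stderr, "TransactionDB does not support timestamp yet.\n");
    std::exit(1);
  }
  if (f.read_only) {
    std::fprintf(stderr, "Read-only open does not support timestamp.\n");
    std::exit(1);
  }
  if (f.enable_compaction_filter) {
    std::fprintf(stderr, "CompactionFilter not supported with timestamp.\n");
    std::exit(1);
  }
}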
@ -211,6 +211,8 @@ class StressTest {

void Reopen(ThreadState* thread);

void CheckAndSetOptionsForUserTimestamp();

std::shared_ptr<Cache> cache_;
std::shared_ptr<Cache> compressed_cache_;
std::shared_ptr<const FilterPolicy> filter_policy_;

@ -22,6 +22,13 @@ class NonBatchedOpsStressTest : public StressTest {

void VerifyDb(ThreadState* thread) const override {
ReadOptions options(FLAGS_verify_checksum, true);
std::string ts_str;
Slice ts;
if (FLAGS_user_timestamp_size > 0) {
ts_str = GenerateTimestampForRead();
ts = ts_str;
options.timestamp = &ts;
}
auto shared = thread->shared;
const int64_t max_key = shared->GetMaxKey();
const int64_t keys_per_thread = max_key / shared->GetNumThreads();
@ -477,6 +484,8 @@ class NonBatchedOpsStressTest : public StressTest {
int64_t max_key = shared->GetMaxKey();
int64_t rand_key = rand_keys[0];
int rand_column_family = rand_column_families[0];
std::string write_ts_str;
Slice write_ts;
while (!shared->AllowsOverwrite(rand_key) &&
(FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) {
lock.reset();
@ -484,6 +493,11 @@ class NonBatchedOpsStressTest : public StressTest {
rand_column_family = thread->rand.Next() % FLAGS_column_families;
lock.reset(
new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
if (FLAGS_user_timestamp_size > 0) {
write_ts_str = NowNanosStr();
write_ts = write_ts_str;
write_opts.timestamp = &write_ts;
}
}

std::string key_str = Key(rand_key);
@ -559,6 +573,8 @@ class NonBatchedOpsStressTest : public StressTest {
// OPERATION delete
// If the chosen key does not allow overwrite and it does not exist,
// choose another key.
std::string write_ts_str;
Slice write_ts;
while (!shared->AllowsOverwrite(rand_key) &&
!shared->Exists(rand_column_family, rand_key)) {
lock.reset();
@ -566,6 +582,11 @@ class NonBatchedOpsStressTest : public StressTest {
rand_column_family = thread->rand.Next() % FLAGS_column_families;
lock.reset(
new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
if (FLAGS_user_timestamp_size > 0) {
write_ts_str = NowNanosStr();
write_ts = write_ts_str;
write_opts.timestamp = &write_ts;
}
}

std::string key_str = Key(rand_key);
Some files were not shown because too many files have changed in this diff.