Compare commits

40 Commits

SHA1:

f2fc398256
3f796bf566
aaa73db29f
14c7263445
e491ffd423
2a06ee8994
984140f46a
ddabcd8b88
69cd43e433
bf152e7e48
b1124fa127
80474e6791
86abd221aa
8e68ffb872
41526f44f6
1412fd9a35
bda9def2d4
5467d7a116
28b01eb86b
82fb020445
c4d49a7cd6
60f60c8475
826b5013a5
b91318fb8a
55cb17e544
71cd1c2c59
d800ab757c
0418aab87c
753499f70d
2a21b0410b
35b5a76faf
749cc74632
42b82e76d2
a71e3934fd
f74f512df9
b2b06e5200
7faa39b288
f201a44b41
21fb8c0733
74bcb5ed20
@@ -252,11 +252,11 @@ set(SOURCES
        db/db_impl.cc
        db/db_impl_debug.cc
        db/db_impl_experimental.cc
        db/db_impl_add_file.cc
        db/db_impl_readonly.cc
        db/db_info_dumper.cc
        db/db_iter.cc
        db/event_helpers.cc
        db/external_sst_file_ingestion_job.cc
        db/experimental.cc
        db/filename.cc
        db/file_indexer.cc

@@ -391,7 +391,6 @@ set(SOURCES
        utilities/document/json_document_builder.cc
        utilities/env_mirror.cc
        utilities/env_registry.cc
        utilities/flashcache/flashcache.cc
        utilities/geodb/geodb_impl.cc
        utilities/leveldb_options/leveldb_options.cc
        utilities/memory/memory_util.cc
HISTORY.md (10 changes)
@@ -1,4 +1,14 @@
# Rocksdb Change Log
## 4.13.5
### Public API Change
* Fix a regression in compaction performance.
* Disallow calling IngestExternalFile() on a dropped column family.
* Add an EventListener::OnExternalFileIngested() event that is called for every successfully ingested external file.

## 4.13.4
### Public API Change
* Removed flashcache support.

## 4.13.0 (10/18/2016)
### Public API Change
* DB::GetOptions() reflects dynamically changed options (i.e. options set through DB::SetOptions()) and returns a copy of the options instead of a reference.
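To make the new public API concrete, here is a minimal sketch of ingesting an externally built SST file and observing the new OnExternalFileIngested() callback. It assumes the 4.13.x headers touched in this diff; the paths, keys, and the IngestLogger class are illustrative only, and most error handling is elided.

```cpp
#include <cstdio>
#include <memory>

#include <rocksdb/db.h>
#include <rocksdb/listener.h>
#include <rocksdb/options.h>
#include <rocksdb/sst_file_writer.h>

using namespace rocksdb;

// Illustrative listener; OnExternalFileIngested() is the new callback wired up
// in db/db_impl.cc below and fires once per successfully ingested file.
class IngestLogger : public EventListener {
 public:
  void OnExternalFileIngested(DB* /*db*/,
                              const ExternalFileIngestionInfo& info) override {
    fprintf(stderr, "ingested %s into %s as %s\n",
            info.external_file_path.c_str(), info.cf_name.c_str(),
            info.internal_file_path.c_str());
  }
};

int main() {
  Options options;
  options.create_if_missing = true;
  options.listeners.push_back(std::make_shared<IngestLogger>());

  // Build an SST file outside the DB (4.13-era SstFileWriter signature).
  SstFileWriter writer(EnvOptions(), options, options.comparator);
  writer.Open("/tmp/external.sst");
  writer.Add("key1", "value1");
  writer.Add("key2", "value2");
  writer.Finish();

  DB* db = nullptr;
  DB::Open(options, "/tmp/ingest_demo", &db);

  IngestExternalFileOptions ifo;
  Status s = db->IngestExternalFile(db->DefaultColumnFamily(),
                                    {"/tmp/external.sst"}, ifo);
  fprintf(stderr, "ingest status: %s\n", s.ToString().c_str());
  delete db;
  return 0;
}
```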
Makefile (4 changes)
@@ -216,10 +216,6 @@ default: all
WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \
  -Wno-unused-parameter

ifndef DISABLE_WARNING_AS_ERROR
  WARNING_FLAGS += -Werror
endif

CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers
@@ -51,12 +51,7 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
  FBCODE_BUILD="true"
  # If we're compiling with TSAN we need pic build
  PIC_BUILD=$COMPILE_WITH_TSAN
  if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
    source "$PWD/build_tools/fbcode_config.sh"
  else
    # we need this to build with MySQL. Don't use for other purposes.
    source "$PWD/build_tools/fbcode_config4.8.1.sh"
  fi
fi

# Delete existing output, if it exists
@ -1,17 +1,19 @@
|
||||
GCC_BASE=/mnt/gvfs/third-party2/gcc/cf7d14c625ce30bae1a4661c2319c5a283e4dd22/4.9.x/centos6-native/108cf83
|
||||
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/8598c375b0e94e1448182eb3df034704144a838d/stable/centos6-native/3f16ddd
|
||||
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/d6e0a7da6faba45f5e5b1638f9edd7afc2f34e7d/4.9.x/gcc-4.9-glibc-2.20/024dbc3
|
||||
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/d282e6e8f3d20f4e40a516834847bdc038e07973/2.20/gcc-4.9-glibc-2.20/500e281
|
||||
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/8c38a4c1e52b4c2cc8a9cdc31b9c947ed7dbfcb4/1.1.3/gcc-4.9-glibc-2.20/e9936bf
|
||||
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/0882df3713c7a84f15abe368dc004581f20b39d7/1.2.8/gcc-5-glibc-2.23/9bc6787
|
||||
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/740325875f6729f42d28deaa2147b0854f3a347e/1.0.6/gcc-5-glibc-2.23/9bc6787
|
||||
LZ4_BASE=/mnt/gvfs/third-party2/lz4/0e790b441e2d9acd68d51e1d2e028f88c6a79ddf/r131/gcc-5-glibc-2.23/9bc6787
|
||||
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/9455f75ff7f4831dc9fda02a6a0f8c68922fad8f/1.0.0/gcc-5-glibc-2.23/9bc6787
|
||||
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/f001a51b2854957676d07306ef3abf67186b5c8b/2.1.1/gcc-4.8.1-glibc-2.17/c3f970a
|
||||
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/fc8a13ca1fffa4d0765c716c5a0b49f0c107518f/master/gcc-5-glibc-2.23/1c32b4b
|
||||
NUMA_BASE=/mnt/gvfs/third-party2/numa/17c514c4d102a25ca15f4558be564eeed76f4b6a/2.0.8/gcc-5-glibc-2.23/9bc6787
|
||||
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/ad576de2a1ea560c4d3434304f0fc4e079bede42/trunk/gcc-5-glibc-2.23/b1847cb
|
||||
TBB_BASE=/mnt/gvfs/third-party2/tbb/9d9a554877d0c5bef330fe818ab7178806dd316a/4.0_update2/gcc-4.9-glibc-2.20/e9936bf
|
||||
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/7c111ff27e0c466235163f00f280a9d617c3d2ec/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e
|
||||
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/b7fd454c4b10c6a81015d4524ed06cdeab558490/2.26/centos6-native/da39a3e
|
||||
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d7f4d4d86674a57668e3a96f76f0e17dd0eb8765/3.10.0/gcc-4.9-glibc-2.20/e9936bf
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
||||
GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/7.x/centos7-native/b2ef2b6
|
||||
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/963d9aeda70cc4779885b1277484fe7544a04e3e/9.0.0/platform007/9e92d53/
|
||||
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/7.x/platform007/5620abc
|
||||
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.26/platform007/f259413
|
||||
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d
|
||||
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/platform007/ca4da3d
|
||||
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d
|
||||
LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/platform007/ca4da3d
|
||||
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/platform007/15a3614
|
||||
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d
|
||||
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/platform007/c26c002
|
||||
NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/platform007/ca4da3d
|
||||
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/platform007/6f3e0a9
|
||||
TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/platform007/ca4da3d
|
||||
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/fb/platform007/da39a3e
|
||||
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/ab9f09bba370e7066cafd4eb59752db93f2e8312/2.29.1/platform007/15a3614
|
||||
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/platform007/ca4da3d
|
||||
LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832
|
||||
|
@ -13,7 +13,7 @@ source "$BASEDIR/dependencies.sh"
|
||||
CFLAGS=""
|
||||
|
||||
# libgcc
|
||||
LIBGCC_INCLUDE="$LIBGCC_BASE/include"
|
||||
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/7.3.0"
|
||||
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
|
||||
|
||||
# glibc
|
||||
@ -43,12 +43,16 @@ if test -z $PIC_BUILD; then
|
||||
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
|
||||
LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
|
||||
CFLAGS+=" -DLZ4"
|
||||
|
||||
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
|
||||
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
|
||||
CFLAGS+=" -DZSTD"
|
||||
fi
|
||||
|
||||
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
|
||||
else
|
||||
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DZSTD"
|
||||
|
||||
# location of gflags headers and libraries
|
||||
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
|
||||
if test -z $PIC_BUILD; then
|
||||
@ -56,7 +60,7 @@ if test -z $PIC_BUILD; then
|
||||
else
|
||||
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
|
||||
fi
|
||||
CFLAGS+=" -DGFLAGS=google"
|
||||
CFLAGS+=" -DGFLAGS=gflags"
|
||||
|
||||
# location of jemalloc
|
||||
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
|
||||
@ -104,8 +108,8 @@ if [ -z "$USE_CLANG" ]; then
|
||||
CXX="$GCC_BASE/bin/g++"
|
||||
|
||||
CFLAGS+=" -B$BINUTILS/gold"
|
||||
CFLAGS+=" -isystem $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
|
||||
CFLAGS+=" -isystem $GLIBC_INCLUDE"
|
||||
JEMALLOC=1
|
||||
else
|
||||
# clang
|
||||
@ -116,8 +120,8 @@ else
|
||||
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
|
||||
|
||||
CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x "
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux "
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x "
|
||||
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x/x86_64-facebook-linux "
|
||||
CFLAGS+=" -isystem $GLIBC_INCLUDE"
|
||||
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
|
||||
CFLAGS+=" -isystem $CLANG_INCLUDE"
|
||||
@ -128,13 +132,14 @@ else
|
||||
fi
|
||||
|
||||
CFLAGS+=" $DEPS_INCLUDE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
|
||||
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42"
|
||||
CXXFLAGS+=" $CFLAGS"
|
||||
|
||||
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
|
||||
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so"
|
||||
EXEC_LDFLAGS+=" -B$BINUTILS/gold"
|
||||
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform007/lib/ld.so"
|
||||
EXEC_LDFLAGS+=" $LIBUNWIND"
|
||||
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-4.9-glibc-2.20/lib"
|
||||
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform007/lib"
|
||||
# required by libtbb
|
||||
EXEC_LDFLAGS+=" -ldl"
|
||||
|
||||
|
@@ -160,6 +160,10 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);
  if (result.min_write_buffer_number_to_merge < 1) {
    result.min_write_buffer_number_to_merge = 1;
  }

  if (result.num_levels < 1) {
    result.num_levels = 1;
  }

@@ -545,14 +549,31 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
  // SanitizeOptions() ensures it.
  assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);

  if (level0_file_num_compaction_trigger < 0) {
    return std::numeric_limits<int>::max();
  }

  const int64_t twice_level0_trigger =
      static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;

  const int64_t one_fourth_trigger_slowdown =
      static_cast<int64_t>(level0_file_num_compaction_trigger) +
      ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
       4);

  assert(twice_level0_trigger >= 0);
  assert(one_fourth_trigger_slowdown >= 0);

  // 1/4 of the way between L0 compaction trigger threshold and slowdown
  // condition.
  // Or twice as compaction trigger, if it is smaller.
  return std::min(level0_file_num_compaction_trigger * 2,
                  level0_file_num_compaction_trigger +
                      (level0_slowdown_writes_trigger -
                       level0_file_num_compaction_trigger) /
                          4);
  int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
  if (res >= port::kMaxInt32) {
    return port::kMaxInt32;
  } else {
    // res fits in int
    return static_cast<int>(res);
  }
}
}  // namespace
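The second hunk above replaces the direct std::min return with 64-bit intermediates clamped to port::kMaxInt32, so that a very large level0_file_num_compaction_trigger no longer overflows int. A standalone sketch of the same idea (the function name is illustrative, not the RocksDB internals):

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>

// Sketch: compute the L0 "speed up compaction" threshold without int overflow.
// With trigger near INT_MAX the old `trigger * 2` wrapped around; doing the
// arithmetic in int64_t and clamping the result avoids that.
int SpeedupThreshold(int trigger, int slowdown) {
  if (trigger < 0) {
    return std::numeric_limits<int>::max();
  }
  const int64_t twice = static_cast<int64_t>(trigger) * 2;
  const int64_t quarter_way =
      static_cast<int64_t>(trigger) + (slowdown - trigger) / 4;
  const int64_t res = std::min(twice, quarter_way);
  return res >= std::numeric_limits<int>::max()
             ? std::numeric_limits<int>::max()
             : static_cast<int>(res);
}

// Worked example: trigger = 4, slowdown = 20 gives min(8, 4 + 4) = 8, the same
// answer as the old code; the behavior only changes for huge trigger values.
```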
@@ -941,6 +941,12 @@ TEST_F(ColumnFamilyTest, CrashAfterFlush) {
  db_options_.env = env_;
}

TEST_F(ColumnFamilyTest, OpenNonexistentColumnFamily) {
  ASSERT_OK(TryOpen({"default"}));
  Close();
  ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
}

#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
// Makes sure that obsolete log files get deleted
TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) {
||||
|
@ -484,9 +484,8 @@ void CompactionJob::GenSubcompactionBoundaries() {
|
||||
static_cast<uint64_t>(db_options_.max_subcompactions),
|
||||
max_output_files});
|
||||
|
||||
double mean = sum * 1.0 / subcompactions;
|
||||
|
||||
if (subcompactions > 1) {
|
||||
double mean = sum * 1.0 / subcompactions;
|
||||
// Greedily add ranges to the subcompaction until the sum of the ranges'
|
||||
// sizes becomes >= the expected mean size of a subcompaction
|
||||
sum = 0;
|
||||
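The GenSubcompactionBoundaries() hunk above only computes the per-subcompaction mean when there will actually be more than one subcompaction. The greedy splitting that the in-code comment describes can be illustrated in isolation (a simplified sketch, not the RocksDB code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: given per-range sizes, emit a boundary every time the running sum
// reaches the mean size of a subcompaction, so each worker gets roughly the
// same number of bytes to process.
std::vector<size_t> GreedyBoundaries(const std::vector<uint64_t>& range_sizes,
                                     int subcompactions) {
  std::vector<size_t> boundaries;
  if (subcompactions <= 1) return boundaries;  // single worker: no split
  uint64_t total = 0;
  for (uint64_t s : range_sizes) total += s;
  const double mean = total * 1.0 / subcompactions;
  uint64_t sum = 0;
  for (size_t i = 0; i < range_sizes.size(); i++) {
    sum += range_sizes[i];
    if (sum >= mean) {  // close this subcompaction after range i
      boundaries.push_back(i);
      sum = 0;
    }
  }
  return boundaries;
}
```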
@ -591,6 +590,16 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
|
||||
VersionStorageInfo::LevelSummaryStorage tmp;
|
||||
auto vstorage = cfd->current()->storage_info();
|
||||
const auto& stats = compaction_stats_;
|
||||
|
||||
double read_write_amp = 0.0;
|
||||
double write_amp = 0.0;
|
||||
if (stats.bytes_read_non_output_levels > 0) {
|
||||
read_write_amp = (stats.bytes_written + stats.bytes_read_output_level +
|
||||
stats.bytes_read_non_output_levels) /
|
||||
static_cast<double>(stats.bytes_read_non_output_levels);
|
||||
write_amp = stats.bytes_written /
|
||||
static_cast<double>(stats.bytes_read_non_output_levels);
|
||||
}
|
||||
LogToBuffer(
|
||||
log_buffer_,
|
||||
"[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
|
||||
@ -603,16 +612,10 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
|
||||
stats.bytes_written / static_cast<double>(stats.micros),
|
||||
compact_->compaction->output_level(),
|
||||
stats.num_input_files_in_non_output_levels,
|
||||
stats.num_input_files_in_output_level,
|
||||
stats.num_output_files,
|
||||
stats.num_input_files_in_output_level, stats.num_output_files,
|
||||
stats.bytes_read_non_output_levels / 1048576.0,
|
||||
stats.bytes_read_output_level / 1048576.0,
|
||||
stats.bytes_written / 1048576.0,
|
||||
(stats.bytes_written + stats.bytes_read_output_level +
|
||||
stats.bytes_read_non_output_levels) /
|
||||
static_cast<double>(stats.bytes_read_non_output_levels),
|
||||
stats.bytes_written /
|
||||
static_cast<double>(stats.bytes_read_non_output_levels),
|
||||
stats.bytes_written / 1048576.0, read_write_amp, write_amp,
|
||||
status.ToString().c_str(), stats.num_input_records,
|
||||
stats.num_dropped_records);
|
||||
|
||||
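The Install() hunk above now pre-computes the two amplification factors, and only when some bytes were actually read from non-output levels, instead of dividing inside the LogToBuffer() argument list where the divisor could be zero. A compact restatement of the formulas (free-standing sketch; names are illustrative):

```cpp
#include <cstdint>

// read_write_amp = (bytes written + all bytes read) / bytes read from
// non-output levels; write_amp = bytes written / the same divisor.
struct AmpFactors {
  double read_write_amp = 0.0;
  double write_amp = 0.0;
};

AmpFactors ComputeAmp(uint64_t bytes_written, uint64_t bytes_read_output_level,
                      uint64_t bytes_read_non_output_levels) {
  AmpFactors a;
  if (bytes_read_non_output_levels > 0) {  // guard against division by zero
    a.read_write_amp = (bytes_written + bytes_read_output_level +
                        bytes_read_non_output_levels) /
                       static_cast<double>(bytes_read_non_output_levels);
    a.write_amp =
        bytes_written / static_cast<double>(bytes_read_non_output_levels);
  }
  return a;
}
```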
@ -846,9 +849,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
||||
}
|
||||
}
|
||||
|
||||
Status input_status = input->status();
|
||||
c_iter->Next();
|
||||
|
||||
// Close output file if it is big enough
|
||||
// TODO(aekmekji): determine if file should be closed earlier than this
|
||||
// during subcompactions (i.e. if output size, estimated by input size, is
|
||||
@ -857,6 +857,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
||||
if (sub_compact->compaction->output_level() != 0 &&
|
||||
sub_compact->current_output_file_size >=
|
||||
sub_compact->compaction->max_output_file_size()) {
|
||||
Status input_status = input->status();
|
||||
c_iter->Next();
|
||||
|
||||
const Slice* next_key = nullptr;
|
||||
if (c_iter->Valid()) {
|
||||
next_key = &c_iter->key();
|
||||
@ -868,6 +871,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
||||
// files.
|
||||
sub_compact->compression_dict = std::move(compression_dict);
|
||||
}
|
||||
} else {
|
||||
c_iter->Next();
|
||||
}
|
||||
}
|
||||
|
||||
|
db/db_impl.cc (183 changes)
@ -39,6 +39,7 @@
|
||||
#include "db/db_iter.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/event_helpers.h"
|
||||
#include "db/external_sst_file_ingestion_job.h"
|
||||
#include "db/filename.h"
|
||||
#include "db/flush_job.h"
|
||||
#include "db/forward_iterator.h"
|
||||
@ -346,7 +347,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
||||
next_job_id_(1),
|
||||
has_unpersisted_data_(false),
|
||||
env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
|
||||
num_running_addfile_(0),
|
||||
num_running_ingest_file_(0),
|
||||
#ifndef ROCKSDB_LITE
|
||||
wal_manager_(immutable_db_options_, env_options_),
|
||||
#endif // ROCKSDB_LITE
|
||||
@ -1347,7 +1348,6 @@ Status DBImpl::Recover(
|
||||
}
|
||||
}
|
||||
}
|
||||
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
|
||||
}
|
||||
|
||||
// Initial value
|
||||
@ -1609,10 +1609,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
// we just ignore the update.
|
||||
// That's why we set ignore missing column families to true
|
||||
bool has_valid_writes = false;
|
||||
// If we pass DB through and options.max_successive_merges is hit
|
||||
// during recovery, Get() will be issued which will try to acquire
|
||||
// DB mutex and cause deadlock, as DB mutex is already held.
|
||||
// The DB pointer is not needed unless 2PC is used.
|
||||
// TODO(sdong) fix the allow_2pc case too.
|
||||
status = WriteBatchInternal::InsertInto(
|
||||
&batch, column_family_memtables_.get(), &flush_scheduler_, true,
|
||||
log_number, this, false /* concurrent_memtable_writes */,
|
||||
next_sequence, &has_valid_writes);
|
||||
log_number, immutable_db_options_.allow_2pc ? this : nullptr,
|
||||
false /* concurrent_memtable_writes */, next_sequence,
|
||||
&has_valid_writes);
|
||||
// If it is the first log file and there is no column family updated
|
||||
// after replaying the file, this file may be a stale file. We ignore
|
||||
// sequence IDs from the file. Otherwise, if a newer stale log file that
|
||||
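The comment block in this hunk carries the reasoning for the change: once a key accumulates more than max_successive_merges merge operands, the memtable insert path resolves it with a Get() through the DB pointer, and during WAL replay that Get() would try to re-acquire the DB mutex that recovery already holds; the DB pointer is therefore passed only when 2PC actually needs it. From the user's side, the option behaves roughly as sketched below (a sketch of the option's semantics, not the internal code path):

```cpp
#include <rocksdb/options.h>

// Sketch: max_successive_merges caps how many consecutive Merge() entries may
// pile up for a single key in the memtable. Once the cap is hit, the write
// path resolves the key with a Get() and writes the merged result as a full
// value -- the very read that must not re-enter the DB mutex during recovery.
void ConfigureMerges(rocksdb::Options* options) {
  options->max_successive_merges = 3;  // default 0 leaves merging unbounded
  // options->merge_operator must also be set for Merge() to be usable.
}
```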
@ -1687,6 +1693,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
}
|
||||
}
|
||||
|
||||
// True if there's any data in the WALs; if not, we can skip re-processing
|
||||
// them later
|
||||
bool data_seen = false;
|
||||
if (!read_only) {
|
||||
// no need to refcount since client still doesn't have access
|
||||
// to the DB and can not drop column families while we iterate
|
||||
@ -1722,6 +1731,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
|
||||
*next_sequence);
|
||||
}
|
||||
data_seen = true;
|
||||
}
|
||||
|
||||
// write MANIFEST with update
|
||||
@ -1747,7 +1757,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
||||
}
|
||||
}
|
||||
|
||||
if (!flushed) {
|
||||
if (data_seen && !flushed) {
|
||||
// Mark these as alive so they'll be considered for deletion later by
|
||||
// FindObsoleteFiles()
|
||||
for (auto log_number : log_numbers) {
|
||||
@ -2143,8 +2153,8 @@ Status DBImpl::CompactFiles(
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
|
||||
// This call will unlock/lock the mutex to wait for current running
|
||||
// AddFile() calls to finish.
|
||||
WaitForAddFile();
|
||||
// IngestExternalFile() calls to finish.
|
||||
WaitForIngestFile();
|
||||
|
||||
s = CompactFilesImpl(compact_options, cfd, sv->current,
|
||||
input_file_names, output_level,
|
||||
@ -2899,7 +2909,8 @@ InternalIterator* DBImpl::NewInternalIterator(
|
||||
}
|
||||
|
||||
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
||||
const FlushOptions& flush_options) {
|
||||
const FlushOptions& flush_options,
|
||||
bool writes_stopped) {
|
||||
Status s;
|
||||
{
|
||||
WriteContext context;
|
||||
@ -2911,12 +2922,17 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
||||
}
|
||||
|
||||
WriteThread::Writer w;
|
||||
if (!writes_stopped) {
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
}
|
||||
|
||||
// SwitchMemtable() will release and reacquire mutex
|
||||
// during execution
|
||||
s = SwitchMemtable(cfd, &context);
|
||||
|
||||
if (!writes_stopped) {
|
||||
write_thread_.ExitUnbatched(&w);
|
||||
}
|
||||
|
||||
cfd->imm()->FlushRequested();
|
||||
|
||||
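FlushMemTable() now takes a writes_stopped flag so a caller that has already entered the write thread as an unbatched writer (as IngestExternalFile() does further down) does not try to stop writes a second time. Since WriteThread and DBImpl are internal classes, the calling convention is sketched here only as annotated pseudocode:

```cpp
// Inside DBImpl, with mutex_ held and writes already stopped by the caller:
//
//   WriteThread::Writer w;
//   write_thread_.EnterUnbatched(&w, &mutex_);   // caller stops writes once
//   ...
//   FlushMemTable(cfd, FlushOptions(), true /* writes_stopped */);
//   ...
//   write_thread_.ExitUnbatched(&w);             // caller resumes writes
//
// With the default writes_stopped == false, FlushMemTable() performs the
// EnterUnbatched()/ExitUnbatched() pair itself around SwitchMemtable().
```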
@ -2940,6 +2956,12 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
|
||||
if (shutting_down_.load(std::memory_order_acquire)) {
|
||||
return Status::ShutdownInProgress();
|
||||
}
|
||||
if (cfd->IsDropped()) {
|
||||
// FlushJob cannot flush a dropped CF, if we did not break here
|
||||
// we will loop forever since cfd->imm()->NumNotFlushed() will never
|
||||
// drop to zero
|
||||
return Status::InvalidArgument("Cannot flush a dropped CF");
|
||||
}
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
if (!bg_error_.ok()) {
|
||||
@ -3295,8 +3317,8 @@ void DBImpl::BackgroundCallCompaction(void* arg) {
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
|
||||
// This call will unlock/lock the mutex to wait for current running
|
||||
// AddFile() calls to finish.
|
||||
WaitForAddFile();
|
||||
// IngestExternalFile() calls to finish.
|
||||
WaitForIngestFile();
|
||||
|
||||
num_running_compactions_++;
|
||||
|
||||
@ -3704,8 +3726,8 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompaction* m) {
|
||||
}
|
||||
|
||||
bool DBImpl::ShouldntRunManualCompaction(ManualCompaction* m) {
|
||||
if (num_running_addfile_ > 0) {
|
||||
// We need to wait for other AddFile() calls to finish
|
||||
if (num_running_ingest_file_ > 0) {
|
||||
// We need to wait for other IngestExternalFile() calls to finish
|
||||
// before running a manual compaction.
|
||||
return true;
|
||||
}
|
||||
@ -3850,7 +3872,10 @@ InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
|
||||
InternalIterator* internal_iter;
|
||||
assert(arena != nullptr);
|
||||
// Need to create internal iterator from the arena.
|
||||
MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
|
||||
MergeIteratorBuilder merge_iter_builder(
|
||||
&cfd->internal_comparator(), arena,
|
||||
!read_options.total_order_seek &&
|
||||
cfd->ioptions()->prefix_extractor != nullptr);
|
||||
// Collect iterator for mutable mem
|
||||
merge_iter_builder.AddIterator(
|
||||
super_version->mem->NewIterator(read_options, arena));
|
||||
@ -4610,7 +4635,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
||||
if (write_thread_.CompleteParallelWorker(&w)) {
|
||||
// we're responsible for early exit
|
||||
auto last_sequence = w.parallel_group->last_sequence;
|
||||
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
|
||||
versions_->SetLastSequence(last_sequence);
|
||||
write_thread_.EarlyExitParallelGroup(&w);
|
||||
}
|
||||
@ -4946,7 +4970,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
||||
}
|
||||
|
||||
if (!exit_completed_early && w.status.ok()) {
|
||||
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
|
||||
versions_->SetLastSequence(last_sequence);
|
||||
if (!need_log_sync) {
|
||||
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status);
|
||||
@ -6364,6 +6387,136 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::IngestExternalFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& external_files,
|
||||
const IngestExternalFileOptions& ingestion_options) {
|
||||
Status status;
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
|
||||
ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd,
|
||||
immutable_db_options_, env_options_,
|
||||
&snapshots_, ingestion_options);
|
||||
|
||||
// Make sure that bg cleanup wont delete the files that we are ingesting
|
||||
std::list<uint64_t>::iterator pending_output_elem;
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
pending_output_elem = CaptureCurrentFileNumberInPendingOutputs();
|
||||
}
|
||||
|
||||
status = ingestion_job.Prepare(external_files);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:Start");
|
||||
{
|
||||
// Lock db mutex
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
|
||||
|
||||
// Stop writes to the DB
|
||||
WriteThread::Writer w;
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
|
||||
num_running_ingest_file_++;
|
||||
|
||||
// We cannot ingest a file into a dropped CF
|
||||
if (cfd->IsDropped()) {
|
||||
status = Status::InvalidArgument(
|
||||
"Cannot ingest an external file into a dropped CF");
|
||||
}
|
||||
|
||||
// Figure out if we need to flush the memtable first
|
||||
if (status.ok()) {
|
||||
bool need_flush = false;
|
||||
status = ingestion_job.NeedsFlush(&need_flush);
|
||||
|
||||
if (status.ok() && need_flush) {
|
||||
mutex_.Unlock();
|
||||
status = FlushMemTable(cfd, FlushOptions(), true /* writes_stopped */);
|
||||
mutex_.Lock();
|
||||
}
|
||||
}
|
||||
|
||||
// Run the ingestion job
|
||||
if (status.ok()) {
|
||||
status = ingestion_job.Run();
|
||||
}
|
||||
|
||||
// Install job edit [Mutex will be unlocked here]
|
||||
auto mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
||||
if (status.ok()) {
|
||||
status =
|
||||
versions_->LogAndApply(cfd, *mutable_cf_options, ingestion_job.edit(),
|
||||
&mutex_, directories_.GetDbDir());
|
||||
}
|
||||
if (status.ok()) {
|
||||
delete InstallSuperVersionAndScheduleWork(cfd, nullptr,
|
||||
*mutable_cf_options);
|
||||
}
|
||||
|
||||
// Resume writes to the DB
|
||||
write_thread_.ExitUnbatched(&w);
|
||||
|
||||
// Update stats
|
||||
if (status.ok()) {
|
||||
ingestion_job.UpdateStats();
|
||||
}
|
||||
|
||||
ReleaseFileNumberFromPendingOutputs(pending_output_elem);
|
||||
|
||||
num_running_ingest_file_--;
|
||||
if (num_running_ingest_file_ == 0) {
|
||||
bg_cv_.SignalAll();
|
||||
}
|
||||
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
|
||||
}
|
||||
// mutex_ is unlocked here
|
||||
|
||||
// Cleanup
|
||||
ingestion_job.Cleanup(status);
|
||||
|
||||
if (status.ok()) {
|
||||
NotifyOnExternalFileIngested(cfd, ingestion_job);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
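The whole sequence above (stop writes, flush if needed, run the ingestion job, LogAndApply the edit, install a new super version, resume writes) is driven by a small options struct; the two fields this diff exercises can be set as sketched below (the helper function and path are illustrative):

```cpp
#include <string>
#include <vector>

#include <rocksdb/db.h>

// Sketch: the IngestExternalFileOptions fields consulted by the code above.
rocksdb::Status IngestOne(rocksdb::DB* db, const std::string& sst_path) {
  rocksdb::IngestExternalFileOptions opts;
  opts.move_files = true;            // try a hard link first; the job falls
                                     // back to a copy across filesystems
  opts.allow_blocking_flush = true;  // let an overlapping memtable be flushed
                                     // instead of failing with
                                     // "External file requires flush"
  return db->IngestExternalFile(db->DefaultColumnFamily(), {sst_path}, opts);
}
```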
|
||||
void DBImpl::NotifyOnExternalFileIngested(
|
||||
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
|
||||
#ifndef ROCKSDB_LITE
|
||||
if (immutable_db_options_.listeners.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
|
||||
ExternalFileIngestionInfo info;
|
||||
info.cf_name = cfd->GetName();
|
||||
info.external_file_path = f.external_file_path;
|
||||
info.internal_file_path = f.internal_file_path;
|
||||
info.global_seqno = f.assigned_seqno;
|
||||
info.table_properties = f.table_properties;
|
||||
for (auto listener : immutable_db_options_.listeners) {
|
||||
listener->OnExternalFileIngested(this, info);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void DBImpl::WaitForIngestFile() {
|
||||
mutex_.AssertHeld();
|
||||
while (num_running_ingest_file_ > 0) {
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace rocksdb
|
||||
|
db/db_impl.h (43 changes)
@ -22,6 +22,7 @@
|
||||
#include "db/column_family.h"
|
||||
#include "db/compaction_job.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/external_sst_file_ingestion_job.h"
|
||||
#include "db/flush_job.h"
|
||||
#include "db/flush_scheduler.h"
|
||||
#include "db/internal_stats.h"
|
||||
@ -260,13 +261,11 @@ class DBImpl : public DB {
|
||||
bool cache_only, SequenceNumber* seq,
|
||||
bool* found_record_for_key);
|
||||
|
||||
using DB::AddFile;
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file, bool skip_snapshot_check) override;
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& file_path_list,
|
||||
bool move_file, bool skip_snapshot_check) override;
|
||||
using DB::IngestExternalFile;
|
||||
virtual Status IngestExternalFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& external_files,
|
||||
const IngestExternalFileOptions& ingestion_options) override;
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
@ -551,6 +550,9 @@ class DBImpl : public DB {
|
||||
void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
|
||||
const MemTableInfo& mem_table_info);
|
||||
|
||||
void NotifyOnExternalFileIngested(
|
||||
ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
|
||||
|
||||
void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
|
||||
|
||||
void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
|
||||
@ -650,20 +652,13 @@ class DBImpl : public DB {
|
||||
Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
|
||||
|
||||
// Force current memtable contents to be flushed.
|
||||
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
|
||||
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
|
||||
bool writes_stopped = false);
|
||||
|
||||
// Wait for memtable flushed
|
||||
Status WaitForFlushMemTable(ColumnFamilyData* cfd);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// Finds the lowest level in the DB that the ingested file can be added to
|
||||
// REQUIRES: mutex_ held
|
||||
int PickLevelForIngestedFile(ColumnFamilyData* cfd,
|
||||
const ExternalSstFileInfo& file_info);
|
||||
|
||||
// Wait for current AddFile() calls to finish.
|
||||
// REQUIRES: mutex_ held
|
||||
void WaitForAddFile();
|
||||
|
||||
Status CompactFilesImpl(const CompactionOptions& compact_options,
|
||||
ColumnFamilyData* cfd, Version* version,
|
||||
@ -671,14 +666,14 @@ class DBImpl : public DB {
|
||||
const int output_level, int output_path_id,
|
||||
JobContext* job_context, LogBuffer* log_buffer);
|
||||
|
||||
Status ReadExternalSstFileInfo(ColumnFamilyHandle* column_family,
|
||||
const std::string& file_path,
|
||||
ExternalSstFileInfo* file_info);
|
||||
// Wait for current IngestExternalFile() calls to finish.
|
||||
// REQUIRES: mutex_ held
|
||||
void WaitForIngestFile();
|
||||
|
||||
#else
|
||||
// AddFile is not supported in ROCKSDB_LITE so this function
|
||||
// IngestExternalFile is not supported in ROCKSDB_LITE so this function
|
||||
// will be no-op
|
||||
void WaitForAddFile() {}
|
||||
void WaitForIngestFile() {}
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
|
||||
@ -752,7 +747,7 @@ class DBImpl : public DB {
|
||||
// * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
|
||||
// (i.e. whenever a flush is done, even if it didn't make any progress)
|
||||
// * whenever there is an error in background purge, flush or compaction
|
||||
// * whenever num_running_addfile_ goes to 0.
|
||||
// * whenever num_running_ingest_file_ goes to 0.
|
||||
InstrumentedCondVar bg_cv_;
|
||||
uint64_t logfile_number_;
|
||||
std::deque<uint64_t>
|
||||
@ -994,9 +989,9 @@ class DBImpl : public DB {
|
||||
// The options to access storage files
|
||||
const EnvOptions env_options_;
|
||||
|
||||
// Number of running AddFile() calls.
|
||||
// Number of running IngestExternalFile() calls.
|
||||
// REQUIRES: mutex held
|
||||
int num_running_addfile_;
|
||||
int num_running_ingest_file_;
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
WalManager wal_manager_;
|
||||
|
@@ -1,430 +0,0 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "db/db_impl.h"
|
||||
|
||||
#ifndef __STDC_FORMAT_MACROS
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#endif
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/sst_file_writer.h"
|
||||
|
||||
#include "db/builder.h"
|
||||
#include "table/sst_file_writer_collectors.h"
|
||||
#include "table/table_builder.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "util/file_util.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
Status DBImpl::ReadExternalSstFileInfo(ColumnFamilyHandle* column_family,
|
||||
const std::string& file_path,
|
||||
ExternalSstFileInfo* file_info) {
|
||||
Status status;
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
auto cfd = cfh->cfd();
|
||||
|
||||
file_info->file_path = file_path;
|
||||
status = env_->GetFileSize(file_path, &file_info->file_size);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Access the file using TableReader to extract
|
||||
// version, number of entries, smallest user key, largest user key
|
||||
std::unique_ptr<RandomAccessFile> sst_file;
|
||||
status = env_->NewRandomAccessFile(file_path, &sst_file, env_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
std::unique_ptr<RandomAccessFileReader> sst_file_reader;
|
||||
sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file)));
|
||||
|
||||
std::unique_ptr<TableReader> table_reader;
|
||||
status = cfd->ioptions()->table_factory->NewTableReader(
|
||||
TableReaderOptions(*cfd->ioptions(), env_options_,
|
||||
cfd->internal_comparator()),
|
||||
std::move(sst_file_reader), file_info->file_size, &table_reader);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Get the external sst file version from table properties
|
||||
const UserCollectedProperties& user_collected_properties =
|
||||
table_reader->GetTableProperties()->user_collected_properties;
|
||||
UserCollectedProperties::const_iterator external_sst_file_version_iter =
|
||||
user_collected_properties.find(ExternalSstFilePropertyNames::kVersion);
|
||||
if (external_sst_file_version_iter == user_collected_properties.end()) {
|
||||
return Status::InvalidArgument("Generated table version not found");
|
||||
}
|
||||
|
||||
file_info->version =
|
||||
DecodeFixed32(external_sst_file_version_iter->second.c_str());
|
||||
if (file_info->version == 2) {
|
||||
// version 2 imply that we have global sequence number
|
||||
|
||||
// TODO(tec): Implement version 2 ingestion
|
||||
file_info->sequence_number = 0;
|
||||
} else if (file_info->version == 1) {
|
||||
// version 1 imply that all sequence numbers in table equal 0
|
||||
file_info->sequence_number = 0;
|
||||
} else {
|
||||
return Status::InvalidArgument("Generated table version is not supported");
|
||||
}
|
||||
// Get number of entries in table
|
||||
file_info->num_entries = table_reader->GetTableProperties()->num_entries;
|
||||
|
||||
ParsedInternalKey key;
|
||||
std::unique_ptr<InternalIterator> iter(
|
||||
table_reader->NewIterator(ReadOptions()));
|
||||
|
||||
// Get first (smallest) key from file
|
||||
iter->SeekToFirst();
|
||||
if (!ParseInternalKey(iter->key(), &key)) {
|
||||
return Status::Corruption("Generated table have corrupted keys");
|
||||
}
|
||||
if (key.sequence != 0) {
|
||||
return Status::Corruption("Generated table have non zero sequence number");
|
||||
}
|
||||
file_info->smallest_key = key.user_key.ToString();
|
||||
|
||||
// Get last (largest) key from file
|
||||
iter->SeekToLast();
|
||||
if (!ParseInternalKey(iter->key(), &key)) {
|
||||
return Status::Corruption("Generated table have corrupted keys");
|
||||
}
|
||||
if (key.sequence != 0) {
|
||||
return Status::Corruption("Generated table have non zero sequence number");
|
||||
}
|
||||
file_info->largest_key = key.user_key.ToString();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& file_path_list,
|
||||
bool move_file, bool skip_snapshot_check) {
|
||||
Status status;
|
||||
auto num_files = file_path_list.size();
|
||||
if (num_files == 0) {
|
||||
return Status::InvalidArgument("The list of files is empty");
|
||||
}
|
||||
|
||||
std::vector<ExternalSstFileInfo> file_info_list(num_files);
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
status = ReadExternalSstFileInfo(column_family, file_path_list[i],
|
||||
&file_info_list[i]);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
}
|
||||
return AddFile(column_family, file_info_list, move_file, skip_snapshot_check);
|
||||
}
|
||||
|
||||
Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file, bool skip_snapshot_check) {
|
||||
Status status;
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
||||
ColumnFamilyData* cfd = cfh->cfd();
|
||||
const Comparator* user_cmp = cfd->internal_comparator().user_comparator();
|
||||
|
||||
auto num_files = file_info_list.size();
|
||||
if (num_files == 0) {
|
||||
return Status::InvalidArgument("The list of files is empty");
|
||||
}
|
||||
|
||||
// Verify that passed files dont have overlapping ranges
|
||||
if (num_files > 1) {
|
||||
std::vector<const ExternalSstFileInfo*> sorted_file_info_list(num_files);
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
sorted_file_info_list[i] = &file_info_list[i];
|
||||
}
|
||||
|
||||
std::sort(sorted_file_info_list.begin(), sorted_file_info_list.end(),
|
||||
[&user_cmp, &file_info_list](const ExternalSstFileInfo* info1,
|
||||
const ExternalSstFileInfo* info2) {
|
||||
return user_cmp->Compare(info1->smallest_key,
|
||||
info2->smallest_key) < 0;
|
||||
});
|
||||
|
||||
for (size_t i = 0; i < num_files - 1; i++) {
|
||||
if (user_cmp->Compare(sorted_file_info_list[i]->largest_key,
|
||||
sorted_file_info_list[i + 1]->smallest_key) >= 0) {
|
||||
return Status::NotSupported("Files have overlapping ranges");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<uint64_t> micro_list(num_files, 0);
|
||||
std::vector<FileMetaData> meta_list(num_files);
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
StopWatch sw(env_, nullptr, 0, µ_list[i], false);
|
||||
if (file_info_list[i].num_entries == 0) {
|
||||
return Status::InvalidArgument("File contain no entries");
|
||||
}
|
||||
|
||||
if (file_info_list[i].version == 2) {
|
||||
// version 2 imply that file have only Put Operations
|
||||
// with global Sequence Number
|
||||
|
||||
// TODO(tec): Implement changing file global sequence number
|
||||
} else if (file_info_list[i].version == 1) {
|
||||
// version 1 imply that file have only Put Operations
|
||||
// with Sequence Number = 0
|
||||
} else {
|
||||
// Unknown version !
|
||||
return Status::InvalidArgument(
|
||||
"Generated table version is not supported");
|
||||
}
|
||||
|
||||
meta_list[i].smallest =
|
||||
InternalKey(file_info_list[i].smallest_key,
|
||||
file_info_list[i].sequence_number, ValueType::kTypeValue);
|
||||
meta_list[i].largest =
|
||||
InternalKey(file_info_list[i].largest_key,
|
||||
file_info_list[i].sequence_number, ValueType::kTypeValue);
|
||||
if (!meta_list[i].smallest.Valid() || !meta_list[i].largest.Valid()) {
|
||||
return Status::Corruption("Generated table have corrupted keys");
|
||||
}
|
||||
meta_list[i].smallest_seqno = file_info_list[i].sequence_number;
|
||||
meta_list[i].largest_seqno = file_info_list[i].sequence_number;
|
||||
if (meta_list[i].smallest_seqno != 0 || meta_list[i].largest_seqno != 0) {
|
||||
return Status::InvalidArgument(
|
||||
"Non zero sequence numbers are not supported");
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::list<uint64_t>::iterator> pending_outputs_inserted_elem_list(
|
||||
num_files);
|
||||
// Generate locations for the new tables
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
StopWatch sw(env_, nullptr, 0, µ_list[i], false);
|
||||
pending_outputs_inserted_elem_list[i] =
|
||||
CaptureCurrentFileNumberInPendingOutputs();
|
||||
meta_list[i].fd = FileDescriptor(versions_->NewFileNumber(), 0,
|
||||
file_info_list[i].file_size);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy/Move external files into DB
|
||||
std::vector<std::string> db_fname_list(num_files);
|
||||
size_t j = 0;
|
||||
for (; j < num_files; j++) {
|
||||
StopWatch sw(env_, nullptr, 0, µ_list[j], false);
|
||||
db_fname_list[j] =
|
||||
TableFileName(immutable_db_options_.db_paths,
|
||||
meta_list[j].fd.GetNumber(), meta_list[j].fd.GetPathId());
|
||||
if (move_file) {
|
||||
status = env_->LinkFile(file_info_list[j].file_path, db_fname_list[j]);
|
||||
if (status.IsNotSupported()) {
|
||||
// Original file is on a different FS, use copy instead of hard linking
|
||||
status =
|
||||
CopyFile(env_, file_info_list[j].file_path, db_fname_list[j], 0);
|
||||
}
|
||||
} else {
|
||||
status = CopyFile(env_, file_info_list[j].file_path, db_fname_list[j], 0);
|
||||
}
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:FileCopied");
|
||||
if (!status.ok()) {
|
||||
for (size_t i = 0; i < j; i++) {
|
||||
Status s = env_->DeleteFile(db_fname_list[i]);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, immutable_db_options_.info_log,
|
||||
"AddFile() clean up for file %s failed : %s",
|
||||
db_fname_list[i].c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
|
||||
|
||||
const MutableCFOptions mutable_cf_options =
|
||||
*cfd->GetLatestMutableCFOptions();
|
||||
|
||||
WriteThread::Writer w;
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
|
||||
num_running_addfile_++;
|
||||
|
||||
if (!skip_snapshot_check && !snapshots_.empty()) {
|
||||
// Check that no snapshots are being held
|
||||
status =
|
||||
Status::NotSupported("Cannot add a file while holding snapshots");
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
// Verify that added file key range dont overlap with any keys in DB
|
||||
SuperVersion* sv = cfd->GetSuperVersion()->Ref();
|
||||
Arena arena;
|
||||
ReadOptions ro;
|
||||
ro.total_order_seek = true;
|
||||
ScopedArenaIterator iter(NewInternalIterator(ro, cfd, sv, &arena));
|
||||
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
StopWatch sw(env_, nullptr, 0, µ_list[i], false);
|
||||
InternalKey range_start(file_info_list[i].smallest_key,
|
||||
kMaxSequenceNumber, kValueTypeForSeek);
|
||||
iter->Seek(range_start.Encode());
|
||||
status = iter->status();
|
||||
|
||||
if (status.ok() && iter->Valid()) {
|
||||
ParsedInternalKey seek_result;
|
||||
if (ParseInternalKey(iter->key(), &seek_result)) {
|
||||
if (user_cmp->Compare(seek_result.user_key,
|
||||
file_info_list[i].largest_key) <= 0) {
|
||||
status = Status::NotSupported("Cannot add overlapping range");
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
status = Status::Corruption("DB have corrupted keys");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The levels that the files will be ingested into
|
||||
std::vector<int> target_level_list(num_files, 0);
|
||||
if (status.ok()) {
|
||||
VersionEdit edit;
|
||||
edit.SetColumnFamily(cfd->GetID());
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
StopWatch sw(env_, nullptr, 0, µ_list[i], false);
|
||||
// Add file to the lowest possible level
|
||||
target_level_list[i] = PickLevelForIngestedFile(cfd, file_info_list[i]);
|
||||
edit.AddFile(target_level_list[i], meta_list[i].fd.GetNumber(),
|
||||
meta_list[i].fd.GetPathId(), meta_list[i].fd.GetFileSize(),
|
||||
meta_list[i].smallest, meta_list[i].largest,
|
||||
meta_list[i].smallest_seqno, meta_list[i].largest_seqno,
|
||||
meta_list[i].marked_for_compaction);
|
||||
}
|
||||
status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
|
||||
directories_.GetDbDir());
|
||||
}
|
||||
write_thread_.ExitUnbatched(&w);
|
||||
|
||||
if (status.ok()) {
|
||||
delete InstallSuperVersionAndScheduleWork(cfd, nullptr,
|
||||
mutable_cf_options);
|
||||
|
||||
// Update internal stats for new ingested files
|
||||
uint64_t total_keys = 0;
|
||||
uint64_t total_l0_files = 0;
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
InternalStats::CompactionStats stats(1);
|
||||
stats.micros = micro_list[i];
|
||||
stats.bytes_written = meta_list[i].fd.GetFileSize();
|
||||
stats.num_output_files = 1;
|
||||
cfd->internal_stats()->AddCompactionStats(target_level_list[i], stats);
|
||||
cfd->internal_stats()->AddCFStats(
|
||||
InternalStats::BYTES_INGESTED_ADD_FILE,
|
||||
meta_list[i].fd.GetFileSize());
|
||||
total_keys += file_info_list[i].num_entries;
|
||||
if (target_level_list[i] == 0) {
|
||||
total_l0_files += 1;
|
||||
}
|
||||
Log(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log,
|
||||
"[AddFile] External SST file %s was ingested in L%d with path %s\n",
|
||||
file_info_list[i].file_path.c_str(), target_level_list[i],
|
||||
db_fname_list[i].c_str());
|
||||
}
|
||||
cfd->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
|
||||
total_keys);
|
||||
cfd->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
|
||||
num_files);
|
||||
cfd->internal_stats()->AddCFStats(
|
||||
InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
ReleaseFileNumberFromPendingOutputs(
|
||||
pending_outputs_inserted_elem_list[i]);
|
||||
}
|
||||
|
||||
num_running_addfile_--;
|
||||
if (num_running_addfile_ == 0) {
|
||||
bg_cv_.SignalAll();
|
||||
}
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
|
||||
} // mutex_ is unlocked here;
|
||||
|
||||
if (!status.ok()) {
|
||||
// We failed to add the files to the database
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
Status s = env_->DeleteFile(db_fname_list[i]);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, immutable_db_options_.info_log,
|
||||
"AddFile() clean up for file %s failed : %s",
|
||||
db_fname_list[i].c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
} else if (status.ok() && move_file) {
|
||||
// The files were moved and added successfully, remove original file links
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
Status s = env_->DeleteFile(file_info_list[i].file_path);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, immutable_db_options_.info_log,
|
||||
"%s was added to DB successfully but failed to remove original "
|
||||
"file "
|
||||
"link : %s",
|
||||
file_info_list[i].file_path.c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
// Finds the lowest level in the DB that the ingested file can be added to
|
||||
int DBImpl::PickLevelForIngestedFile(ColumnFamilyData* cfd,
|
||||
const ExternalSstFileInfo& file_info) {
|
||||
mutex_.AssertHeld();
|
||||
|
||||
int target_level = 0;
|
||||
auto* vstorage = cfd->current()->storage_info();
|
||||
Slice file_smallest_user_key(file_info.smallest_key);
|
||||
Slice file_largest_user_key(file_info.largest_key);
|
||||
|
||||
for (int lvl = cfd->NumberLevels() - 1; lvl >= vstorage->base_level();
|
||||
lvl--) {
|
||||
// Make sure that the file fits in Level `lvl` and dont overlap with
|
||||
// the output of any compaction running right now.
|
||||
if (vstorage->OverlapInLevel(lvl, &file_smallest_user_key,
|
||||
&file_largest_user_key) == false &&
|
||||
cfd->RangeOverlapWithCompaction(file_smallest_user_key,
|
||||
file_largest_user_key, lvl) == false) {
|
||||
// Level lvl is the lowest level that dont have any files with key
|
||||
// range overlapping with our file key range and no compactions
|
||||
// planning to add overlapping files in it.
|
||||
target_level = lvl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return target_level;
|
||||
}
|
||||
|
||||
void DBImpl::WaitForAddFile() {
|
||||
mutex_.AssertHeld();
|
||||
while (num_running_addfile_ > 0) {
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
}
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace rocksdb
|
@@ -558,7 +558,7 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) {
  ASSERT_TRUE(dbfull()->GetProperty(
      handles_[1], "rocksdb.cur-size-active-mem-table", &num));
  // "384" is the size of the metadata of two empty skiplists, this would
  // break if we change the default vectorrep/skiplist implementation
  // break if we change the default skiplist implementation
  ASSERT_EQ(num, "384");
|
||||
|
||||
uint64_t int_num;
|
||||
|
@ -699,6 +699,58 @@ TEST_F(DBTestTailingIterator, ForwardIteratorVersionProperty) {
|
||||
}
|
||||
ASSERT_EQ(v3, v4);
|
||||
}
|
||||
|
||||
TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) {
|
||||
ReadOptions read_options;
|
||||
read_options.tailing = true;
|
||||
const Slice upper_bound("cc", 3);
|
||||
read_options.iterate_upper_bound = &upper_bound;
|
||||
|
||||
|
||||
// 1st L0 file
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
// 2nd L0 file
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
|
||||
|
||||
iter->Seek("aa");
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_EQ(iter->key().ToString(), "aa");
|
||||
}
|
||||
|
||||
TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
|
||||
ReadOptions read_options;
|
||||
read_options.tailing = true;
|
||||
const Slice upper_bound("cc", 3);
|
||||
read_options.iterate_upper_bound = &upper_bound;
|
||||
|
||||
|
||||
// 1st L0 file
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
// 2nd L0 file
|
||||
ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
|
||||
|
||||
iter->SeekToFirst();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_EQ(iter->key().ToString(), "aa");
|
||||
|
||||
iter->Next();
|
||||
ASSERT_FALSE(iter->Valid());
|
||||
|
||||
iter->SeekToFirst();
|
||||
ASSERT_TRUE(iter->Valid());
|
||||
ASSERT_EQ(iter->key().ToString(), "aa");
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // !defined(ROCKSDB_LITE)
|
||||
|
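The two regression tests above pin down tailing iterators combined with iterate_upper_bound. In application code the combination looks roughly like this (a sketch; note that the bound Slice must stay alive for as long as the iterator does):

```cpp
#include <memory>

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>

void TailBelowBound(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.tailing = true;                      // iterator keeps seeing new writes
  rocksdb::Slice upper_bound("cc");       // exclusive bound; must outlive the
  ro.iterate_upper_bound = &upper_bound;  // iterator created below

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek("aa"); it->Valid(); it->Next()) {
    // every key observed here compares less than "cc"
  }
}
```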
@@ -2646,15 +2646,11 @@ class ModelDB : public DB {
  }

#ifndef ROCKSDB_LITE
  using DB::AddFile;
  virtual Status AddFile(ColumnFamilyHandle* column_family,
                         const std::vector<ExternalSstFileInfo>& file_info_list,
                         bool move_file, bool skip_snapshot_check) override {
    return Status::NotSupported("Not implemented.");
  }
  virtual Status AddFile(ColumnFamilyHandle* column_family,
                         const std::vector<std::string>& file_path_list,
                         bool move_file, bool skip_snapshot_check) override {
  using DB::IngestExternalFile;
  virtual Status IngestExternalFile(
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& external_files,
      const IngestExternalFileOptions& options) override {
    return Status::NotSupported("Not implemented.");
  }
|
||||
|
||||
|
@ -145,22 +145,6 @@ TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
|
||||
value = Get(1, "a");
|
||||
}
|
||||
|
||||
TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
|
||||
Options options = CurrentOptions();
|
||||
options.create_if_missing = true;
|
||||
options.statistics = rocksdb::CreateDBStatistics();
|
||||
options.max_successive_merges = 3;
|
||||
options.merge_operator = MergeOperators::CreatePutOperator();
|
||||
options.disable_auto_compactions = true;
|
||||
DestroyAndReopen(options);
|
||||
Put("poi", "Finch");
|
||||
db_->Merge(WriteOptions(), "poi", "Reese");
|
||||
db_->Merge(WriteOptions(), "poi", "Shaw");
|
||||
db_->Merge(WriteOptions(), "poi", "Root");
|
||||
options.max_successive_merges = 2;
|
||||
Reopen(options);
|
||||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
class DBTestSharedWriteBufferAcrossCFs
|
||||
: public DBTestBase,
|
||||
@ -1905,6 +1889,23 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) {
|
||||
}
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
TEST_F(DBTest2, MaxSuccessiveMergesInRecovery) {
|
||||
Options options;
|
||||
options = CurrentOptions(options);
|
||||
options.merge_operator = MergeOperators::CreatePutOperator();
|
||||
DestroyAndReopen(options);
|
||||
|
||||
db_->Put(WriteOptions(), "foo", "bar");
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar"));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar"));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar"));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar"));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), "foo", "bar"));
|
||||
|
||||
options.max_successive_merges = 3;
|
||||
Reopen(options);
|
||||
}
|
||||
|
||||
size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
|
||||
std::string buffer;
|
||||
|
||||
|
@ -1077,7 +1077,7 @@ std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
|
||||
}
|
||||
|
||||
void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
|
||||
size_t* total_reads_res) {
|
||||
size_t* total_reads_res, bool tailing_iter) {
|
||||
size_t total_reads = 0;
|
||||
|
||||
for (auto& kv : true_data) {
|
||||
@ -1126,6 +1126,7 @@ void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
|
||||
delete iter;
|
||||
}
|
||||
|
||||
if (tailing_iter) {
|
||||
#ifndef ROCKSDB_LITE
|
||||
// Tailing iterator
|
||||
int iter_cnt = 0;
|
||||
@ -1156,6 +1157,7 @@ void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
|
||||
|
||||
delete iter;
|
||||
#endif // ROCKSDB_LITE
|
||||
}
|
||||
|
||||
if (total_reads_res) {
|
||||
*total_reads_res = total_reads;
|
||||
|
@ -822,7 +822,8 @@ class DBTestBase : public testing::Test {
|
||||
std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
|
||||
|
||||
void VerifyDBFromMap(std::map<std::string, std::string> true_data,
|
||||
size_t* total_reads_res = nullptr);
|
||||
size_t* total_reads_res = nullptr,
|
||||
bool tailing_iter = false);
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
|
||||
|
db/external_sst_file_ingestion_job.cc (new file, 510 lines)

@@ -0,0 +1,510 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "db/external_sst_file_ingestion_job.h"
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "db/version_edit.h"
|
||||
#include "table/merger.h"
|
||||
#include "table/scoped_arena_iterator.h"
|
||||
#include "table/sst_file_writer_collectors.h"
|
||||
#include "table/table_builder.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "util/file_util.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/sync_point.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
Status ExternalSstFileIngestionJob::Prepare(
|
||||
const std::vector<std::string>& external_files_paths) {
|
||||
Status status;
|
||||
|
||||
// Read the information of files we are ingesting
|
||||
for (const std::string& file_path : external_files_paths) {
|
||||
IngestedFileInfo file_to_ingest;
|
||||
status = GetIngestedFileInfo(file_path, &file_to_ingest);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
files_to_ingest_.push_back(file_to_ingest);
|
||||
}
|
||||
|
||||
for (const IngestedFileInfo& f : files_to_ingest_) {
|
||||
if (f.cf_id !=
|
||||
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
|
||||
f.cf_id != cfd_->GetID()) {
|
||||
return Status::InvalidArgument(
|
||||
"External file column family id dont match");
|
||||
}
|
||||
}
|
||||
|
||||
const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
|
||||
auto num_files = files_to_ingest_.size();
|
||||
if (num_files == 0) {
|
||||
return Status::InvalidArgument("The list of files is empty");
|
||||
} else if (num_files > 1) {
|
||||
// Verify that passed files don't have overlapping ranges
|
||||
autovector<const IngestedFileInfo*> sorted_files;
|
||||
for (size_t i = 0; i < num_files; i++) {
|
||||
sorted_files.push_back(&files_to_ingest_[i]);
|
||||
}
|
||||
|
||||
std::sort(
|
||||
sorted_files.begin(), sorted_files.end(),
|
||||
[&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
|
||||
return ucmp->Compare(info1->smallest_user_key,
|
||||
info2->smallest_user_key) < 0;
|
||||
});
|
||||
|
||||
for (size_t i = 0; i < num_files - 1; i++) {
|
||||
if (ucmp->Compare(sorted_files[i]->largest_user_key,
|
||||
sorted_files[i + 1]->smallest_user_key) >= 0) {
|
||||
return Status::NotSupported("Files have overlapping ranges");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
if (f.num_entries == 0) {
|
||||
return Status::InvalidArgument("File contain no entries");
|
||||
}
|
||||
|
||||
if (!f.smallest_internal_key().Valid() ||
|
||||
!f.largest_internal_key().Valid()) {
|
||||
return Status::Corruption("Generated table have corrupted keys");
|
||||
}
|
||||
}
|
||||
|
||||
// Copy/Move external files into DB
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
f.fd = FileDescriptor(versions_->NewFileNumber(), 0, f.file_size);
|
||||
|
||||
const std::string path_outside_db = f.external_file_path;
|
||||
const std::string path_inside_db =
|
||||
TableFileName(db_options_.db_paths, f.fd.GetNumber(), f.fd.GetPathId());
|
||||
|
||||
if (ingestion_options_.move_files) {
|
||||
status = env_->LinkFile(path_outside_db, path_inside_db);
|
||||
if (status.IsNotSupported()) {
|
||||
// Original file is on a different FS, use copy instead of hard linking
|
||||
status = CopyFile(env_, path_outside_db, path_inside_db, 0);
|
||||
}
|
||||
} else {
|
||||
status = CopyFile(env_, path_outside_db, path_inside_db, 0);
|
||||
}
|
||||
TEST_SYNC_POINT("DBImpl::AddFile:FileCopied");
|
||||
if (!status.ok()) {
|
||||
break;
|
||||
}
|
||||
f.internal_file_path = path_inside_db;
|
||||
}
|
||||
|
||||
if (!status.ok()) {
|
||||
// We failed, remove all files that we copied into the db
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
if (f.internal_file_path == "") {
|
||||
break;
|
||||
}
|
||||
Status s = env_->DeleteFile(f.internal_file_path);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
||||
"AddFile() clean up for file %s failed : %s",
|
||||
f.internal_file_path.c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed) {
|
||||
SuperVersion* super_version = cfd_->GetSuperVersion();
|
||||
Status status =
|
||||
IngestedFilesOverlapWithMemtables(super_version, flush_needed);
|
||||
|
||||
if (status.ok() && *flush_needed &&
|
||||
!ingestion_options_.allow_blocking_flush) {
|
||||
status = Status::InvalidArgument("External file requires flush");
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::Run() {
|
||||
Status status;
|
||||
#ifndef NDEBUG
|
||||
// We should never run the job with a memtable that is overlapping
|
||||
// with the files we are ingesting
|
||||
bool need_flush = false;
|
||||
status = NeedsFlush(&need_flush);
|
||||
assert(status.ok() && need_flush == false);
|
||||
#endif
|
||||
|
||||
bool consumed_seqno = false;
|
||||
bool force_global_seqno = false;
|
||||
const SequenceNumber last_seqno = versions_->LastSequence();
|
||||
if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
|
||||
// We need to assign a global sequence number to all the files even
// if they don't overlap with any ranges since we have snapshots
|
||||
force_global_seqno = true;
|
||||
}
|
||||
|
||||
SuperVersion* super_version = cfd_->GetSuperVersion();
|
||||
edit_.SetColumnFamily(cfd_->GetID());
|
||||
// The levels that the files will be ingested into
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
bool overlap_with_db = false;
|
||||
status = AssignLevelForIngestedFile(super_version, &f, &overlap_with_db);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (overlap_with_db || force_global_seqno) {
|
||||
status = AssignGlobalSeqnoForIngestedFile(&f, last_seqno + 1);
|
||||
consumed_seqno = true;
|
||||
} else {
|
||||
status = AssignGlobalSeqnoForIngestedFile(&f, 0);
|
||||
}
|
||||
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(),
|
||||
f.fd.GetFileSize(), f.smallest_internal_key(),
|
||||
f.largest_internal_key(), f.assigned_seqno, f.assigned_seqno,
|
||||
false);
|
||||
}
|
||||
|
||||
if (consumed_seqno) {
|
||||
versions_->SetLastSequence(last_seqno + 1);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void ExternalSstFileIngestionJob::UpdateStats() {
|
||||
// Update internal stats for new ingested files
|
||||
uint64_t total_keys = 0;
|
||||
uint64_t total_l0_files = 0;
|
||||
uint64_t total_time = env_->NowMicros() - job_start_time_;
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
InternalStats::CompactionStats stats(1);
|
||||
stats.micros = total_time;
|
||||
stats.bytes_written = f.fd.GetFileSize();
|
||||
stats.num_output_files = 1;
|
||||
cfd_->internal_stats()->AddCompactionStats(f.picked_level, stats);
|
||||
cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
|
||||
f.fd.GetFileSize());
|
||||
total_keys += f.num_entries;
|
||||
if (f.picked_level == 0) {
|
||||
total_l0_files += 1;
|
||||
}
|
||||
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
||||
"[AddFile] External SST file %s was ingested in L%d with path %s "
|
||||
"(global_seqno=%" PRIu64 ")\n",
|
||||
f.external_file_path.c_str(), f.picked_level,
|
||||
f.internal_file_path.c_str(), f.assigned_seqno);
|
||||
}
|
||||
cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
|
||||
total_keys);
|
||||
cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
|
||||
files_to_ingest_.size());
|
||||
cfd_->internal_stats()->AddCFStats(
|
||||
InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
|
||||
}
|
||||
|
||||
void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
|
||||
if (!status.ok()) {
|
||||
// We failed to add the files to the database
|
||||
// remove all the files we copied
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
Status s = env_->DeleteFile(f.internal_file_path);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
||||
"AddFile() clean up for file %s failed : %s",
|
||||
f.internal_file_path.c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
} else if (status.ok() && ingestion_options_.move_files) {
|
||||
// The files were moved and added successfully, remove original file links
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
Status s = env_->DeleteFile(f.external_file_path);
|
||||
if (!s.ok()) {
|
||||
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
||||
"%s was added to DB successfully but failed to remove original "
|
||||
"file link : %s",
|
||||
f.external_file_path.c_str(), s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
|
||||
const std::string& external_file, IngestedFileInfo* file_to_ingest) {
|
||||
file_to_ingest->external_file_path = external_file;
|
||||
|
||||
// Get external file size
|
||||
Status status = env_->GetFileSize(external_file, &file_to_ingest->file_size);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Create TableReader for external file
|
||||
std::unique_ptr<TableReader> table_reader;
|
||||
std::unique_ptr<RandomAccessFile> sst_file;
|
||||
std::unique_ptr<RandomAccessFileReader> sst_file_reader;
|
||||
|
||||
status = env_->NewRandomAccessFile(external_file, &sst_file, env_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file)));
|
||||
|
||||
status = cfd_->ioptions()->table_factory->NewTableReader(
|
||||
TableReaderOptions(*cfd_->ioptions(), env_options_,
|
||||
cfd_->internal_comparator()),
|
||||
std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Get the external file properties
|
||||
auto props = table_reader->GetTableProperties();
|
||||
const auto& uprops = props->user_collected_properties;
|
||||
|
||||
// Get table version
|
||||
auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
|
||||
if (version_iter == uprops.end()) {
|
||||
return Status::Corruption("External file version not found");
|
||||
}
|
||||
file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
|
||||
|
||||
auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
|
||||
if (file_to_ingest->version == 2) {
|
||||
// Version 2 implies that the file has a global sequence number
|
||||
if (seqno_iter == uprops.end()) {
|
||||
return Status::Corruption(
|
||||
"External file global sequence number not found");
|
||||
}
|
||||
|
||||
// Set the global sequence number
|
||||
file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
|
||||
file_to_ingest->global_seqno_offset = props->properties_offsets.at(
|
||||
ExternalSstFilePropertyNames::kGlobalSeqno);
|
||||
|
||||
if (file_to_ingest->global_seqno_offset == 0) {
|
||||
return Status::Corruption("Was not able to find file global seqno field");
|
||||
}
|
||||
} else {
|
||||
return Status::InvalidArgument("external file version is not supported");
|
||||
}
|
||||
// Get number of entries in table
|
||||
file_to_ingest->num_entries = props->num_entries;
|
||||
|
||||
ParsedInternalKey key;
|
||||
ReadOptions ro;
|
||||
// While reading the external file we can cache the blocks we read into the
// block cache. If we later change the global seqno of this file, we would
// have blocks in the cache that include keys with the wrong seqno.
// We need to disable fill_cache so that we read from the file without
// updating the block cache.
|
||||
ro.fill_cache = false;
|
||||
std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(ro));
|
||||
|
||||
// Get first (smallest) key from file
|
||||
iter->SeekToFirst();
|
||||
if (!ParseInternalKey(iter->key(), &key)) {
|
||||
return Status::Corruption("external file have corrupted keys");
|
||||
}
|
||||
if (key.sequence != 0) {
|
||||
return Status::Corruption("external file have non zero sequence number");
|
||||
}
|
||||
file_to_ingest->smallest_user_key = key.user_key.ToString();
|
||||
|
||||
// Get last (largest) key from file
|
||||
iter->SeekToLast();
|
||||
if (!ParseInternalKey(iter->key(), &key)) {
|
||||
return Status::Corruption("external file have corrupted keys");
|
||||
}
|
||||
if (key.sequence != 0) {
|
||||
return Status::Corruption("external file have non zero sequence number");
|
||||
}
|
||||
file_to_ingest->largest_user_key = key.user_key.ToString();
|
||||
|
||||
file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
|
||||
|
||||
file_to_ingest->table_properties = *props;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables(
|
||||
SuperVersion* sv, bool* overlap) {
|
||||
// Create an InternalIterator over all memtables
|
||||
Arena arena;
|
||||
ReadOptions ro;
|
||||
ro.total_order_seek = true;
|
||||
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), &arena);
|
||||
merge_iter_builder.AddIterator(sv->mem->NewIterator(ro, &arena));
|
||||
sv->imm->AddIterators(ro, &merge_iter_builder);
|
||||
ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
|
||||
|
||||
Status status;
|
||||
*overlap = false;
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
status =
|
||||
IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), overlap);
|
||||
if (!status.ok() || *overlap == true) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::AssignLevelForIngestedFile(
|
||||
SuperVersion* sv, IngestedFileInfo* file_to_ingest, bool* overlap_with_db) {
|
||||
*overlap_with_db = false;
|
||||
|
||||
Arena arena;
|
||||
ReadOptions ro;
|
||||
ro.total_order_seek = true;
|
||||
|
||||
Status status;
|
||||
int target_level = 0;
|
||||
auto* vstorage = cfd_->current()->storage_info();
|
||||
for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
|
||||
if (lvl > 0 && lvl < vstorage->base_level()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (vstorage->NumLevelFiles(lvl) > 0) {
|
||||
bool overlap_with_level = false;
|
||||
MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(),
|
||||
&arena);
|
||||
sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder,
|
||||
lvl);
|
||||
ScopedArenaIterator level_iter(merge_iter_builder.Finish());
|
||||
|
||||
status = IngestedFileOverlapWithIteratorRange(
|
||||
file_to_ingest, level_iter.get(), &overlap_with_level);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (overlap_with_level) {
|
||||
// We must use L0 or any level higher than `lvl` to be able to overwrite
// the keys that we overlap with in this level. We also need to assign
// this file a seqno to overwrite the existing keys in level `lvl`.
|
||||
*overlap_with_db = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't overlap with any keys in this level, but we still need to check
// if our file can fit in it
|
||||
|
||||
if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
|
||||
target_level = lvl;
|
||||
}
|
||||
}
|
||||
file_to_ingest->picked_level = target_level;
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
|
||||
IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
|
||||
if (file_to_ingest->original_seqno == seqno) {
|
||||
// This file already has the correct global seqno
|
||||
return Status::OK();
|
||||
} else if (!ingestion_options_.allow_global_seqno) {
|
||||
return Status::InvalidArgument("Global seqno is required, but disabled");
|
||||
} else if (file_to_ingest->global_seqno_offset == 0) {
|
||||
return Status::InvalidArgument(
|
||||
"Trying to set global seqno for a file that dont have a global seqno "
|
||||
"field");
|
||||
}
|
||||
|
||||
std::unique_ptr<RandomRWFile> rwfile;
|
||||
Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path,
|
||||
&rwfile, env_options_);
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
}
|
||||
|
||||
// Write the new seqno in the global sequence number field in the file
|
||||
std::string seqno_val;
|
||||
PutFixed64(&seqno_val, seqno);
|
||||
status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val);
|
||||
if (status.ok()) {
|
||||
file_to_ingest->assigned_seqno = seqno;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
Status ExternalSstFileIngestionJob::IngestedFileOverlapWithIteratorRange(
|
||||
const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
|
||||
bool* overlap) {
|
||||
auto* vstorage = cfd_->current()->storage_info();
|
||||
auto* ucmp = vstorage->InternalComparator()->user_comparator();
|
||||
InternalKey range_start(file_to_ingest->smallest_user_key, kMaxSequenceNumber,
|
||||
kValueTypeForSeek);
|
||||
iter->Seek(range_start.Encode());
|
||||
if (!iter->status().ok()) {
|
||||
return iter->status();
|
||||
}
|
||||
|
||||
*overlap = false;
|
||||
if (iter->Valid()) {
|
||||
ParsedInternalKey seek_result;
|
||||
if (!ParseInternalKey(iter->key(), &seek_result)) {
|
||||
return Status::Corruption("DB have corrupted keys");
|
||||
}
|
||||
|
||||
if (ucmp->Compare(seek_result.user_key, file_to_ingest->largest_user_key) <=
|
||||
0) {
|
||||
*overlap = true;
|
||||
}
|
||||
}
|
||||
|
||||
return iter->status();
|
||||
}
|
||||
|
||||
bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
|
||||
const IngestedFileInfo* file_to_ingest, int level) {
|
||||
if (level == 0) {
|
||||
// Files can always fit in L0
|
||||
return true;
|
||||
}
|
||||
|
||||
auto* vstorage = cfd_->current()->storage_info();
|
||||
Slice file_smallest_user_key(file_to_ingest->smallest_user_key);
|
||||
Slice file_largest_user_key(file_to_ingest->largest_user_key);
|
||||
|
||||
if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
|
||||
&file_largest_user_key)) {
|
||||
// File overlaps with other files in this level, we cannot
// add it to this level
|
||||
return false;
|
||||
}
|
||||
if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
|
||||
file_largest_user_key, level)) {
|
||||
// File overlaps with a running compaction output that will be stored
// in this level, we cannot add this file to this level
|
||||
return false;
|
||||
}
|
||||
|
||||
// File did not overlap with level files or a running compaction output
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
db/external_sst_file_ingestion_job.h (new file, 151 lines)
@ -0,0 +1,151 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "db/column_family.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/internal_stats.h"
|
||||
#include "db/snapshot_impl.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/sst_file_writer.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/db_options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct IngestedFileInfo {
|
||||
// External file path
|
||||
std::string external_file_path;
|
||||
// Smallest user key in external file
|
||||
std::string smallest_user_key;
|
||||
// Largest user key in external file
|
||||
std::string largest_user_key;
|
||||
// Sequence number for keys in external file
|
||||
SequenceNumber original_seqno;
|
||||
// Offset of the global sequence number field in the file, will
|
||||
// be zero if version is 1 (global seqno is not supported)
|
||||
size_t global_seqno_offset;
|
||||
// External file size
|
||||
uint64_t file_size;
|
||||
// total number of keys in external file
|
||||
uint64_t num_entries;
|
||||
// Id of column family this file should be ingested into
|
||||
uint32_t cf_id;
|
||||
// TableProperties read from external file
|
||||
TableProperties table_properties;
|
||||
// Version of external file
|
||||
int version;
|
||||
|
||||
// FileDescriptor for the file inside the DB
|
||||
FileDescriptor fd;
|
||||
// file path that we picked for file inside the DB
|
||||
std::string internal_file_path = "";
|
||||
// Global sequence number that we picked for the file inside the DB
|
||||
SequenceNumber assigned_seqno = 0;
|
||||
// Level inside the DB we picked for the external file.
|
||||
int picked_level = 0;
|
||||
|
||||
InternalKey smallest_internal_key() const {
|
||||
return InternalKey(smallest_user_key, assigned_seqno,
|
||||
ValueType::kTypeValue);
|
||||
}
|
||||
|
||||
InternalKey largest_internal_key() const {
|
||||
return InternalKey(largest_user_key, assigned_seqno, ValueType::kTypeValue);
|
||||
}
|
||||
};
|
||||
|
||||
class ExternalSstFileIngestionJob {
|
||||
public:
|
||||
ExternalSstFileIngestionJob(
|
||||
Env* env, VersionSet* versions, ColumnFamilyData* cfd,
|
||||
const ImmutableDBOptions& db_options, const EnvOptions& env_options,
|
||||
SnapshotList* db_snapshots,
|
||||
const IngestExternalFileOptions& ingestion_options)
|
||||
: env_(env),
|
||||
versions_(versions),
|
||||
cfd_(cfd),
|
||||
db_options_(db_options),
|
||||
env_options_(env_options),
|
||||
db_snapshots_(db_snapshots),
|
||||
ingestion_options_(ingestion_options),
|
||||
job_start_time_(env_->NowMicros()) {}
|
||||
|
||||
// Prepare the job by copying external files into the DB.
|
||||
Status Prepare(const std::vector<std::string>& external_files_paths);
|
||||
|
||||
// Check if we need to flush the memtable before running the ingestion job.
// `flush_needed` is set to true if the files we are ingesting overlap with
// any key range in the memtable.
|
||||
// REQUIRES: Mutex held
|
||||
Status NeedsFlush(bool* flush_needed);
|
||||
|
||||
// Will execute the ingestion job and prepare edit() to be applied.
|
||||
// REQUIRES: Mutex held
|
||||
Status Run();
|
||||
|
||||
// Update column family stats.
|
||||
// REQUIRES: Mutex held
|
||||
void UpdateStats();
|
||||
|
||||
// Cleanup after successful/failed job
|
||||
void Cleanup(const Status& status);
|
||||
|
||||
VersionEdit* edit() { return &edit_; }
|
||||
|
||||
const autovector<IngestedFileInfo>& files_to_ingest() const {
|
||||
return files_to_ingest_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Open the external file and populate `file_to_ingest` with all the
|
||||
// external information we need to ingest this file.
|
||||
Status GetIngestedFileInfo(const std::string& external_file,
|
||||
IngestedFileInfo* file_to_ingest);
|
||||
|
||||
// Check if the files we are ingesting overlap with any memtable.
|
||||
// REQUIRES: Mutex held
|
||||
Status IngestedFilesOverlapWithMemtables(SuperVersion* sv, bool* overlap);
|
||||
|
||||
// Assign `file_to_ingest` the lowest possible level that it can
|
||||
// be ingested to.
|
||||
// REQUIRES: Mutex held
|
||||
Status AssignLevelForIngestedFile(SuperVersion* sv,
|
||||
IngestedFileInfo* file_to_ingest,
|
||||
bool* overlap_with_db);
|
||||
|
||||
// Set the file global sequence number to `seqno`
|
||||
Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
|
||||
SequenceNumber seqno);
|
||||
|
||||
// Check if the `file_to_ingest` key range overlaps with the range that `iter` represents
|
||||
// REQUIRES: Mutex held
|
||||
Status IngestedFileOverlapWithIteratorRange(
|
||||
const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
|
||||
bool* overlap);
|
||||
|
||||
// Check if `file_to_ingest` can fit in level `level`
|
||||
// REQUIRES: Mutex held
|
||||
bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
|
||||
int level);
|
||||
|
||||
Env* env_;
|
||||
VersionSet* versions_;
|
||||
ColumnFamilyData* cfd_;
|
||||
const ImmutableDBOptions& db_options_;
|
||||
const EnvOptions& env_options_;
|
||||
SnapshotList* db_snapshots_;
|
||||
autovector<IngestedFileInfo> files_to_ingest_;
|
||||
const IngestExternalFileOptions& ingestion_options_;
|
||||
VersionEdit edit_;
|
||||
uint64_t job_start_time_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
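
To make the intended call order concrete, here is a hedged sketch (not actual RocksDB code) of how a caller such as DBImpl::IngestExternalFile() might drive this job, based only on the interface above; the wrapper name and the locking comments are assumptions.

#include "db/external_sst_file_ingestion_job.h"

// Sketch only: drive the job per the REQUIRES notes on its methods.
rocksdb::Status RunIngestionJobSketch(rocksdb::ExternalSstFileIngestionJob* job,
                                      const std::vector<std::string>& files) {
  // 1) Copy or hard-link the external files into the DB directory.
  rocksdb::Status s = job->Prepare(files);
  if (!s.ok()) {
    return s;
  }
  // 2) Under the DB mutex, check whether the memtable must be flushed first.
  bool need_flush = false;
  s = job->NeedsFlush(&need_flush);
  // (a real caller flushes the memtable here when need_flush is true)
  // 3) Pick a level and a global seqno for each file; this fills job->edit().
  if (s.ok()) {
    s = job->Run();
  }
  // 4) The caller would then apply job->edit() through VersionSet::LogAndApply,
  //    update stats on success, and always let the job clean up after itself.
  if (s.ok()) {
    job->UpdateStats();
  }
  job->Cleanup(s);
  return s;
}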
|
File diff suppressed because it is too large
@ -228,6 +228,11 @@ class FaultInjectionTest : public testing::Test,
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
__attribute__((__no_sanitize__("undefined")))
|
||||
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
|
||||
__attribute__((__no_sanitize_undefined__))
|
||||
#endif
|
||||
// Return the ith key
|
||||
Slice Key(int i, std::string* storage) const {
|
||||
int num = i;
|
||||
|
@ -296,8 +296,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
|
||||
// an option to turn it off.
|
||||
if (seek_to_first || NeedToSeekImmutable(internal_key)) {
|
||||
immutable_status_ = Status::OK();
|
||||
if ((has_iter_trimmed_for_upper_bound_) &&
|
||||
(cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
if (has_iter_trimmed_for_upper_bound_ &&
|
||||
(
|
||||
// prev_ is not set yet
|
||||
is_prev_set_ == false ||
|
||||
// We are doing SeekToFirst() and internal_key.size() = 0
|
||||
seek_to_first ||
|
||||
// prev_key_ > internal_key
|
||||
cfd_->internal_comparator().InternalKeyComparator::Compare(
|
||||
prev_key_.GetKey(), internal_key) > 0)) {
|
||||
// Some iterators are trimmed. Need to rebuild.
|
||||
RebuildIterators(true);
|
||||
|
@ -68,7 +68,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
table_(ioptions.memtable_factory->CreateMemTableRep(
|
||||
comparator_, &allocator_, ioptions.prefix_extractor,
|
||||
ioptions.info_log)),
|
||||
range_del_table_(ioptions.memtable_factory->CreateMemTableRep(
|
||||
range_del_table_(SkipListFactory().CreateMemTableRep(
|
||||
comparator_, &allocator_, nullptr /* transform */,
|
||||
ioptions.info_log)),
|
||||
data_size_(0),
|
||||
|
@ -386,10 +386,6 @@ TEST_P(PlainTableDBTest, Flush) {
|
||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||
for (int store_index_in_file = 0; store_index_in_file <= 1;
|
||||
++store_index_in_file) {
|
||||
if (!bloom_bits && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Options options = CurrentOptions();
|
||||
options.create_if_missing = true;
|
||||
// Set only one bucket to force bucket conflict.
|
||||
|
@ -32,12 +32,12 @@
|
||||
namespace rocksdb {
|
||||
|
||||
bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) {
|
||||
if (a->smallest_seqno != b->smallest_seqno) {
|
||||
return a->smallest_seqno > b->smallest_seqno;
|
||||
}
|
||||
if (a->largest_seqno != b->largest_seqno) {
|
||||
return a->largest_seqno > b->largest_seqno;
|
||||
}
|
||||
if (a->smallest_seqno != b->smallest_seqno) {
|
||||
return a->smallest_seqno > b->smallest_seqno;
|
||||
}
|
||||
// Break ties by file number
|
||||
return a->fd.GetNumber() > b->fd.GetNumber();
|
||||
}
|
||||
@ -146,13 +146,22 @@ class VersionBuilder::Rep {
|
||||
abort();
|
||||
}
|
||||
|
||||
if (!(f1->largest_seqno > f2->largest_seqno ||
|
||||
// We can have multiple files with seqno = 0 as a result of
|
||||
// using DB::AddFile()
|
||||
(f1->largest_seqno == 0 && f2->largest_seqno == 0))) {
|
||||
fprintf(stderr,
|
||||
"L0 files seqno missmatch %" PRIu64 " vs. %" PRIu64 "\n",
|
||||
f1->largest_seqno, f2->largest_seqno);
|
||||
if (f2->smallest_seqno == f2->largest_seqno) {
|
||||
// This is an external file that we ingested
|
||||
SequenceNumber external_file_seqno = f2->smallest_seqno;
|
||||
if (!(external_file_seqno < f1->largest_seqno ||
|
||||
external_file_seqno == 0)) {
|
||||
fprintf(stderr, "L0 file with seqno %" PRIu64 " %" PRIu64
|
||||
" vs. file with global_seqno %" PRIu64 "\n",
|
||||
f1->smallest_seqno, f1->largest_seqno,
|
||||
external_file_seqno);
|
||||
abort();
|
||||
}
|
||||
} else if (f1->smallest_seqno <= f2->smallest_seqno) {
|
||||
fprintf(stderr, "L0 files seqno %" PRIu64 " %" PRIu64
|
||||
" vs. %" PRIu64 " %" PRIu64 "\n",
|
||||
f1->smallest_seqno, f1->largest_seqno, f2->smallest_seqno,
|
||||
f2->largest_seqno);
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
|
@ -808,13 +808,26 @@ void Version::AddIterators(const ReadOptions& read_options,
|
||||
MergeIteratorBuilder* merge_iter_builder) {
|
||||
assert(storage_info_.finalized_);
|
||||
|
||||
if (storage_info_.num_non_empty_levels() == 0) {
|
||||
// No file in the Version.
|
||||
for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
|
||||
AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level);
|
||||
}
|
||||
}
|
||||
|
||||
void Version::AddIteratorsForLevel(const ReadOptions& read_options,
|
||||
const EnvOptions& soptions,
|
||||
MergeIteratorBuilder* merge_iter_builder,
|
||||
int level) {
|
||||
assert(storage_info_.finalized_);
|
||||
if (level >= storage_info_.num_non_empty_levels()) {
|
||||
// This is an empty level
|
||||
return;
|
||||
} else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
|
||||
// No files in this level
|
||||
return;
|
||||
}
|
||||
|
||||
auto* arena = merge_iter_builder->GetArena();
|
||||
|
||||
if (level == 0) {
|
||||
// Merge all level zero files together since they may overlap
|
||||
for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
|
||||
const auto& file = storage_info_.LevelFilesBrief(0).files[i];
|
||||
@ -823,12 +836,10 @@ void Version::AddIterators(const ReadOptions& read_options,
|
||||
cfd_->internal_stats()->GetFileReadHist(0), false, arena,
|
||||
false /* skip_filters */, 0 /* level */));
|
||||
}
|
||||
|
||||
} else {
|
||||
// For levels > 0, we can use a concatenating iterator that sequentially
|
||||
// walks through the non-overlapping files in the level, opening them
|
||||
// lazily.
|
||||
for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) {
|
||||
if (storage_info_.LevelFilesBrief(level).num_files != 0) {
|
||||
auto* mem = arena->AllocateAligned(sizeof(LevelFileIteratorState));
|
||||
auto* state = new (mem) LevelFileIteratorState(
|
||||
cfd_->table_cache(), read_options, soptions,
|
||||
@ -843,7 +854,6 @@ void Version::AddIterators(const ReadOptions& read_options,
|
||||
merge_iter_builder->AddIterator(
|
||||
NewTwoLevelIterator(state, first_level_iter, arena, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
VersionStorageInfo::VersionStorageInfo(
|
||||
|
@ -435,6 +435,10 @@ class Version {
|
||||
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
|
||||
MergeIteratorBuilder* merger_iter_builder);
|
||||
|
||||
void AddIteratorsForLevel(const ReadOptions&, const EnvOptions& soptions,
|
||||
MergeIteratorBuilder* merger_iter_builder,
|
||||
int level);
|
||||
|
||||
// Lookup the value for key. If found, store it in *val and
|
||||
// return OK. Else return a non-OK status.
|
||||
// Uses *operands to store merge_operator operations to apply later.
|
||||
|
@ -885,13 +885,10 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
std::string merged_value;
|
||||
|
||||
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
||||
Status s = Status::NotSupported();
|
||||
if (db_ != nullptr && recovering_log_number_ != 0) {
|
||||
if (cf_handle == nullptr) {
|
||||
cf_handle = db_->DefaultColumnFamily();
|
||||
}
|
||||
s = db_->Get(ropts, cf_handle, key, &prev_value);
|
||||
}
|
||||
Status s = db_->Get(ropts, cf_handle, key, &prev_value);
|
||||
|
||||
char* prev_buffer = const_cast<char*>(prev_value.c_str());
|
||||
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
||||
@ -995,12 +992,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
auto* moptions = mem->GetMemTableOptions();
|
||||
bool perform_merge = false;
|
||||
|
||||
// If we pass DB through and options.max_successive_merges is hit
|
||||
// during recovery, Get() will be issued which will try to acquire
|
||||
// DB mutex and cause deadlock, as DB mutex is already held.
|
||||
// So we disable merge in recovery
|
||||
if (moptions->max_successive_merges > 0 && db_ != nullptr &&
|
||||
recovering_log_number_ == 0) {
|
||||
if (moptions->max_successive_merges > 0 && db_ != nullptr) {
|
||||
LookupKey lkey(key, sequence_);
|
||||
|
||||
// Count the number of successive merges at the head
|
||||
|
@ -31,6 +31,11 @@
|
||||
#undef DeleteFile
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
|
||||
#endif
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -589,28 +594,19 @@ class DB {
|
||||
return CompactRange(options, DefaultColumnFamily(), begin, end);
|
||||
}
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
CompactRange(ColumnFamilyHandle* column_family, const Slice* begin,
|
||||
const Slice* end, bool change_level = false,
|
||||
int target_level = -1, uint32_t target_path_id = 0) {
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
|
||||
ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end,
|
||||
bool change_level = false, int target_level = -1,
|
||||
uint32_t target_path_id = 0) {
|
||||
CompactRangeOptions options;
|
||||
options.change_level = change_level;
|
||||
options.target_level = target_level;
|
||||
options.target_path_id = target_path_id;
|
||||
return CompactRange(options, column_family, begin, end);
|
||||
}
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
CompactRange(const Slice* begin, const Slice* end, bool change_level = false,
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status CompactRange(
|
||||
const Slice* begin, const Slice* end, bool change_level = false,
|
||||
int target_level = -1, uint32_t target_path_id = 0) {
|
||||
CompactRangeOptions options;
|
||||
options.change_level = change_level;
|
||||
@ -803,79 +799,126 @@ class DB {
|
||||
GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
|
||||
}
|
||||
|
||||
// Batch load table files whose paths stored in "file_path_list" into
|
||||
// "column_family", a vector of ExternalSstFileInfo can be used
|
||||
// instead of "file_path_list" to do a blind batch add that wont
|
||||
// need to read the file, move_file can be set to true to
|
||||
// move the files instead of copying them, skip_snapshot_check can be set to
|
||||
// true to ignore the snapshot, make sure that you know that when you use it,
|
||||
// snapshots see the data that is added in the new files.
|
||||
// IngestExternalFile() will load a list of external SST files (1) into the DB.
// We will try to find the lowest possible level that the file can fit in, and
// ingest the file into this level (2). A file that has a key range
// overlapping with the memtable key range will require us to Flush the
// memtable first before ingesting the file.
//
// Current Requirements:
// (1) The key ranges of the files don't overlap with each other
// (2) The key range of any file in the list doesn't overlap with
//     existing keys or tombstones in the DB.
// (3) No snapshots are held (check skip_snapshot_check to skip this check).
//
// Notes: We will try to ingest the files to the lowest possible level.
// (1) External SST files can be created using SstFileWriter
// (2) We will try to ingest the files to the lowest possible level
//     even if the file compression doesn't match the level compression
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& file_path_list,
|
||||
bool move_file = false, bool skip_snapshot_check = false) = 0;
|
||||
virtual Status AddFile(const std::vector<std::string>& file_path_list,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(DefaultColumnFamily(), file_path_list, move_file, skip_snapshot_check);
|
||||
virtual Status IngestExternalFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& external_files,
|
||||
const IngestExternalFileOptions& options) = 0;
|
||||
|
||||
virtual Status IngestExternalFile(
|
||||
const std::vector<std::string>& external_files,
|
||||
const IngestExternalFileOptions& options) {
|
||||
return IngestExternalFile(DefaultColumnFamily(), external_files, options);
|
||||
}
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
AddFile(ColumnFamilyHandle* column_family, const std::string& file_path,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(column_family, std::vector<std::string>(1, file_path),
|
||||
move_file, skip_snapshot_check);
|
||||
|
||||
// AddFile() is deprecated, please use IngestExternalFile()
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& file_path_list, bool move_file = false,
|
||||
bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(column_family, file_path_list, ifo);
|
||||
}
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
AddFile(const std::string& file_path, bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(DefaultColumnFamily(),
|
||||
std::vector<std::string>(1, file_path), move_file, skip_snapshot_check);
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
const std::vector<std::string>& file_path_list, bool move_file = false,
|
||||
bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(DefaultColumnFamily(), file_path_list, ifo);
|
||||
}
|
||||
|
||||
// AddFile() is deprecated, please use IngestExternalFile()
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
ColumnFamilyHandle* column_family, const std::string& file_path,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(column_family, {file_path}, ifo);
|
||||
}
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
const std::string& file_path, bool move_file = false,
|
||||
bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(DefaultColumnFamily(), {file_path}, ifo);
|
||||
}
|
||||
|
||||
// Load table file with information "file_info" into "column_family"
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file = false, bool skip_snapshot_check = false) = 0;
|
||||
virtual Status AddFile(const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(DefaultColumnFamily(), file_info_list, move_file, skip_snapshot_check);
|
||||
std::vector<std::string> external_files;
|
||||
for (const ExternalSstFileInfo& file_info : file_info_list) {
|
||||
external_files.push_back(file_info.file_path);
|
||||
}
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
AddFile(ColumnFamilyHandle* column_family,
|
||||
const ExternalSstFileInfo* file_info, bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(column_family,
|
||||
std::vector<ExternalSstFileInfo>(1, *file_info), move_file, skip_snapshot_check);
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(column_family, external_files, ifo);
|
||||
}
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
__attribute__((__deprecated__))
|
||||
#elif _WIN32
|
||||
__declspec(deprecated)
|
||||
#endif
|
||||
virtual Status
|
||||
AddFile(const ExternalSstFileInfo* file_info, bool move_file = false, bool skip_snapshot_check = false) {
|
||||
return AddFile(DefaultColumnFamily(),
|
||||
std::vector<ExternalSstFileInfo>(1, *file_info), move_file, skip_snapshot_check);
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
std::vector<std::string> external_files;
|
||||
for (const ExternalSstFileInfo& file_info : file_info_list) {
|
||||
external_files.push_back(file_info.file_path);
|
||||
}
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(DefaultColumnFamily(), external_files, ifo);
|
||||
}
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
ColumnFamilyHandle* column_family, const ExternalSstFileInfo* file_info,
|
||||
bool move_file = false, bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(column_family, {file_info->file_path}, ifo);
|
||||
}
|
||||
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
const ExternalSstFileInfo* file_info, bool move_file = false,
|
||||
bool skip_snapshot_check = false) {
|
||||
IngestExternalFileOptions ifo;
|
||||
ifo.move_files = move_file;
|
||||
ifo.snapshot_consistency = !skip_snapshot_check;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
return IngestExternalFile(DefaultColumnFamily(), {file_info->file_path},
|
||||
ifo);
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
@ -170,6 +170,20 @@ struct MemTableInfo {
|
||||
|
||||
};
|
||||
|
||||
struct ExternalFileIngestionInfo {
|
||||
// the name of the column family
|
||||
std::string cf_name;
|
||||
// Path of the file outside the DB
|
||||
std::string external_file_path;
|
||||
// Path of the file inside the DB
|
||||
std::string internal_file_path;
|
||||
// The global sequence number assigned to keys in this file
|
||||
SequenceNumber global_seqno;
|
||||
// Table properties of the table that was ingested
|
||||
TableProperties table_properties;
|
||||
};
|
||||
|
||||
|
||||
// EventListener class contains a set of call-back functions that will
|
||||
// be called when specific RocksDB event happens such as flush. It can
|
||||
// be used as a building block for developing custom features such as
|
||||
@ -291,6 +305,15 @@ class EventListener {
|
||||
virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle) {
|
||||
}
|
||||
|
||||
// A call-back function for RocksDB which will be called after an external
// file is ingested using IngestExternalFile.
//
// Note that this function will run on the same thread as
// IngestExternalFile(); if this function is blocked, IngestExternalFile()
// will be blocked from finishing.
|
||||
virtual void OnExternalFileIngested(
|
||||
DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
|
||||
|
||||
virtual ~EventListener() {}
|
||||
};
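
A minimal sketch of hooking the new callback; the counter and the way it is read back are illustrative assumptions, not part of RocksDB:

#include <atomic>
#include "rocksdb/listener.h"
#include "rocksdb/types.h"

// Sketch: count ingestions and remember the last assigned global seqno.
// The callback runs on the IngestExternalFile() thread, so keep it cheap
// and non-blocking.
class IngestionCountingListener : public rocksdb::EventListener {
 public:
  void OnExternalFileIngested(
      rocksdb::DB* /*db*/,
      const rocksdb::ExternalFileIngestionInfo& info) override {
    last_global_seqno_ = info.global_seqno;
    ingested_files_.fetch_add(1);
  }

  std::atomic<uint64_t> ingested_files_{0};
  rocksdb::SequenceNumber last_global_seqno_{0};
};

Such a listener would be registered through the DBOptions::listeners vector before opening the DB, like any other EventListener.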
|
||||
|
||||
|
@ -1617,6 +1617,21 @@ struct CompactRangeOptions {
|
||||
BottommostLevelCompaction::kIfHaveCompactionFilter;
|
||||
};
|
||||
|
||||
// IngestExternalFileOptions is used by IngestExternalFile()
struct IngestExternalFileOptions {
  // Can be set to true to move the files instead of copying them.
  bool move_files = false;
  // If set to false, keys from the ingested file could appear in existing
  // snapshots that were created before the file was ingested.
  bool snapshot_consistency = true;
  // If set to false, IngestExternalFile() will fail if the file key range
  // overlaps with existing keys or tombstones in the DB.
  bool allow_global_seqno = true;
  // If set to false and the file key range overlaps with the memtable key
  // range (a memtable flush would be required), IngestExternalFile will fail.
  bool allow_blocking_flush = true;
};
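
For illustration, a hedged sketch of a non-default configuration; the helper name and file list are assumptions, and `db` is an already-open database:

#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch: ingest by hard-linking ("move") instead of copying. The job falls
// back to a copy when linking across filesystems is not possible.
rocksdb::Status MoveIngest(rocksdb::DB* db,
                           const std::vector<std::string>& files) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  // snapshot_consistency, allow_global_seqno and allow_blocking_flush keep
  // their defaults (true), matching the documented behavior above.
  return db->IngestExternalFile(db->DefaultColumnFamily(), files, ifo);
}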
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <string>
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "rocksdb/types.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -43,8 +44,12 @@ struct ExternalSstFileInfo {
|
||||
// All keys in files generated by SstFileWriter will have sequence number = 0
|
||||
class SstFileWriter {
|
||||
public:
|
||||
// User can pass `column_family` to specify that the generated file will
// be ingested into this column_family; note that passing nullptr means that
// the column_family is unknown.
|
||||
SstFileWriter(const EnvOptions& env_options, const Options& options,
|
||||
const Comparator* user_comparator);
|
||||
const Comparator* user_comparator,
|
||||
ColumnFamilyHandle* column_family = nullptr);
|
||||
|
||||
~SstFileWriter();
|
||||
|
||||
|
@ -152,7 +152,6 @@ enum Tickers : uint32_t {
|
||||
// written to storage because key does not exist
|
||||
NUMBER_FILTERED_DELETES,
|
||||
NUMBER_MERGE_FAILURES,
|
||||
SEQUENCE_NUMBER,
|
||||
|
||||
// number of times bloom was checked before creating iterator on a
|
||||
// file, and the number of times the check was useful in avoiding
|
||||
@ -280,7 +279,6 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
||||
{NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"},
|
||||
{NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"},
|
||||
{NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"},
|
||||
{SEQUENCE_NUMBER, "rocksdb.sequence.number"},
|
||||
{BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"},
|
||||
{BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"},
|
||||
{NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"},
|
||||
|
@ -1,25 +0,0 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/env.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// This API is experimental. We will mark it stable once we run it in production
|
||||
// for a while.
|
||||
// NewFlashcacheAwareEnv() creates an Env that blacklists all background
|
||||
// threads (used for flush and compaction) from using flashcache to cache their
|
||||
// reads. Reads from compaction thread don't need to be cached because they are
|
||||
// going to be soon made obsolete (due to nature of compaction)
|
||||
// Usually you would pass Env::Default() as base.
|
||||
// cachedev_fd is a file descriptor of the flashcache device. Caller has to
|
||||
// open flashcache device before calling this API.
|
||||
extern std::unique_ptr<Env> NewFlashcacheAwareEnv(
|
||||
Env* base, const int cachedev_fd);
|
||||
|
||||
} // namespace rocksdb
|
@ -68,16 +68,12 @@ class StackableDB : public DB {
|
||||
return db_->MultiGet(options, column_family, keys, values);
|
||||
}
|
||||
|
||||
using DB::AddFile;
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<ExternalSstFileInfo>& file_info_list,
|
||||
bool move_file, bool skip_snapshot_check) override {
|
||||
return db_->AddFile(column_family, file_info_list, move_file, skip_snapshot_check);
|
||||
}
|
||||
virtual Status AddFile(ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& file_path_list,
|
||||
bool move_file, bool skip_snapshot_check) override {
|
||||
return db_->AddFile(column_family, file_path_list, move_file, skip_snapshot_check);
|
||||
using DB::IngestExternalFile;
|
||||
virtual Status IngestExternalFile(
|
||||
ColumnFamilyHandle* column_family,
|
||||
const std::vector<std::string>& external_files,
|
||||
const IngestExternalFileOptions& options) override {
|
||||
return db_->IngestExternalFile(column_family, external_files, options);
|
||||
}
|
||||
|
||||
using DB::KeyMayExist;
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
#define ROCKSDB_MAJOR 4
|
||||
#define ROCKSDB_MINOR 13
|
||||
#define ROCKSDB_PATCH 0
|
||||
#define ROCKSDB_PATCH 5
|
||||
|
||||
// Do not use these. We made the mistake of declaring macros starting with
|
||||
// double underscore. Now we have to live with our choice. We'll deprecate these
|
||||
|
@ -16,8 +16,9 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include "include/org_rocksdb_RocksDB.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/cache.h"
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "rocksjni/portal.h"
|
||||
|
||||
@ -1757,17 +1758,6 @@ void add_file_helper(JNIEnv* env, const jobjectArray& jfile_path_list,
|
||||
}
|
||||
}
|
||||
|
||||
void add_file_helper(
|
||||
JNIEnv* env, jlongArray jfi_handle_list, int fi_handle_list_len,
|
||||
std::vector<rocksdb::ExternalSstFileInfo>* file_info_list) {
|
||||
jlong* jfih = env->GetLongArrayElements(jfi_handle_list, NULL);
|
||||
for (int i = 0; i < fi_handle_list_len; i++) {
|
||||
auto* file_info =
|
||||
reinterpret_cast<rocksdb::ExternalSstFileInfo*>(*(jfih + i));
|
||||
file_info_list->push_back(*file_info);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_RocksDB
|
||||
* Method: addFile
|
||||
@ -1783,32 +1773,15 @@ void Java_org_rocksdb_RocksDB_addFile__JJ_3Ljava_lang_String_2IZ(
|
||||
&file_path_list);
|
||||
auto* column_family =
|
||||
reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
|
||||
rocksdb::IngestExternalFileOptions ifo;
|
||||
ifo.move_files = static_cast<bool>(jmove_file);
|
||||
ifo.snapshot_consistency = true;
|
||||
ifo.allow_global_seqno = false;
|
||||
ifo.allow_blocking_flush = false;
|
||||
rocksdb::Status s =
|
||||
db->AddFile(column_family, file_path_list, static_cast<bool>(jmove_file));
|
||||
db->IngestExternalFile(column_family, file_path_list, ifo);
|
||||
if (!s.ok()) {
|
||||
rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Class: org_rocksdb_RocksDB
|
||||
* Method: addFile
|
||||
* Signature: (JJ[JIZ)V
|
||||
*/
|
||||
void Java_org_rocksdb_RocksDB_addFile__JJ_3JIZ(
|
||||
JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle,
|
||||
jlongArray jfile_info_handle_list, jint jfile_info_handle_list_len,
|
||||
jboolean jmove_file) {
|
||||
auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
|
||||
std::vector<rocksdb::ExternalSstFileInfo> file_info_list;
|
||||
add_file_helper(env, jfile_info_handle_list,
|
||||
static_cast<int>(jfile_info_handle_list_len),
|
||||
&file_info_list);
|
||||
auto* column_family =
|
||||
reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
|
||||
rocksdb::Status s =
|
||||
db->AddFile(column_family, file_info_list, static_cast<bool>(jmove_file));
|
||||
if (!s.ok()) {
|
||||
rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
|
||||
}
|
||||
}
|
||||
|
@ -87,7 +87,6 @@ public enum TickerType {
|
||||
// written to storage because key does not exist
|
||||
NUMBER_FILTERED_DELETES(36),
|
||||
NUMBER_MERGE_FAILURES(37),
|
||||
SEQUENCE_NUMBER(38),
|
||||
|
||||
// number of times bloom was checked before creating iterator on a
|
||||
// file, and the number of times the check was useful in avoiding
|
||||
|
@ -26,7 +26,7 @@ namespace port {
|
||||
std::string GetWindowsErrSz(DWORD err);
|
||||
|
||||
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
|
||||
return (err == ERROR_HANDLE_DISK_FULL) ?
|
||||
return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ?
|
||||
Status::NoSpace(context, GetWindowsErrSz(err)) :
|
||||
Status::IOError(context, GetWindowsErrSz(err));
|
||||
}
|
||||
|
src.mk (3 changes)
@ -17,9 +17,9 @@ LIB_SOURCES = \
|
||||
db/db_impl_debug.cc \
|
||||
db/db_impl_readonly.cc \
|
||||
db/db_impl_experimental.cc \
|
||||
db/db_impl_add_file.cc \
|
||||
db/db_info_dumper.cc \
|
||||
db/db_iter.cc \
|
||||
db/external_sst_file_ingestion_job.cc \
|
||||
db/experimental.cc \
|
||||
db/event_helpers.cc \
|
||||
db/file_indexer.cc \
|
||||
@ -154,7 +154,6 @@ LIB_SOURCES = \
|
||||
utilities/document/json_document.cc \
|
||||
utilities/env_mirror.cc \
|
||||
utilities/env_registry.cc \
|
||||
utilities/flashcache/flashcache.cc \
|
||||
utilities/geodb/geodb_impl.cc \
|
||||
utilities/leveldb_options/leveldb_options.cc \
|
||||
utilities/memory/memory_util.cc \
|
||||
|
@ -36,12 +36,13 @@ const size_t kNumIterReserve = 4;
|
||||
class MergingIterator : public InternalIterator {
|
||||
public:
|
||||
MergingIterator(const Comparator* comparator, InternalIterator** children,
|
||||
int n, bool is_arena_mode)
|
||||
int n, bool is_arena_mode, bool prefix_seek_mode)
|
||||
: is_arena_mode_(is_arena_mode),
|
||||
comparator_(comparator),
|
||||
current_(nullptr),
|
||||
direction_(kForward),
|
||||
minHeap_(comparator_),
|
||||
prefix_seek_mode_(prefix_seek_mode),
|
||||
pinned_iters_mgr_(nullptr) {
|
||||
children_.resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
@ -204,16 +205,37 @@ class MergingIterator : public InternalIterator {
|
||||
InitMaxHeap();
|
||||
for (auto& child : children_) {
|
||||
if (&child != current_) {
|
||||
if (!prefix_seek_mode_) {
|
||||
child.Seek(key());
|
||||
if (child.Valid()) {
|
||||
// Child is at first entry >= key(). Step back one to be < key()
|
||||
TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev",
|
||||
&child);
|
||||
child.Prev();
|
||||
} else {
|
||||
// Child has no entries >= key(). Position at last entry.
|
||||
TEST_SYNC_POINT("MergeIterator::Prev:BeforeSeekToLast");
|
||||
child.SeekToLast();
|
||||
}
|
||||
} else {
|
||||
child.SeekForPrev(key());
|
||||
if (child.Valid() && comparator_->Equal(key(), child.key())) {
|
||||
child.Prev();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (child.Valid()) {
|
||||
maxHeap_->push(&child);
|
||||
}
|
||||
}
|
||||
direction_ = kReverse;
|
||||
if (!prefix_seek_mode_) {
|
||||
// Note that we don't do assert(current_ == CurrentReverse()) here
|
||||
// because it is possible to have some keys larger than the seek-key
|
||||
// inserted between Seek() and SeekToLast(), which makes current_ not
|
||||
// equal to CurrentReverse().
|
||||
current_ = CurrentReverse();
|
||||
}
|
||||
// The loop advanced all non-current children to be < key() so current_
|
||||
// should still be strictly the smallest key.
|
||||
assert(current_ == CurrentReverse());
|
||||
@ -299,6 +321,8 @@ class MergingIterator : public InternalIterator {
|
||||
};
|
||||
Direction direction_;
|
||||
MergerMinIterHeap minHeap_;
|
||||
bool prefix_seek_mode_;
|
||||
|
||||
// Max heap is used for reverse iteration, which is way less common than
|
||||
// forward. Lazily initialize it to save memory.
|
||||
std::unique_ptr<MergerMaxIterHeap> maxHeap_;
|
||||
@ -331,7 +355,7 @@ void MergingIterator::InitMaxHeap() {
|
||||
|
||||
InternalIterator* NewMergingIterator(const Comparator* cmp,
|
||||
InternalIterator** list, int n,
|
||||
Arena* arena) {
|
||||
Arena* arena, bool prefix_seek_mode) {
|
||||
assert(n >= 0);
|
||||
if (n == 0) {
|
||||
return NewEmptyInternalIterator(arena);
|
||||
@ -339,19 +363,20 @@ InternalIterator* NewMergingIterator(const Comparator* cmp,
|
||||
return list[0];
|
||||
} else {
|
||||
if (arena == nullptr) {
|
||||
return new MergingIterator(cmp, list, n, false);
|
||||
return new MergingIterator(cmp, list, n, false, prefix_seek_mode);
|
||||
} else {
|
||||
auto mem = arena->AllocateAligned(sizeof(MergingIterator));
|
||||
return new (mem) MergingIterator(cmp, list, n, true);
|
||||
return new (mem) MergingIterator(cmp, list, n, true, prefix_seek_mode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MergeIteratorBuilder::MergeIteratorBuilder(const Comparator* comparator,
|
||||
Arena* a)
|
||||
Arena* a, bool prefix_seek_mode)
|
||||
: first_iter(nullptr), use_merging_iter(false), arena(a) {
|
||||
auto mem = arena->AllocateAligned(sizeof(MergingIterator));
|
||||
merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true);
|
||||
merge_iter =
|
||||
new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
|
||||
}
|
||||
|
||||
void MergeIteratorBuilder::AddIterator(InternalIterator* iter) {
|
||||
|
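Both branches of the reverse step above leave each non-current child on the last entry strictly smaller than the current key: total-order mode seeks to the first entry >= key() and steps back once, while prefix-seek mode relies on SeekForPrev() and only steps back on an exact match. A minimal standalone sketch of the same positioning logic over a sorted vector, with std::lower_bound standing in for Seek() and std::upper_bound for SeekForPrev() (illustrative code only, not the RocksDB iterator API):

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Total-order mode: land on the first entry >= key, then step back once,
// mirroring the Seek()+Prev() path in MergingIterator::Prev().
static int PositionBefore(const std::vector<std::string>& sorted,
                          const std::string& key) {
  auto it = std::lower_bound(sorted.begin(), sorted.end(), key);
  if (it == sorted.begin()) return -1;  // nothing smaller than key
  return static_cast<int>(it - sorted.begin()) - 1;
}

// Prefix-seek analogue of SeekForPrev(): land on the last entry <= key,
// then step back only if it compares equal to key.
static int PositionForPrev(const std::vector<std::string>& sorted,
                           const std::string& key) {
  auto it = std::upper_bound(sorted.begin(), sorted.end(), key);
  if (it == sorted.begin()) return -1;
  int idx = static_cast<int>(it - sorted.begin()) - 1;
  if (sorted[idx] == key) --idx;  // end strictly smaller than key
  return idx;
}

int main() {
  std::vector<std::string> data = {"a", "b", "d"};
  assert(PositionBefore(data, "c") == 1);   // lands on "b"
  assert(PositionForPrev(data, "c") == 1);  // also lands on "b"
  assert(PositionBefore(data, "b") == 0);   // lands on "a"
  assert(PositionForPrev(data, "b") == 0);  // equal key, so step back to "a"
  return 0;
}
```

Either way each child ends up strictly before the current key, which is why the final assert(current_ == CurrentReverse()) still holds.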
@ -28,7 +28,8 @@ class Arena;
// REQUIRES: n >= 0
extern InternalIterator* NewMergingIterator(const Comparator* comparator,
                                            InternalIterator** children, int n,
                                            Arena* arena = nullptr);
                                            Arena* arena = nullptr,
                                            bool prefix_seek_mode = false);

class MergingIterator;

@ -37,7 +38,8 @@ class MergeIteratorBuilder {
 public:
  // comparator: the comparator used by the merging iterator
  // arena: where the merging iterator needs to be allocated from.
  explicit MergeIteratorBuilder(const Comparator* comparator, Arena* arena);
  explicit MergeIteratorBuilder(const Comparator* comparator, Arena* arena,
                                bool prefix_seek_mode = false);
  ~MergeIteratorBuilder() {}

  // Add iter to the merging iterator.

@ -80,7 +80,6 @@ PlainTableBuilder::PlainTableBuilder(
|
||||
index_builder_.reset(
|
||||
new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness,
|
||||
hash_table_ratio, huge_page_tlb_size_));
|
||||
assert(bloom_bits_per_key_ > 0);
|
||||
properties_.user_collected_properties
|
||||
[PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
|
||||
}
|
||||
@ -191,6 +190,9 @@ Status PlainTableBuilder::Finish() {
|
||||
|
||||
if (store_index_in_file_ && (properties_.num_entries > 0)) {
|
||||
assert(properties_.num_entries <= std::numeric_limits<uint32_t>::max());
|
||||
Status s;
|
||||
BlockHandle bloom_block_handle;
|
||||
if (bloom_bits_per_key_ > 0) {
|
||||
bloom_block_.SetTotalBits(
|
||||
&arena_,
|
||||
static_cast<uint32_t>(properties_.num_entries) * bloom_bits_per_key_,
|
||||
@ -201,27 +203,27 @@ Status PlainTableBuilder::Finish() {
|
||||
bloom_block_.GetNumBlocks());
|
||||
|
||||
bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
|
||||
BlockHandle bloom_block_handle;
|
||||
auto finish_result = bloom_block_.Finish();
|
||||
|
||||
properties_.filter_size = finish_result.size();
|
||||
auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
|
||||
Slice bloom_finish_result = bloom_block_.Finish();
|
||||
|
||||
properties_.filter_size = bloom_finish_result.size();
|
||||
s = WriteBlock(bloom_finish_result, file_, &offset_, &bloom_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
BlockHandle index_block_handle;
|
||||
finish_result = index_builder_->Finish();
|
||||
|
||||
properties_.index_size = finish_result.size();
|
||||
s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
|
||||
}
|
||||
BlockHandle index_block_handle;
|
||||
Slice index_finish_result = index_builder_->Finish();
|
||||
|
||||
properties_.index_size = index_finish_result.size();
|
||||
s = WriteBlock(index_finish_result, file_, &offset_, &index_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
|
||||
index_block_handle);
|
||||
}
|
||||
|
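The Finish() path above emits the bloom block and the index block as meta blocks when the table is built with store_index_in_file. A hedged configuration sketch showing how that path can be exercised through the public PlainTableOptions; the field values, DB path, and prefix length are illustrative assumptions, not taken from this diff:

```cpp
#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.allow_mmap_reads = true;  // plain table is usually paired with mmap reads
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));

  rocksdb::PlainTableOptions pt_opts;
  pt_opts.bloom_bits_per_key = 10;     // > 0 enables the bloom block written in Finish()
  pt_opts.index_sparseness = 16;
  pt_opts.store_index_in_file = true;  // persist index (and bloom) as meta blocks
  options.table_factory.reset(rocksdb::NewPlainTableFactory(pt_opts));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/plain_table_demo", &db);
  std::printf("open: %s\n", s.ToString().c_str());
  delete db;
  return s.ok() ? 0 : 1;
}
```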
@ -294,21 +294,25 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
assert(props != nullptr);
|
||||
table_properties_.reset(props);
|
||||
|
||||
BlockContents bloom_block_contents;
|
||||
auto s = ReadMetaBlock(file_info_.file.get(), file_size_,
|
||||
kPlainTableMagicNumber, ioptions_,
|
||||
BloomBlockBuilder::kBloomBlock, &bloom_block_contents);
|
||||
bool index_in_file = s.ok();
|
||||
|
||||
BlockContents index_block_contents;
|
||||
s = ReadMetaBlock(
|
||||
Status s = ReadMetaBlock(
|
||||
file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_,
|
||||
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_contents);
|
||||
|
||||
index_in_file &= s.ok();
|
||||
bool index_in_file = s.ok();
|
||||
|
||||
BlockContents bloom_block_contents;
|
||||
bool bloom_in_file = false;
|
||||
// We only need to read the bloom block if index block is in file.
|
||||
if (index_in_file) {
|
||||
s = ReadMetaBlock(file_info_.file.get(), file_size_, kPlainTableMagicNumber,
|
||||
ioptions_, BloomBlockBuilder::kBloomBlock,
|
||||
&bloom_block_contents);
|
||||
bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
|
||||
}
|
||||
|
||||
Slice* bloom_block;
|
||||
if (index_in_file) {
|
||||
if (bloom_in_file) {
|
||||
// If bloom_block_contents.allocation is not empty (which will be the case
|
||||
// for non-mmap mode), it holds the allocated memory for the bloom block.
|
||||
// It needs to be kept alive to keep `bloom_block` valid.
|
||||
@ -318,8 +322,6 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
bloom_block = nullptr;
|
||||
}
|
||||
|
||||
// index_in_file == true only if there are kBloomBlock and
|
||||
// kPlainTableIndexBlock in file
|
||||
Slice* index_block;
|
||||
if (index_in_file) {
|
||||
// If index_block_contents.allocation is not empty (which will be the case
|
||||
@ -355,7 +357,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
huge_page_tlb_size, ioptions_.info_log);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
} else if (bloom_in_file) {
|
||||
enable_bloom_ = true;
|
||||
auto num_blocks_property = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kNumBloomBlocks);
|
||||
@ -372,6 +374,10 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
const_cast<unsigned char*>(
|
||||
reinterpret_cast<const unsigned char*>(bloom_block->data())),
|
||||
static_cast<uint32_t>(bloom_block->size()) * 8, num_blocks);
|
||||
} else {
|
||||
// Index in file but no bloom in file. Disable bloom filter in this case.
|
||||
enable_bloom_ = false;
|
||||
bloom_bits_per_key = 0;
|
||||
}
|
||||
|
||||
PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness,
|
||||
|
@ -21,11 +21,12 @@ const std::string ExternalSstFilePropertyNames::kGlobalSeqno =
|
||||
|
||||
struct SstFileWriter::Rep {
|
||||
Rep(const EnvOptions& _env_options, const Options& options,
|
||||
const Comparator* _user_comparator)
|
||||
const Comparator* _user_comparator, ColumnFamilyHandle* _cfh)
|
||||
: env_options(_env_options),
|
||||
ioptions(options),
|
||||
mutable_cf_options(options),
|
||||
internal_comparator(_user_comparator) {}
|
||||
internal_comparator(_user_comparator),
|
||||
cfh(_cfh) {}
|
||||
|
||||
std::unique_ptr<WritableFileWriter> file_writer;
|
||||
std::unique_ptr<TableBuilder> builder;
|
||||
@ -34,16 +35,26 @@ struct SstFileWriter::Rep {
|
||||
MutableCFOptions mutable_cf_options;
|
||||
InternalKeyComparator internal_comparator;
|
||||
ExternalSstFileInfo file_info;
|
||||
std::string column_family_name;
|
||||
InternalKey ikey;
|
||||
std::string column_family_name;
|
||||
ColumnFamilyHandle* cfh;
|
||||
};
|
||||
|
||||
SstFileWriter::SstFileWriter(const EnvOptions& env_options,
|
||||
const Options& options,
|
||||
const Comparator* user_comparator)
|
||||
: rep_(new Rep(env_options, options, user_comparator)) {}
|
||||
const Comparator* user_comparator,
|
||||
ColumnFamilyHandle* column_family)
|
||||
: rep_(new Rep(env_options, options, user_comparator, column_family)) {}
|
||||
|
||||
SstFileWriter::~SstFileWriter() { delete rep_; }
|
||||
SstFileWriter::~SstFileWriter() {
|
||||
if (rep_->builder) {
|
||||
// User did not call Finish() or Finish() failed, we need to
|
||||
// abandon the builder.
|
||||
rep_->builder->Abandon();
|
||||
}
|
||||
|
||||
delete rep_;
|
||||
}
|
||||
|
||||
Status SstFileWriter::Open(const std::string& file_path) {
|
||||
Rep* r = rep_;
|
||||
@ -81,6 +92,18 @@ Status SstFileWriter::Open(const std::string& file_path) {
|
||||
user_collector_factories[i]));
|
||||
}
|
||||
int unknown_level = -1;
|
||||
uint32_t cf_id;
|
||||
|
||||
if (r->cfh != nullptr) {
|
||||
// user explicitly specified that this file will be ingested into cfh,
|
||||
// we can persist this information in the file.
|
||||
cf_id = r->cfh->GetID();
|
||||
r->column_family_name = r->cfh->GetName();
|
||||
} else {
|
||||
r->column_family_name = "";
|
||||
cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
|
||||
}
|
||||
|
||||
TableBuilderOptions table_builder_options(
|
||||
r->ioptions, r->internal_comparator, &int_tbl_prop_collector_factories,
|
||||
compression_type, r->ioptions.compression_opts,
|
||||
@ -92,9 +115,7 @@ Status SstFileWriter::Open(const std::string& file_path) {
|
||||
// TODO(tec) : If table_factory is using compressed block cache, we will
|
||||
// be adding the external sst file blocks into it, which is wasteful.
|
||||
r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
|
||||
table_builder_options,
|
||||
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
|
||||
r->file_writer.get()));
|
||||
table_builder_options, cf_id, r->file_writer.get()));
|
||||
|
||||
r->file_info.file_path = file_path;
|
||||
r->file_info.file_size = 0;
|
||||
|
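With the new constructor argument above, a writer that is told its destination column family records the column family id and name in the file, and ingestion can later verify them. A minimal end-to-end sketch, assuming ingestion into the default column family; file paths and the move_files choice are placeholders:

```cpp
#include <cassert>
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/ingest_demo_db", &db);
  assert(s.ok());

  // Telling the writer its destination column family (here the default one)
  // lets it persist the column family id and name in the file's properties.
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options,
                                options.comparator, db->DefaultColumnFamily());
  s = writer.Open("/tmp/ingest_demo.sst");
  assert(s.ok());
  s = writer.Add("key1", "value1");  // keys must be added in sorted order
  assert(s.ok());
  s = writer.Add("key2", "value2");
  assert(s.ok());
  s = writer.Finish();
  assert(s.ok());

  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;  // move rather than copy; illustrative choice
  s = db->IngestExternalFile(db->DefaultColumnFamily(),
                             std::vector<std::string>{"/tmp/ingest_demo.sst"}, ifo);
  assert(s.ok());

  delete db;
  return 0;
}
```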
third-party/flashcache/flashcache_ioctl.h (vendored, 55 lines deleted)
@ -1,55 +0,0 @@
|
||||
/****************************************************************************
|
||||
* flashcache_ioctl.h
|
||||
* FlashCache: Device mapper target for block-level disk caching
|
||||
*
|
||||
* Copyright 2010 Facebook, Inc.
|
||||
* Author: Mohan Srinivasan (mohan@facebook.com)
|
||||
*
|
||||
* Based on DM-Cache:
|
||||
* Copyright (C) International Business Machines Corp., 2006
|
||||
* Author: Ming Zhao (mingzhao@ufl.edu)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; under version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
****************************************************************************/
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#ifndef FLASHCACHE_IOCTL_H
|
||||
#define FLASHCACHE_IOCTL_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
#define FLASHCACHE_IOCTL 0xfe
|
||||
|
||||
enum {
|
||||
FLASHCACHEADDNCPID_CMD=200,
|
||||
FLASHCACHEDELNCPID_CMD,
|
||||
FLASHCACHEDELNCALL_CMD,
|
||||
FLASHCACHEADDWHITELIST_CMD,
|
||||
FLASHCACHEDELWHITELIST_CMD,
|
||||
FLASHCACHEDELWHITELISTALL_CMD,
|
||||
};
|
||||
|
||||
#define FLASHCACHEADDNCPID _IOW(FLASHCACHE_IOCTL, FLASHCACHEADDNCPID_CMD, pid_t)
|
||||
#define FLASHCACHEDELNCPID _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCPID_CMD, pid_t)
|
||||
#define FLASHCACHEDELNCALL _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELNCALL_CMD, pid_t)
|
||||
|
||||
#define FLASHCACHEADDBLACKLIST FLASHCACHEADDNCPID
|
||||
#define FLASHCACHEDELBLACKLIST FLASHCACHEDELNCPID
|
||||
#define FLASHCACHEDELALLBLACKLIST FLASHCACHEDELNCALL
|
||||
|
||||
#define FLASHCACHEADDWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEADDWHITELIST_CMD, pid_t)
|
||||
#define FLASHCACHEDELWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELIST_CMD, pid_t)
|
||||
#define FLASHCACHEDELALLWHITELIST _IOW(FLASHCACHE_IOCTL, FLASHCACHEDELWHITELISTALL_CMD, pid_t)
|
||||
|
||||
#endif /* FLASHCACHE_IOCTL_H */
|
||||
#endif /* OS_LINUX */
|
@ -48,7 +48,6 @@
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/utilities/env_registry.h"
|
||||
#include "rocksdb/utilities/flashcache.h"
|
||||
#include "rocksdb/utilities/optimistic_transaction_db.h"
|
||||
#include "rocksdb/utilities/options_util.h"
|
||||
#include "rocksdb/utilities/sim_cache.h"
|
||||
@ -573,8 +572,6 @@ DEFINE_string(
|
||||
"\t--row_cache_size\n"
|
||||
"\t--row_cache_numshardbits\n"
|
||||
"\t--enable_io_prio\n"
|
||||
"\t--disable_flashcache_for_background_threads\n"
|
||||
"\t--flashcache_dev\n"
|
||||
"\t--dump_malloc_stats\n"
|
||||
"\t--num_multi_db\n");
|
||||
#endif // ROCKSDB_LITE
|
||||
@ -769,11 +766,6 @@ DEFINE_string(compaction_fadvice, "NORMAL",
|
||||
static auto FLAGS_compaction_fadvice_e =
|
||||
rocksdb::Options().access_hint_on_compaction_start;
|
||||
|
||||
DEFINE_bool(disable_flashcache_for_background_threads, false,
|
||||
"Disable flashcache for background threads");
|
||||
|
||||
DEFINE_string(flashcache_dev, "", "Path to flashcache device");
|
||||
|
||||
DEFINE_bool(use_tailing_iterator, false,
|
||||
"Use tailing iterator to access a series of keys instead of get");
|
||||
|
||||
@ -1739,7 +1731,6 @@ class Benchmark {
|
||||
int64_t readwrites_;
|
||||
int64_t merge_keys_;
|
||||
bool report_file_operations_;
|
||||
int cachedev_fd_;
|
||||
|
||||
bool SanityCheck() {
|
||||
if (FLAGS_compression_ratio > 1) {
|
||||
@ -1995,8 +1986,7 @@ class Benchmark {
|
||||
? FLAGS_num
|
||||
: ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)),
|
||||
merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys),
|
||||
report_file_operations_(FLAGS_report_file_operations),
|
||||
cachedev_fd_(-1) {
|
||||
report_file_operations_(FLAGS_report_file_operations) {
|
||||
// use simcache instead of cache
|
||||
if (FLAGS_simcache_size >= 0) {
|
||||
if (FLAGS_cache_numshardbits >= 1) {
|
||||
@ -2055,11 +2045,6 @@ class Benchmark {
|
||||
// this will leak, but we're shutting down so nobody cares
|
||||
cache_->DisownData();
|
||||
}
|
||||
if (FLAGS_disable_flashcache_for_background_threads && cachedev_fd_ != -1) {
|
||||
// Dtor for this env should run before cachedev_fd_ is closed
|
||||
flashcache_aware_env_ = nullptr;
|
||||
close(cachedev_fd_);
|
||||
}
|
||||
}
|
||||
|
||||
Slice AllocateKey(std::unique_ptr<const char[]>* key_guard) {
|
||||
@ -2415,7 +2400,6 @@ class Benchmark {
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<Env> flashcache_aware_env_;
|
||||
std::shared_ptr<TimestampEmulator> timestamp_emulator_;
|
||||
|
||||
struct ThreadArg {
|
||||
@ -2994,23 +2978,7 @@ class Benchmark {
|
||||
FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
|
||||
FLAGS_env->LowerThreadPoolIOPriority(Env::HIGH);
|
||||
}
|
||||
if (FLAGS_disable_flashcache_for_background_threads && cachedev_fd_ == -1) {
|
||||
// Avoid creating the env twice when use_existing_db is true
|
||||
cachedev_fd_ = open(FLAGS_flashcache_dev.c_str(), O_RDONLY);
|
||||
if (cachedev_fd_ < 0) {
|
||||
fprintf(stderr, "Open flash device failed\n");
|
||||
exit(1);
|
||||
}
|
||||
flashcache_aware_env_ = NewFlashcacheAwareEnv(FLAGS_env, cachedev_fd_);
|
||||
if (flashcache_aware_env_.get() == nullptr) {
|
||||
fprintf(stderr, "Failed to open flashcache device at %s\n",
|
||||
FLAGS_flashcache_dev.c_str());
|
||||
std::abort();
|
||||
}
|
||||
options.env = flashcache_aware_env_.get();
|
||||
} else {
|
||||
options.env = FLAGS_env;
|
||||
}
|
||||
|
||||
if (FLAGS_num_multi_db <= 1) {
|
||||
OpenDb(options, FLAGS_db, &db_);
|
||||
|
@ -13,6 +13,12 @@

namespace rocksdb {

// This function may intentionally do a left shift on a negative number
#if defined(__clang__)
__attribute__((__no_sanitize__("undefined")))
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
__attribute__((__no_sanitize_undefined__))
#endif
uint32_t Hash(const char* data, size_t n, uint32_t seed) {
  // Similar to murmur hash
  const uint32_t m = 0xc6a4a793;
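The attribute guard added above tells UBSan to skip this function because the murmur-style mixing may left-shift a value whose sign bit is set. A small standalone illustration of the same guard pattern; the shifted function is a made-up example, not the RocksDB hash:

```cpp
#include <cstdint>
#include <cstdio>

// Same guard pattern as in util/hash.cc: tell UBSan to skip a function that
// intentionally left-shifts a value which may be negative.
#if defined(__clang__)
__attribute__((__no_sanitize__("undefined")))
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
__attribute__((__no_sanitize_undefined__))
#endif
static int32_t ShiftLeftMaybeNegative(int32_t v, unsigned bits) {
  return v << bits;  // UB for negative v in standard C++, deliberate here
}

int main() {
  std::printf("%d\n", ShiftLeftMaybeNegative(-5, 2));
  return 0;
}
```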
@ -305,8 +305,8 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {

  // Units (k)
  ASSERT_OK(GetColumnFamilyOptionsFromString(
      base_cf_opt, "max_write_buffer_number=-15K", &new_cf_opt));
  ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15 * kilo);
      base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt));
  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo);
  // Units (m)
  ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
      "max_write_buffer_number=16m;inplace_update_num_locks=17M",
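The corrected assertion above depends on the unit suffixes accepted by GetColumnFamilyOptionsFromString (a trailing k/K, m/M, g/G, or t/T scales the number). A quick sketch of the same call outside the test harness, assuming the declaration lives in rocksdb/convenience.h as in this era of the codebase:

```cpp
#include <cassert>
#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::ColumnFamilyOptions base, result;
  // Numeric option values accept unit suffixes: k/K, m/M, g/G, t/T.
  rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
      base, "write_buffer_size=64m;max_write_buffer_number=4", &result);
  assert(s.ok());
  assert(result.write_buffer_size == 64ull * 1024 * 1024);
  assert(result.max_write_buffer_number == 4);
  return 0;
}
```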
@ -51,9 +51,11 @@ uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const {

std::unique_ptr<HistogramImpl>
StatisticsImpl::HistogramInfo::getMergedHistogram() const {
  MutexLock lock(&merge_lock);
  std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl());
  {
    MutexLock lock(&merge_lock);
    res_hist->Merge(merged_hist);
  }
  thread_value->Fold(
      [](void* curr_ptr, void* res) {
        auto tmp_res_hist = static_cast<HistogramImpl*>(res);
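The change above narrows the merge_lock critical section so the per-thread fold no longer runs while the lock is held. A generic sketch of the same lock-scoping idea using std::mutex; the class is illustrative and unrelated to the RocksDB StatisticsImpl:

```cpp
#include <mutex>
#include <vector>

// Illustrative only: copy shared state under the lock, then do the slower
// follow-up work outside the critical section.
class Accumulator {
 public:
  void Add(int v) {
    std::lock_guard<std::mutex> guard(mu_);
    values_.push_back(v);
  }

  long long Sum() const {
    std::vector<int> snapshot;
    {
      std::lock_guard<std::mutex> guard(mu_);  // hold the lock only to copy
      snapshot = values_;
    }
    long long total = 0;  // heavier work runs with the lock released
    for (int v : snapshot) total += v;
    return total;
  }

 private:
  mutable std::mutex mu_;
  std::vector<int> values_;
};

int main() {
  Accumulator acc;
  acc.Add(1);
  acc.Add(2);
  return acc.Sum() == 3 ? 0 : 1;
}
```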
@ -6,6 +6,7 @@

#include <assert.h>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <string>
#include <thread>
@ -10,6 +10,7 @@
#pragma once

#include <atomic>
#include <functional>
#include <memory>
#include <unordered_map>
#include <vector>
@ -46,6 +46,11 @@ ColBufEncoder *ColBufEncoder::NewColBufEncoder(
  return nullptr;
}

#if defined(__clang__)
__attribute__((__no_sanitize__("undefined")))
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
__attribute__((__no_sanitize_undefined__))
#endif
size_t FixedLengthColBufEncoder::Append(const char *buf) {
  if (nullable_) {
    if (buf == nullptr) {
@ -1,136 +0,0 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "utilities/flashcache/flashcache.h"
|
||||
|
||||
#include "rocksdb/utilities/flashcache.h"
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "third-party/flashcache/flashcache_ioctl.h"
|
||||
#endif
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
#if !defined(ROCKSDB_LITE) && defined(OS_LINUX)
|
||||
// Most of the code that handles flashcache is copied from websql's branch of
|
||||
// mysql-5.6
|
||||
class FlashcacheAwareEnv : public EnvWrapper {
|
||||
public:
|
||||
FlashcacheAwareEnv(Env* base, int cachedev_fd)
|
||||
: EnvWrapper(base), cachedev_fd_(cachedev_fd) {
|
||||
pid_t pid = getpid();
|
||||
/* cleanup previous whitelistings */
|
||||
if (ioctl(cachedev_fd_, FLASHCACHEDELALLWHITELIST, &pid) < 0) {
|
||||
cachedev_fd_ = -1;
|
||||
fprintf(stderr, "ioctl del-all-whitelist for flashcache failed\n");
|
||||
return;
|
||||
}
|
||||
if (ioctl(cachedev_fd_, FLASHCACHEADDWHITELIST, &pid) < 0) {
|
||||
fprintf(stderr, "ioctl add-whitelist for flashcache failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
~FlashcacheAwareEnv() {
|
||||
// cachedev_fd_ is -1 if it's uninitialized
|
||||
if (cachedev_fd_ != -1) {
|
||||
pid_t pid = getpid();
|
||||
if (ioctl(cachedev_fd_, FLASHCACHEDELWHITELIST, &pid) < 0) {
|
||||
fprintf(stderr, "ioctl del-whitelist for flashcache failed\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int BlacklistCurrentThread(int cachedev_fd) {
|
||||
pid_t pid = static_cast<pid_t>(syscall(SYS_gettid));
|
||||
return ioctl(cachedev_fd, FLASHCACHEADDNCPID, &pid);
|
||||
}
|
||||
|
||||
static int WhitelistCurrentThread(int cachedev_fd) {
|
||||
pid_t pid = static_cast<pid_t>(syscall(SYS_gettid));
|
||||
return ioctl(cachedev_fd, FLASHCACHEDELNCPID, &pid);
|
||||
}
|
||||
|
||||
int GetFlashCacheFileDescriptor() { return cachedev_fd_; }
|
||||
|
||||
struct Arg {
|
||||
Arg(void (*f)(void* arg), void* a, int _cachedev_fd)
|
||||
: original_function_(f), original_arg_(a), cachedev_fd(_cachedev_fd) {}
|
||||
|
||||
void (*original_function_)(void* arg);
|
||||
void* original_arg_;
|
||||
int cachedev_fd;
|
||||
};
|
||||
|
||||
static void BgThreadWrapper(void* a) {
|
||||
Arg* arg = reinterpret_cast<Arg*>(a);
|
||||
if (arg->cachedev_fd != -1) {
|
||||
if (BlacklistCurrentThread(arg->cachedev_fd) < 0) {
|
||||
fprintf(stderr, "ioctl add-nc-pid for flashcache failed\n");
|
||||
}
|
||||
}
|
||||
arg->original_function_(arg->original_arg_);
|
||||
if (arg->cachedev_fd != -1) {
|
||||
if (WhitelistCurrentThread(arg->cachedev_fd) < 0) {
|
||||
fprintf(stderr, "ioctl del-nc-pid for flashcache failed\n");
|
||||
}
|
||||
}
|
||||
delete arg;
|
||||
}
|
||||
|
||||
int UnSchedule(void* arg, Priority pri) override {
|
||||
// no unschedule for you
|
||||
return 0;
|
||||
}
|
||||
|
||||
void Schedule(void (*f)(void* arg), void* a, Priority pri,
|
||||
void* tag = nullptr, void (*u)(void* arg) = 0) override {
|
||||
EnvWrapper::Schedule(&BgThreadWrapper, new Arg(f, a, cachedev_fd_), pri,
|
||||
tag);
|
||||
}
|
||||
|
||||
private:
|
||||
int cachedev_fd_;
|
||||
};
|
||||
|
||||
std::unique_ptr<Env> NewFlashcacheAwareEnv(Env* base,
|
||||
const int cachedev_fd) {
|
||||
std::unique_ptr<Env> ret(new FlashcacheAwareEnv(base, cachedev_fd));
|
||||
return ret;
|
||||
}
|
||||
|
||||
int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env) {
|
||||
int fd = dynamic_cast<FlashcacheAwareEnv*>(flashcache_aware_env)
|
||||
->GetFlashCacheFileDescriptor();
|
||||
if (fd == -1) {
|
||||
return -1;
|
||||
}
|
||||
return FlashcacheAwareEnv::BlacklistCurrentThread(fd);
|
||||
}
|
||||
int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env) {
|
||||
int fd = dynamic_cast<FlashcacheAwareEnv*>(flashcache_aware_env)
|
||||
->GetFlashCacheFileDescriptor();
|
||||
if (fd == -1) {
|
||||
return -1;
|
||||
}
|
||||
return FlashcacheAwareEnv::WhitelistCurrentThread(fd);
|
||||
}
|
||||
|
||||
#else // !defined(ROCKSDB_LITE) && defined(OS_LINUX)
|
||||
std::unique_ptr<Env> NewFlashcacheAwareEnv(Env* base,
|
||||
const int cachedev_fd) {
|
||||
return nullptr;
|
||||
}
|
||||
int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env) { return -1; }
|
||||
int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env) { return -1; }
|
||||
|
||||
#endif // !defined(ROCKSDB_LITE) && defined(OS_LINUX)
|
||||
|
||||
} // namespace rocksdb
|
@ -1,18 +0,0 @@
// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.

#pragma once

#include <string>
#include "rocksdb/env.h"

namespace rocksdb {

// This is internal API that will make hacking on flashcache easier. Not sure if
// we need to expose this to public users, probably not
extern int FlashcacheBlacklistCurrentThread(Env* flashcache_aware_env);
extern int FlashcacheWhitelistCurrentThread(Env* flashcache_aware_env);

}  // namespace rocksdb

@ -46,6 +46,7 @@ Status CompactToLevel(const Options& options, const std::string& dbname,
    // only one level. In this case, compacting to one file is also
    // optimal.
    no_compact_opts.target_file_size_base = 999999999999999;
    no_compact_opts.max_compaction_bytes = 999999999999999;
  }
  Status s = OpenDb(no_compact_opts, dbname, &db);
  if (!s.ok()) {
@ -54,6 +55,9 @@ Status CompactToLevel(const Options& options, const std::string& dbname,
  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = dest_level;
  if (dest_level == 0) {
    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
  }
  db->CompactRange(cro, nullptr, nullptr);

  if (need_reopen) {
@ -72,7 +76,8 @@ Status CompactToLevel(const Options& options, const std::string& dbname,

Status MigrateToUniversal(std::string dbname, const Options& old_opts,
                          const Options& new_opts) {
  if (old_opts.num_levels <= new_opts.num_levels) {
  if (old_opts.num_levels <= new_opts.num_levels ||
      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
    return Status::OK();
  } else {
    bool need_compact = false;
@ -132,11 +137,18 @@ Status MigrateToLevelBase(std::string dbname, const Options& old_opts,

Status OptionChangeMigration(std::string dbname, const Options& old_opts,
                             const Options& new_opts) {
  if (new_opts.compaction_style == CompactionStyle::kCompactionStyleUniversal) {
  if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
    // An LSM generated by FIFO compaction can be opened by any compaction style.
    return Status::OK();
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleUniversal) {
    return MigrateToUniversal(dbname, old_opts, new_opts);
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleLevel) {
    return MigrateToLevelBase(dbname, old_opts, new_opts);
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleFIFO) {
    return CompactToLevel(old_opts, dbname, 0, true);
  } else {
    return Status::NotSupported(
        "Do not know how to migrate to this compaction style");
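With the FIFO cases handled, OptionChangeMigration() can be run against a closed DB before reopening it with a different compaction style or level count. A hedged usage sketch; the path and option values are illustrative and the DB must already exist at that path:

```cpp
#include <cstdio>
#include "rocksdb/options.h"
#include "rocksdb/utilities/option_change_migration.h"

int main() {
  rocksdb::Options old_opts;
  old_opts.compaction_style = rocksdb::kCompactionStyleLevel;
  old_opts.num_levels = 4;

  rocksdb::Options new_opts = old_opts;
  new_opts.compaction_style = rocksdb::kCompactionStyleUniversal;
  new_opts.num_levels = 1;

  // Reshapes the LSM tree of the closed DB at this path so it can be
  // reopened with new_opts.
  rocksdb::Status s = rocksdb::OptionChangeMigration("/tmp/migration_demo_db",
                                                     old_opts, new_opts);
  std::printf("migration: %s\n", s.ToString().c_str());
  return s.ok() ? 0 : 1;
}
```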
@ -13,19 +13,19 @@
|
||||
#include "port/stack_trace.h"
|
||||
namespace rocksdb {
|
||||
|
||||
class DBOptionChangeMigrationTest
|
||||
class DBOptionChangeMigrationTests
|
||||
: public DBTestBase,
|
||||
public testing::WithParamInterface<
|
||||
std::tuple<int, bool, bool, int, bool, bool>> {
|
||||
std::tuple<int, int, bool, int, int, bool>> {
|
||||
public:
|
||||
DBOptionChangeMigrationTest()
|
||||
DBOptionChangeMigrationTests()
|
||||
: DBTestBase("/db_option_change_migration_test") {
|
||||
level1_ = std::get<0>(GetParam());
|
||||
is_universal1_ = std::get<1>(GetParam());
|
||||
compaction_style1_ = std::get<1>(GetParam());
|
||||
is_dynamic1_ = std::get<2>(GetParam());
|
||||
|
||||
level2_ = std::get<3>(GetParam());
|
||||
is_universal2_ = std::get<4>(GetParam());
|
||||
compaction_style2_ = std::get<4>(GetParam());
|
||||
is_dynamic2_ = std::get<5>(GetParam());
|
||||
}
|
||||
|
||||
@ -34,23 +34,23 @@ class DBOptionChangeMigrationTest
|
||||
static void TearDownTestCase() {}
|
||||
|
||||
int level1_;
|
||||
bool is_universal1_;
|
||||
int compaction_style1_;
|
||||
bool is_dynamic1_;
|
||||
|
||||
int level2_;
|
||||
bool is_universal2_;
|
||||
int compaction_style2_;
|
||||
bool is_dynamic2_;
|
||||
};
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
TEST_P(DBOptionChangeMigrationTest, Migrate1) {
|
||||
TEST_P(DBOptionChangeMigrationTests, Migrate1) {
|
||||
Options old_options = CurrentOptions();
|
||||
if (is_universal1_) {
|
||||
old_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
|
||||
} else {
|
||||
old_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
||||
old_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style1_);
|
||||
if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
|
||||
}
|
||||
|
||||
old_options.level0_file_num_compaction_trigger = 3;
|
||||
old_options.write_buffer_size = 64 * 1024;
|
||||
old_options.target_file_size_base = 128 * 1024;
|
||||
@ -83,10 +83,9 @@ TEST_P(DBOptionChangeMigrationTest, Migrate1) {
|
||||
Close();
|
||||
|
||||
Options new_options = old_options;
|
||||
if (is_universal2_) {
|
||||
new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
|
||||
} else {
|
||||
new_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
||||
new_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style2_);
|
||||
if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
|
||||
}
|
||||
new_options.target_file_size_base = 256 * 1024;
|
||||
@ -113,12 +112,11 @@ TEST_P(DBOptionChangeMigrationTest, Migrate1) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(DBOptionChangeMigrationTest, Migrate2) {
|
||||
TEST_P(DBOptionChangeMigrationTests, Migrate2) {
|
||||
Options old_options = CurrentOptions();
|
||||
if (is_universal2_) {
|
||||
old_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
|
||||
} else {
|
||||
old_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
||||
old_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style2_);
|
||||
if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
|
||||
}
|
||||
old_options.level0_file_num_compaction_trigger = 3;
|
||||
@ -154,10 +152,158 @@ TEST_P(DBOptionChangeMigrationTest, Migrate2) {
|
||||
Close();
|
||||
|
||||
Options new_options = old_options;
|
||||
if (is_universal1_) {
|
||||
new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
|
||||
} else {
|
||||
new_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
||||
new_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style1_);
|
||||
if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
|
||||
}
|
||||
new_options.target_file_size_base = 256 * 1024;
|
||||
new_options.num_levels = level1_;
|
||||
new_options.max_bytes_for_level_base = 150 * 1024;
|
||||
new_options.max_bytes_for_level_multiplier = 4;
|
||||
ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
|
||||
Reopen(new_options);
|
||||
// Wait for compaction to finish and make sure it can reopen
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
Reopen(new_options);
|
||||
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (std::string key : keys) {
|
||||
ASSERT_TRUE(it->Valid());
|
||||
ASSERT_EQ(key, it->key().ToString());
|
||||
it->Next();
|
||||
}
|
||||
ASSERT_TRUE(!it->Valid());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(DBOptionChangeMigrationTests, Migrate3) {
|
||||
Options old_options = CurrentOptions();
|
||||
old_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style1_);
|
||||
if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
old_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
|
||||
}
|
||||
|
||||
old_options.level0_file_num_compaction_trigger = 3;
|
||||
old_options.write_buffer_size = 64 * 1024;
|
||||
old_options.target_file_size_base = 128 * 1024;
|
||||
// Make level target of L1, L2 to be 200KB and 600KB
|
||||
old_options.num_levels = level1_;
|
||||
old_options.max_bytes_for_level_multiplier = 3;
|
||||
old_options.max_bytes_for_level_base = 200 * 1024;
|
||||
|
||||
Reopen(old_options);
|
||||
Random rnd(301);
|
||||
for (int num = 0; num < 20; num++) {
|
||||
for (int i = 0; i < 50; i++) {
|
||||
ASSERT_OK(Put(Key(num * 100 + i), RandomString(&rnd, 900)));
|
||||
}
|
||||
Flush();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
if (num == 9) {
|
||||
// Issue a full compaction to generate some zero-out files
|
||||
CompactRangeOptions cro;
|
||||
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
||||
dbfull()->CompactRange(cro, nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
|
||||
// Will make sure exactly those keys are in the DB after migration.
|
||||
std::set<std::string> keys;
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (; it->Valid(); it->Next()) {
|
||||
keys.insert(it->key().ToString());
|
||||
}
|
||||
}
|
||||
Close();
|
||||
|
||||
Options new_options = old_options;
|
||||
new_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style2_);
|
||||
if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
new_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
|
||||
}
|
||||
new_options.target_file_size_base = 256 * 1024;
|
||||
new_options.num_levels = level2_;
|
||||
new_options.max_bytes_for_level_base = 150 * 1024;
|
||||
new_options.max_bytes_for_level_multiplier = 4;
|
||||
ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
|
||||
Reopen(new_options);
|
||||
|
||||
// Wait for compaction to finish and make sure it can reopen
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
Reopen(new_options);
|
||||
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (std::string key : keys) {
|
||||
ASSERT_TRUE(it->Valid());
|
||||
ASSERT_EQ(key, it->key().ToString());
|
||||
it->Next();
|
||||
}
|
||||
ASSERT_TRUE(!it->Valid());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(DBOptionChangeMigrationTests, Migrate4) {
|
||||
Options old_options = CurrentOptions();
|
||||
old_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style2_);
|
||||
if (old_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
old_options.level_compaction_dynamic_level_bytes = is_dynamic2_;
|
||||
}
|
||||
old_options.level0_file_num_compaction_trigger = 3;
|
||||
old_options.write_buffer_size = 64 * 1024;
|
||||
old_options.target_file_size_base = 128 * 1024;
|
||||
// Make level target of L1, L2 to be 200KB and 600KB
|
||||
old_options.num_levels = level2_;
|
||||
old_options.max_bytes_for_level_multiplier = 3;
|
||||
old_options.max_bytes_for_level_base = 200 * 1024;
|
||||
|
||||
Reopen(old_options);
|
||||
Random rnd(301);
|
||||
for (int num = 0; num < 20; num++) {
|
||||
for (int i = 0; i < 50; i++) {
|
||||
ASSERT_OK(Put(Key(num * 100 + i), RandomString(&rnd, 900)));
|
||||
}
|
||||
Flush();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
if (num == 9) {
|
||||
// Issue a full compaction to generate some zero-out files
|
||||
CompactRangeOptions cro;
|
||||
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
||||
dbfull()->CompactRange(cro, nullptr, nullptr);
|
||||
}
|
||||
}
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
|
||||
// Will make sure exactly those keys are in the DB after migration.
|
||||
std::set<std::string> keys;
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (; it->Valid(); it->Next()) {
|
||||
keys.insert(it->key().ToString());
|
||||
}
|
||||
}
|
||||
|
||||
Close();
|
||||
|
||||
Options new_options = old_options;
|
||||
new_options.compaction_style =
|
||||
static_cast<CompactionStyle>(compaction_style1_);
|
||||
if (new_options.compaction_style == CompactionStyle::kCompactionStyleLevel) {
|
||||
new_options.level_compaction_dynamic_level_bytes = is_dynamic1_;
|
||||
}
|
||||
new_options.target_file_size_base = 256 * 1024;
|
||||
@ -184,18 +330,90 @@ TEST_P(DBOptionChangeMigrationTest, Migrate2) {
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
DBOptionChangeMigrationTest, DBOptionChangeMigrationTest,
|
||||
::testing::Values(std::make_tuple(3, false, false, 4, false, false),
|
||||
std::make_tuple(3, false, true, 4, false, true),
|
||||
std::make_tuple(3, false, true, 4, false, false),
|
||||
std::make_tuple(3, false, false, 4, false, true),
|
||||
std::make_tuple(3, true, false, 4, true, false),
|
||||
std::make_tuple(1, true, false, 4, true, false),
|
||||
std::make_tuple(3, false, false, 4, true, false),
|
||||
std::make_tuple(3, false, false, 1, true, false),
|
||||
std::make_tuple(3, false, true, 4, true, false),
|
||||
std::make_tuple(3, false, true, 1, true, false),
|
||||
std::make_tuple(1, true, false, 4, false, false)));
|
||||
DBOptionChangeMigrationTests, DBOptionChangeMigrationTests,
|
||||
::testing::Values(std::make_tuple(3, 0, false, 4, 0, false),
|
||||
std::make_tuple(3, 0, true, 4, 0, true),
|
||||
std::make_tuple(3, 0, true, 4, 0, false),
|
||||
std::make_tuple(3, 0, false, 4, 0, true),
|
||||
std::make_tuple(3, 1, false, 4, 1, false),
|
||||
std::make_tuple(1, 1, false, 4, 1, false),
|
||||
std::make_tuple(3, 0, false, 4, 1, false),
|
||||
std::make_tuple(3, 0, false, 1, 1, false),
|
||||
std::make_tuple(3, 0, true, 4, 1, false),
|
||||
std::make_tuple(3, 0, true, 1, 1, false),
|
||||
std::make_tuple(1, 1, false, 4, 0, false),
|
||||
std::make_tuple(4, 0, false, 1, 2, false),
|
||||
std::make_tuple(3, 0, true, 2, 2, false),
|
||||
std::make_tuple(3, 1, false, 3, 2, false),
|
||||
std::make_tuple(1, 1, false, 4, 2, false)));
|
||||
|
||||
class DBOptionChangeMigrationTest : public DBTestBase {
|
||||
public:
|
||||
DBOptionChangeMigrationTest()
|
||||
: DBTestBase("/db_option_change_migration_test2") {}
|
||||
};
|
||||
|
||||
TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) {
|
||||
Options old_options = CurrentOptions();
|
||||
old_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
|
||||
old_options.max_compaction_bytes = 200 * 1024;
|
||||
old_options.level_compaction_dynamic_level_bytes = false;
|
||||
old_options.level0_file_num_compaction_trigger = 3;
|
||||
old_options.write_buffer_size = 64 * 1024;
|
||||
old_options.target_file_size_base = 128 * 1024;
|
||||
// Make level target of L1, L2 to be 200KB and 600KB
|
||||
old_options.num_levels = 4;
|
||||
old_options.max_bytes_for_level_multiplier = 3;
|
||||
old_options.max_bytes_for_level_base = 200 * 1024;
|
||||
|
||||
Reopen(old_options);
|
||||
Random rnd(301);
|
||||
for (int num = 0; num < 20; num++) {
|
||||
for (int i = 0; i < 50; i++) {
|
||||
ASSERT_OK(Put(Key(num * 100 + i), RandomString(&rnd, 900)));
|
||||
}
|
||||
}
|
||||
Flush();
|
||||
CompactRangeOptions cro;
|
||||
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
||||
dbfull()->CompactRange(cro, nullptr, nullptr);
|
||||
|
||||
// Will make sure exactly those keys are in the DB after migration.
|
||||
std::set<std::string> keys;
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (; it->Valid(); it->Next()) {
|
||||
keys.insert(it->key().ToString());
|
||||
}
|
||||
}
|
||||
|
||||
Close();
|
||||
|
||||
Options new_options = old_options;
|
||||
new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
|
||||
new_options.target_file_size_base = 256 * 1024;
|
||||
new_options.num_levels = 1;
|
||||
new_options.max_bytes_for_level_base = 150 * 1024;
|
||||
new_options.max_bytes_for_level_multiplier = 4;
|
||||
ASSERT_OK(OptionChangeMigration(dbname_, old_options, new_options));
|
||||
Reopen(new_options);
|
||||
// Wait for compaction to finish and make sure it can reopen
|
||||
dbfull()->TEST_WaitForFlushMemTable();
|
||||
dbfull()->TEST_WaitForCompact();
|
||||
Reopen(new_options);
|
||||
|
||||
{
|
||||
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
|
||||
it->SeekToFirst();
|
||||
for (std::string key : keys) {
|
||||
ASSERT_TRUE(it->Valid());
|
||||
ASSERT_EQ(key, it->key().ToString());
|
||||
it->Next();
|
||||
}
|
||||
ASSERT_TRUE(!it->Valid());
|
||||
}
|
||||
}
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
} // namespace rocksdb
|
||||
|
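The test rework above replaces the boolean "is_universal" tuple slots with integer compaction-style codes so FIFO can join the parameter matrix. For reference, a minimal gtest sketch of the same tuple-parameterized pattern; the fixture, values, and assertions are illustrative and unrelated to RocksDB:

```cpp
#include <tuple>
#include <gtest/gtest.h>

// Each tuple element selects one dimension of the test matrix, as in
// DBOptionChangeMigrationTests (levels, compaction style code, dynamic flag).
class StyleMatrixTest
    : public ::testing::TestWithParam<std::tuple<int, int, bool>> {};

TEST_P(StyleMatrixTest, DimensionsAreInRange) {
  int levels = std::get<0>(GetParam());
  int style = std::get<1>(GetParam());
  bool dynamic = std::get<2>(GetParam());
  EXPECT_GE(levels, 1);
  EXPECT_GE(style, 0);
  (void)dynamic;
}

INSTANTIATE_TEST_CASE_P(StyleMatrix, StyleMatrixTest,
                        ::testing::Values(std::make_tuple(3, 0, false),
                                          std::make_tuple(1, 1, true)));

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
```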
@ -9,6 +9,7 @@
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <functional>
#include <memory>
#include <vector>
@ -6,6 +6,7 @@

#ifndef ROCKSDB_LITE

#include <functional>
#include <list>
#include <memory>
#include <string>
@ -7,6 +7,7 @@

#ifndef ROCKSDB_LITE

#include <functional>
#include "util/random.h"
#include "utilities/persistent_cache/hash_table.h"
#include "utilities/persistent_cache/lrulist.h"