Compare commits: main...5.9.fb.myr (25 commits)

Commits:
4d7b0d528c, 8486eab5ad, 497cd90d6a, 4d06d2862d, d1939b0dca,
920b7df154, 7b3bc81f82, 84afb8a524, 3c3f7b12ec, 9debbba3a5,
3984b44095, 28d93aea53, 5652b6e57f, adee21951b, e58d377182,
3e2998658f, c468fd127b, c3efe60855, a0016dc358, 49e764a468,
7c27f3ddc6, 37db5f3e89, cb9104c92c, f1b040c14a, d070003313
CMakeLists.txt
@@ -811,7 +811,6 @@ if(WITH_TESTS)
         util/hash_test.cc
         util/heap_test.cc
         util/rate_limiter_test.cc
-        util/slice_test.cc
         util/slice_transform_test.cc
         util/timer_queue_test.cc
         util/thread_list_test.cc
HISTORY.md
@@ -1,8 +1,8 @@
 # Rocksdb Change Log
-## Unreleased
-### Public API Change
-### New Features
+## 5.9.1 (11/28/2017)
+### Bug Fixes
+* Fix IOError on WAL write doesn't propagate to write group follower
+* Fix calculating filter partition target size
 
 ## 5.9.0 (11/1/2017)
 ### Public API Change
4 Makefile
@@ -494,7 +494,6 @@ TESTS = \
 	repair_test \
 	env_timed_test \
 	write_prepared_transaction_test \
-	slice_test \
 
 PARALLEL_TEST = \
 	backupable_db_test \
@@ -1478,9 +1477,6 @@ range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LI
 blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(AM_LINK)
 
-slice_test: util/slice_test.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(AM_LINK)
-
 #-------------------------------------------------
 # make install related stuff
 INSTALL_PATH ?= /usr/local
buckifier/targets_builder.py
@@ -3,10 +3,8 @@ from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
 import targets_cfg
-import pprint
 
 # TODO(tec): replace this with PrettyPrinter
-def pretty_list(lst, indent=6):
+def pretty_list(lst, indent=8):
     if lst is None or len(lst) == 0:
         return ""
 
@@ -14,8 +12,8 @@ def pretty_list(lst, indent=6):
         return "\"%s\"" % lst[0]
 
     separator = "\",\n%s\"" % (" " * indent)
-    res = separator.join(lst)
-    res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 2))
+    res = separator.join(sorted(lst))
+    res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4))
     return res


@@ -27,7 +25,7 @@ class TARGETSBuilder:
         self.total_lib = 0
         self.total_bin = 0
         self.total_test = 0
-        self.tests_cfg = []
+        self.tests_cfg = ""
 
     def __del__(self):
         self.targets_file.close()
@@ -37,8 +35,8 @@ class TARGETSBuilder:
         headers = "AutoHeaders.RECURSIVE_GLOB"
         self.targets_file.write(targets_cfg.library_template % (
             name,
-            headers,
             pretty_list(srcs),
+            headers,
             pretty_list(deps)))
         self.total_lib = self.total_lib + 1
 
@@ -53,13 +51,13 @@ class TARGETSBuilder:
         exec_mode = "serial"
         if is_parallel:
             exec_mode = "parallel"
-        self.tests_cfg.append([test_name, str(src), str(exec_mode)])
+        self.tests_cfg += targets_cfg.test_cfg_template % (
+            test_name,
+            str(src),
+            str(exec_mode))
 
         self.total_test = self.total_test + 1
 
     def flush_tests(self):
-        self.targets_file.write(targets_cfg.unittests_template % (
-            pprint.PrettyPrinter().pformat(self.tests_cfg)
-        ))
-
-        self.tests_cfg = []
+        self.targets_file.write(targets_cfg.unittests_template % self.tests_cfg)
+        self.tests_cfg = ""
buckifier/targets_cfg.py
@@ -2,13 +2,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-rocksdb_target_header = """
-import os
+rocksdb_target_header = """REPO_PATH = package_name() + "/"
 
-TARGETS_PATH = os.path.dirname(__file__)
-REPO_PATH = "rocksdb/src/"
 BUCK_BINS = "buck-out/gen/" + REPO_PATH
 
 TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh"
 
 rocksdb_compiler_flags = [
     "-fno-builtin-memcmp",
     "-DROCKSDB_PLATFORM_POSIX",
@@ -33,13 +32,13 @@ rocksdb_compiler_flags = [
 ]
 
 rocksdb_external_deps = [
-    ('bzip2', None, 'bz2'),
-    ('snappy', None, "snappy"),
-    ('zlib', None, 'z'),
-    ('gflags', None, 'gflags'),
-    ('lz4', None, 'lz4'),
-    ('zstd', None),
-    ('tbb', None),
+    ("bzip2", None, "bz2"),
+    ("snappy", None, "snappy"),
+    ("zlib", None, "z"),
+    ("gflags", None, "gflags"),
+    ("lz4", None, "lz4"),
+    ("zstd", None),
+    ("tbb", None),
+    ("numa", None, "numa"),
+    ("googletest", None, "gtest"),
 ]
@@ -53,18 +52,27 @@ rocksdb_preprocessor_flags = [
 rocksdb_arch_preprocessor_flags = {
     "x86_64": ["-DHAVE_SSE42"],
 }
 
+build_mode = read_config("fbcode", "build_mode")
+
+is_opt_mode = build_mode.startswith("opt")
+
+# -DNDEBUG is added by default in opt mode in fbcode. But adding it twice
+# doesn't harm and avoid forgetting to add it.
+if is_opt_mode:
+    rocksdb_compiler_flags.append("-DNDEBUG")
 """
 
 
 library_template = """
 cpp_library(
     name = "%s",
-    headers = %s,
     srcs = [%s],
-    deps = [%s],
-    preprocessor_flags = rocksdb_preprocessor_flags,
+    headers = %s,
     arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
     compiler_flags = rocksdb_compiler_flags,
+    preprocessor_flags = rocksdb_preprocessor_flags,
+    deps = [%s],
     external_deps = rocksdb_external_deps,
 )
 """
@@ -73,21 +81,31 @@ binary_template = """
 cpp_binary(
     name = "%s",
     srcs = [%s],
-    deps = [%s],
-    preprocessor_flags = rocksdb_preprocessor_flags,
     arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
     compiler_flags = rocksdb_compiler_flags,
+    preprocessor_flags = rocksdb_preprocessor_flags,
+    deps = [%s],
     external_deps = rocksdb_external_deps,
 )
 """
 
+test_cfg_template = """  [
+        "%s",
+        "%s",
+        "%s",
+    ],
+"""
+
 unittests_template = """
 # [test_name, test_src, test_type]
-ROCKS_TESTS = %s
-
+ROCKS_TESTS = [
+%s]
 
 # Generate a test rule for each entry in ROCKS_TESTS
-for test_cfg in ROCKS_TESTS:
+# Do not build the tests in opt mode, since SyncPoint and other test code
+# will not be included.
+if not is_opt_mode:
+    for test_cfg in ROCKS_TESTS:
         test_name = test_cfg[0]
         test_cc = test_cfg[1]
         ttype = "gtest" if test_cfg[2] == "parallel" else "simple"
@@ -112,13 +130,13 @@ for test_cfg in ROCKS_TESTS:
 
 custom_unittest(
     name = "make_rocksdbjavastatic",
-    type = "simple",
     command = ["internal_repo_rocksdb/make_rocksdbjavastatic.sh"],
+    type = "simple",
 )
 
 custom_unittest(
     name = "make_rocksdb_lite_release",
-    type = "simple",
     command = ["internal_repo_rocksdb/make_rocksdb_lite_release.sh"],
+    type = "simple",
 )
 """
db/column_family.cc
@@ -949,6 +949,10 @@ void ColumnFamilyData::InstallSuperVersion(
   RecalculateWriteStallConditions(mutable_cf_options);
 
   if (old_superversion != nullptr) {
+    if (old_superversion->mutable_cf_options.write_buffer_size !=
+        mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+    }
     if (old_superversion->write_stall_condition !=
        new_superversion->write_stall_condition) {
       sv_context->PushWriteStallNotification(
db/compaction_iterator.cc
@@ -182,7 +182,8 @@ void CompactionIterator::Next() {
 
 void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
                                               Slice* skip_until) {
-  if (compaction_filter_ != nullptr && ikey_.type == kTypeValue &&
+  if (compaction_filter_ != nullptr &&
+      (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex) &&
       (visible_at_tip_ || ikey_.sequence > latest_snapshot_ ||
        ignore_snapshots_)) {
     // If the user has specified a compaction filter and the sequence
@@ -192,11 +193,13 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
     CompactionFilter::Decision filter;
     compaction_filter_value_.clear();
     compaction_filter_skip_until_.Clear();
+    CompactionFilter::ValueType value_type =
+        ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+                                 : CompactionFilter::ValueType::kBlobIndex;
     {
       StopWatchNano timer(env_, true);
       filter = compaction_filter_->FilterV2(
-          compaction_->level(), ikey_.user_key,
-          CompactionFilter::ValueType::kValue, value_,
+          compaction_->level(), ikey_.user_key, value_type, value_,
          &compaction_filter_value_, compaction_filter_skip_until_.rep());
       iter_stats_.total_filter_time +=
           env_ != nullptr ? timer.ElapsedNanos() : 0;
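The call-site change above widens the compaction filter to blob indexes. As a user-side illustration only (not part of this diff), the sketch below shows one way a `FilterV2` implementation can distinguish the two value types; it assumes the standard `rocksdb::CompactionFilter` interface, and the prefix-dropping policy is hypothetical.

```cpp
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Keeps blob indexes untouched and drops plain values whose key starts with
// a given prefix.
class DropPrefixFilter : public rocksdb::CompactionFilter {
 public:
  explicit DropPrefixFilter(std::string prefix) : prefix_(std::move(prefix)) {}

  Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                    ValueType value_type, const rocksdb::Slice& /*value*/,
                    std::string* /*new_value*/,
                    std::string* /*skip_until*/) const override {
    if (value_type == ValueType::kBlobIndex) {
      // The value is only a pointer into a blob file, so do not touch it.
      return Decision::kKeep;
    }
    return key.starts_with(prefix_) ? Decision::kRemove : Decision::kKeep;
  }

  const char* Name() const override { return "DropPrefixFilter"; }

 private:
  std::string prefix_;
};
```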
db/compaction_job_test.cc
@@ -143,7 +143,7 @@ class CompactionJobTest : public testing::Test {
   }
 
   void SetLastSequence(const SequenceNumber sequence_number) {
-    versions_->SetLastToBeWrittenSequence(sequence_number + 1);
+    versions_->SetLastAllocatedSequence(sequence_number + 1);
     versions_->SetLastSequence(sequence_number + 1);
   }
 
db/db_compaction_test.cc
@@ -1517,6 +1517,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) {
   ASSERT_GT(old_num_files, new_num_files);
 }
 
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+  // regression test for #2833: groups of files whose user-keys overlap at the
+  // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+  // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of files that a range deletion spans.
+  const int kNumL0Files = 2;
+  const int kValSize = 8 << 10;  // 8KB
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.target_file_size_base = 1 << 10;  // 1KB
+  DestroyAndReopen(options);
+
+  // The snapshot prevents key 1 from having its old version dropped. The low
+  // `target_file_size_base` ensures two keys will be in each output file.
+  const Snapshot* snapshot = nullptr;
+  Random rnd(301);
+  // The value indicates which flush the key belonged to, which is enough
+  // for us to determine the keys' relative ages. After L0 flushes finish,
+  // files look like:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[0]
+  // File 1: 1 -> vals[1], 2 -> vals[1]
+  //
+  // Then L0->L1 compaction happens, which outputs keys as follows:
+  //
+  // File 0: 0 -> vals[0], 1 -> vals[1]
+  // File 1: 1 -> vals[0], 2 -> vals[1]
+  //
+  // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+  // would cause `1 -> vals[0]` (an older key) to reappear.
+  std::string vals[kNumL0Files];
+  for (int i = 0; i < kNumL0Files; ++i) {
+    vals[i] = RandomString(&rnd, kValSize);
+    Put(Key(i), vals[i]);
+    Put(Key(i + 1), vals[i]);
+    Flush();
+    if (i == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify `DeleteFilesInRange` can't drop only file 0 which would cause
+  // "1 -> vals[0]" to reappear.
+  Slice begin = Key(0);
+  Slice end = Key(1);
+  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_EQ(vals[1], Get(Key(1)));
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
   int32_t trivial_move = 0;
   int32_t non_trivial_move = 0;
db/db_impl.cc
@@ -136,7 +136,8 @@ void DumpSupportInfo(Logger* logger) {
 int64_t kDefaultLowPriThrottledRate = 2 * 1024 * 1024;
 }  // namespace
 
-DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+               const bool seq_per_batch)
     : env_(options.env),
       dbname_(dbname),
       initial_db_options_(SanitizeOptions(dbname, options)),
@@ -185,18 +186,30 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
           env_options_, immutable_db_options_)),
       num_running_ingest_file_(0),
 #ifndef ROCKSDB_LITE
-      wal_manager_(immutable_db_options_, env_options_),
+      wal_manager_(immutable_db_options_, env_options_, seq_per_batch),
 #endif  // ROCKSDB_LITE
       event_logger_(immutable_db_options_.info_log.get()),
       bg_work_paused_(0),
       bg_compaction_paused_(0),
       refitting_level_(false),
       opened_successfully_(false),
-      concurrent_prepare_(options.concurrent_prepare),
+      two_write_queues_(options.two_write_queues),
       manual_wal_flush_(options.manual_wal_flush),
-      seq_per_batch_(options.seq_per_batch),
-      // TODO(myabandeh): revise this when we change options.seq_per_batch
-      use_custom_gc_(options.seq_per_batch),
+      seq_per_batch_(seq_per_batch),
+      // When two_write_queues_ and seq_per_batch_ are both enabled we
+      // sometimes allocate a seq also to indicate the commit timestmamp of a
+      // transaction. In such cases last_sequence_ would not indicate the last
+      // visible sequence number in memtable and should not be used for
+      // snapshots. It should use last_allocated_sequence_ instaed but also
+      // needs other mechanisms to exclude the data that after last_sequence_
+      // and before last_allocated_sequence_ from the snapshot. In
+      // WritePreparedTxn this property is ensured since such data are not
+      // committed yet.
+      allocate_seq_only_for_data_(!(seq_per_batch && options.two_write_queues)),
+      // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+      // requires a custom gc for compaction, we use that to set use_custom_gc_
+      // as well.
+      use_custom_gc_(seq_per_batch),
       preserve_deletes_(options.preserve_deletes) {
   env_->GetAbsolutePath(dbname, &db_absolute_path_);
@@ -751,7 +764,7 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const {
 }
 
 SequenceNumber DBImpl::IncAndFetchSequenceNumber() {
-  return versions_->FetchAddLastToBeWrittenSequence(1ull) + 1ull;
+  return versions_->FetchAddLastAllocatedSequence(1ull) + 1ull;
 }
 
 bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) {
@@ -977,9 +990,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
     // super versipon because a flush happening in between may compact
     // away data for the snapshot, but the snapshot is earlier than the
     // data overwriting it, so users may see wrong results.
-    snapshot = concurrent_prepare_ && seq_per_batch_
-                   ? versions_->LastToBeWrittenSequence()
-                   : versions_->LastSequence();
+    snapshot = allocate_seq_only_for_data_ ? versions_->LastSequence()
+                                           : versions_->LastAllocatedSequence();
   }
   TEST_SYNC_POINT("DBImpl::GetImpl:3");
   TEST_SYNC_POINT("DBImpl::GetImpl:4");
@@ -1070,9 +1082,8 @@ std::vector<Status> DBImpl::MultiGet(
     snapshot = reinterpret_cast<const SnapshotImpl*>(
         read_options.snapshot)->number_;
   } else {
-    snapshot = concurrent_prepare_ && seq_per_batch_
-                   ? versions_->LastToBeWrittenSequence()
-                   : versions_->LastSequence();
+    snapshot = allocate_seq_only_for_data_ ? versions_->LastSequence()
+                                           : versions_->LastAllocatedSequence();
   }
   for (auto mgd_iter : multiget_cf_data) {
     mgd_iter.second->super_version =
@@ -1478,8 +1489,9 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
                                  read_callback);
 #endif
   } else {
-    // Note: no need to consider the special case of concurrent_prepare_ &&
-    // seq_per_batch_ since NewIterator is overridden in WritePreparedTxnDB
+    // Note: no need to consider the special case of
+    // allocate_seq_only_for_data_==false since NewIterator is overridden in
+    // WritePreparedTxnDB
     auto snapshot = read_options.snapshot != nullptr
                         ? read_options.snapshot->GetSequenceNumber()
                         : versions_->LastSequence();
@@ -1595,8 +1607,9 @@ Status DBImpl::NewIterators(
     }
 #endif
   } else {
-    // Note: no need to consider the special case of concurrent_prepare_ &&
-    // seq_per_batch_ since NewIterators is overridden in WritePreparedTxnDB
+    // Note: no need to consider the special case of
+    // allocate_seq_only_for_data_==false since NewIterators is overridden in
+    // WritePreparedTxnDB
     auto snapshot = read_options.snapshot != nullptr
                         ? read_options.snapshot->GetSequenceNumber()
                         : versions_->LastSequence();
@@ -1630,9 +1643,9 @@ const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) {
     delete s;
     return nullptr;
   }
-  auto snapshot_seq = concurrent_prepare_ && seq_per_batch_
-                          ? versions_->LastToBeWrittenSequence()
-                          : versions_->LastSequence();
+  auto snapshot_seq = allocate_seq_only_for_data_
+                          ? versions_->LastSequence()
+                          : versions_->LastAllocatedSequence();
   return snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
 }
 
@@ -1643,9 +1656,9 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
   snapshots_.Delete(casted_s);
   uint64_t oldest_snapshot;
   if (snapshots_.empty()) {
-    oldest_snapshot = concurrent_prepare_ && seq_per_batch_
-                          ? versions_->LastToBeWrittenSequence()
-                          : versions_->LastSequence();
+    oldest_snapshot = allocate_seq_only_for_data_
+                          ? versions_->LastSequence()
+                          : versions_->LastAllocatedSequence();
   } else {
     oldest_snapshot = snapshots_.oldest()->number_;
   }
@@ -1663,12 +1676,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
   delete casted_s;
 }
 
-bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) {
+bool DBImpl::HasActiveSnapshotInRange(SequenceNumber lower_bound,
+                                      SequenceNumber upper_bound) {
   InstrumentedMutexLock l(&mutex_);
   if (snapshots_.empty()) {
     return false;
   }
-  return (snapshots_.newest()->GetSequenceNumber() >= sn);
+  return snapshots_.HasSnapshotInRange(lower_bound, upper_bound);
 }
 
 #ifndef ROCKSDB_LITE
@@ -2147,17 +2158,12 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
       end_key = &end_storage;
     }
 
-    vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1,
-                                   nullptr, false);
+    vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key,
+                                           &level_files, -1 /* hint_index */,
+                                           nullptr /* file_index */);
     FileMetaData* level_file;
     for (uint32_t j = 0; j < level_files.size(); j++) {
       level_file = level_files[j];
-      if (((begin == nullptr) ||
-           (cfd->internal_comparator().user_comparator()->Compare(
-                level_file->smallest.user_key(), *begin) >= 0)) &&
-          ((end == nullptr) ||
-           (cfd->internal_comparator().user_comparator()->Compare(
-                level_file->largest.user_key(), *end) <= 0))) {
       if (level_file->being_compacted) {
         continue;
       }
@@ -2167,7 +2173,6 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family,
       level_file->being_compacted = true;
     }
   }
-  }
   if (edit.GetDeletedFiles().empty()) {
     job_context.Clean();
     return Status::OK();
@@ -2755,7 +2760,7 @@ Status DBImpl::IngestExternalFile(
   WriteThread::Writer w;
   write_thread_.EnterUnbatched(&w, &mutex_);
   WriteThread::Writer nonmem_w;
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
   }
@@ -2798,7 +2803,7 @@ Status DBImpl::IngestExternalFile(
   }
 
   // Resume writes to the DB
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     nonmem_write_thread_.ExitUnbatched(&nonmem_w);
   }
   write_thread_.ExitUnbatched(&w);
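For orientation (illustration only, not part of the diff): the replacement of `concurrent_prepare_ && seq_per_batch_` by `allocate_seq_only_for_data_` revolves around two counters, the last visible sequence and the last allocated sequence. A minimal standalone sketch of the snapshot-sequence choice, with hypothetical names mirroring the hunks above:

```cpp
#include <cassert>
#include <cstdint>

using SequenceNumber = uint64_t;

struct SequenceCounters {
  SequenceNumber last_sequence;            // last sequence visible to reads
  SequenceNumber last_allocated_sequence;  // last sequence handed out at all
};

// Mirrors GetSnapshotImpl()/GetImpl() above: when sequences may also be
// allocated for non-data purposes (e.g. WritePrepared commit markers), a
// snapshot has to cover everything allocated so far, not only what is
// already visible in the memtable.
SequenceNumber SnapshotSequence(const SequenceCounters& c,
                                bool allocate_seq_only_for_data) {
  assert(c.last_sequence <= c.last_allocated_sequence);
  return allocate_seq_only_for_data ? c.last_sequence
                                    : c.last_allocated_sequence;
}
```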
50 db/db_impl.h
@@ -68,7 +68,8 @@ struct MemTableInfo;
 
 class DBImpl : public DB {
  public:
-  DBImpl(const DBOptions& options, const std::string& dbname);
+  DBImpl(const DBOptions& options, const std::string& dbname,
+         const bool seq_per_batch = false);
   virtual ~DBImpl();
 
   // Implementations of the DB interface
@@ -220,14 +221,16 @@ class DBImpl : public DB {
 
   virtual SequenceNumber GetLatestSequenceNumber() const override;
   virtual SequenceNumber IncAndFetchSequenceNumber();
-  // Returns LastToBeWrittenSequence in concurrent_prepare_ && seq_per_batch_
-  // mode and LastSequence otherwise. This is useful when visiblility depends
-  // also on data written to the WAL but not to the memtable.
-  SequenceNumber TEST_GetLatestVisibleSequenceNumber() const;
+  // Returns LastSequence in allocate_seq_only_for_data_
+  // mode and LastAllocatedSequence otherwise. This is useful when visiblility
+  // depends also on data written to the WAL but not to the memtable.
+  SequenceNumber TEST_GetLastVisibleSequence() const;
 
   virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override;
 
-  bool HasActiveSnapshotLaterThanSN(SequenceNumber sn);
+  // Whether there is an active snapshot in range [lower_bound, upper_bound).
+  bool HasActiveSnapshotInRange(SequenceNumber lower_bound,
+                                SequenceNumber upper_bound);
 
 #ifndef ROCKSDB_LITE
   using DB::ResetStats;
@@ -604,6 +607,12 @@ class DBImpl : public DB {
 
   Status NewDB();
 
+  // This is to be used only by internal rocksdb classes.
+  static Status Open(const DBOptions& db_options, const std::string& name,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                     const bool seq_per_batch);
+
  protected:
   Env* const env_;
   const std::string dbname_;
@@ -903,12 +912,12 @@ class DBImpl : public DB {
   FileLock* db_lock_;
 
   // In addition to mutex_, log_write_mutex_ protected writes to logs_ and
-  // logfile_number_. With concurrent_prepare it also protects alive_log_files_,
+  // logfile_number_. With two_write_queues it also protects alive_log_files_,
   // and log_empty_. Refer to the definition of each variable below for more
   // details.
   InstrumentedMutex log_write_mutex_;
   // State below is protected by mutex_
-  // With concurrent_prepare enabled, some of the variables that accessed during
+  // With two_write_queues enabled, some of the variables that accessed during
   // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
   // logs_, logfile_number_. Refer to the definition of each variable below for
   // more description.
@@ -933,10 +942,10 @@ class DBImpl : public DB {
   std::deque<uint64_t>
       log_recycle_files;  // a list of log files that we can recycle
   bool log_dir_synced_;
-  // Without concurrent_prepare, read and writes to log_empty_ are protected by
+  // Without two_write_queues, read and writes to log_empty_ are protected by
   // mutex_. Since it is currently updated/read only in write_thread_, it can be
   // accessed from the same write_thread_ without any locks. With
-  // concurrent_prepare writes, where it can be updated in different threads,
+  // two_write_queues writes, where it can be updated in different threads,
   // read and writes are protected by log_write_mutex_ instead. This is to avoid
   // expesnive mutex_ lock during WAL write, which update log_empty_.
   bool log_empty_;
@@ -973,10 +982,10 @@ class DBImpl : public DB {
     // true for some prefix of logs_
     bool getting_synced = false;
   };
-  // Without concurrent_prepare, read and writes to alive_log_files_ are
+  // Without two_write_queues, read and writes to alive_log_files_ are
   // protected by mutex_. However since back() is never popped, and push_back()
   // is done only from write_thread_, the same thread can access the item
-  // reffered by back() without mutex_. With concurrent_prepare_, writes
+  // reffered by back() without mutex_. With two_write_queues_, writes
   // are protected by locking both mutex_ and log_write_mutex_, and reads must
   // be under either mutex_ or log_write_mutex_.
   std::deque<LogFileNumberSize> alive_log_files_;
@@ -1001,7 +1010,7 @@ class DBImpl : public DB {
   // memtable on normal writes and hence improving the throughput. Each new
   // write of the state will replace the previous state entirely even if the
   // keys in the two consecuitive states do not overlap.
-  // It is protected by log_write_mutex_ when concurrent_prepare_ is enabled.
+  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
   // Otherwise only the heaad of write_thread_ can access it.
   WriteBatch cached_recoverable_state_;
   std::atomic<bool> cached_recoverable_state_empty_ = {true};
@@ -1317,9 +1326,22 @@ class DBImpl : public DB {
 
   // When set, we use a seprate queue for writes that dont write to memtable. In
   // 2PC these are the writes at Prepare phase.
-  const bool concurrent_prepare_;
+  const bool two_write_queues_;
   const bool manual_wal_flush_;
+  // Increase the sequence number after writing each batch, whether memtable is
+  // disabled for that or not. Otherwise the sequence number is increased after
+  // writing each key into memtable. This implies that when disable_memtable is
+  // set, the seq is not increased at all.
+  //
+  // Default: false
   const bool seq_per_batch_;
+  // A sequence number is allocated only for data written to DB. Otherwise it
+  // could also be allocated for operational purposes such as commit timestamp
+  // of a transaction.
+  const bool allocate_seq_only_for_data_;
+  // It indicates that a customized gc algorithm must be used for
+  // flush/compaction and if it is not provided vis SnapshotChecker, we should
+  // disable gc to be safe.
   const bool use_custom_gc_;
 
   // Clients must periodically call SetPreserveDeletesSequenceNumber()
db/db_impl_debug.cc
@@ -209,11 +209,11 @@ int DBImpl::TEST_BGFlushesAllowed() const {
   return GetBGJobLimits().max_flushes;
 }
 
-SequenceNumber DBImpl::TEST_GetLatestVisibleSequenceNumber() const {
-  if (concurrent_prepare_ && seq_per_batch_) {
-    return versions_->LastToBeWrittenSequence();
-  } else {
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+  if (allocate_seq_only_for_data_) {
     return versions_->LastSequence();
+  } else {
+    return versions_->LastAllocatedSequence();
   }
 }
 
db/db_impl_files.cc
@@ -252,11 +252,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
       }
       job_context->size_log_to_delete += earliest.size;
       total_log_size_ -= earliest.size;
-      if (concurrent_prepare_) {
+      if (two_write_queues_) {
         log_write_mutex_.Lock();
       }
       alive_log_files_.pop_front();
-      if (concurrent_prepare_) {
+      if (two_write_queues_) {
         log_write_mutex_.Unlock();
       }
       // Current log should always stay alive since it can't have
db/db_impl_open.cc
@@ -592,9 +592,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
         // happen when we open and write to a corrupted DB, where sequence id
         // will start from the last sequence id we recovered.
         if (sequence == *next_sequence ||
-            // With seq_per_batch_, if previous run was with concurrent_prepare_
-            // then gap in the sequence numbers is expected by the commits
-            // without prepares.
+            // With seq_per_batch_, if previous run was with two_write_queues_
+            // then allocate_seq_only_for_data_ was disabled and a gap in the
+            // sequence numbers in the log is expected by the commits without
+            // prepares.
             (seq_per_batch_ && sequence >= *next_sequence)) {
           stop_replay_for_corruption = false;
         }
@@ -754,7 +755,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
     auto last_sequence = *next_sequence - 1;
     if ((*next_sequence != kMaxSequenceNumber) &&
         (versions_->LastSequence() <= last_sequence)) {
-      versions_->SetLastToBeWrittenSequence(last_sequence);
+      versions_->SetLastAllocatedSequence(last_sequence);
       versions_->SetLastSequence(last_sequence);
     }
   }
@@ -845,13 +846,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
   if (data_seen && !flushed) {
     // Mark these as alive so they'll be considered for deletion later by
     // FindObsoleteFiles()
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
       log_write_mutex_.Lock();
     }
     for (auto log_number : log_numbers) {
       alive_log_files_.push_back(LogFileNumberSize(log_number));
     }
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
       log_write_mutex_.Unlock();
     }
   }
@@ -966,6 +967,15 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
 Status DB::Open(const DBOptions& db_options, const std::string& dbname,
                 const std::vector<ColumnFamilyDescriptor>& column_families,
                 std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+  const bool seq_per_batch = true;
+  return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+                      !seq_per_batch);
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+                    const std::vector<ColumnFamilyDescriptor>& column_families,
+                    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+                    const bool seq_per_batch) {
   Status s = SanitizeOptionsByTable(db_options, column_families);
   if (!s.ok()) {
     return s;
@@ -985,7 +995,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
         std::max(max_write_buffer_size, cf.options.write_buffer_size);
   }
 
-  DBImpl* impl = new DBImpl(db_options, dbname);
+  DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch);
   s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir);
   if (s.ok()) {
     for (auto db_path : impl->immutable_db_options_.db_paths) {
@@ -1070,12 +1080,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
           cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
     }
     sv_context.Clean();
-    if (impl->concurrent_prepare_) {
+    if (impl->two_write_queues_) {
       impl->log_write_mutex_.Lock();
     }
     impl->alive_log_files_.push_back(
         DBImpl::LogFileNumberSize(impl->logfile_number_));
-    if (impl->concurrent_prepare_) {
+    if (impl->two_write_queues_) {
       impl->log_write_mutex_.Unlock();
     }
     impl->DeleteObsoleteFiles();
db/db_impl_write.cc
@@ -67,7 +67,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   if (write_options.sync && write_options.disableWAL) {
     return Status::InvalidArgument("Sync writes has to enable WAL.");
   }
-  if (concurrent_prepare_ && immutable_db_options_.enable_pipelined_write) {
+  if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
     return Status::NotSupported(
         "pipelined_writes is not compatible with concurrent prepares");
   }
@@ -87,7 +87,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     }
   }
 
-  if (concurrent_prepare_ && disable_memtable) {
+  if (two_write_queues_ && disable_memtable) {
     return WriteImplWALOnly(write_options, my_batch, callback, log_used,
                             log_ref, seq_used);
   }
@@ -154,7 +154,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   WriteThread::WriteGroup write_group;
   bool in_parallel_group = false;
   uint64_t last_sequence = kMaxSequenceNumber;
-  if (!concurrent_prepare_) {
+  if (!two_write_queues_) {
     last_sequence = versions_->LastSequence();
   }
@@ -162,7 +162,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
   bool need_log_sync = write_options.sync;
   bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
-  if (!concurrent_prepare_ || !disable_memtable) {
+  if (!two_write_queues_ || !disable_memtable) {
     // With concurrent writes we do preprocess only in the write thread that
     // also does write to memtable to avoid sync issue on shared data structure
     // with the other thread
@@ -209,7 +209,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   }
   size_t seq_inc = seq_per_batch_ ? write_group.size : total_count;
 
-  const bool concurrent_update = concurrent_prepare_;
+  const bool concurrent_update = two_write_queues_;
   // Update stats while we are an exclusive group leader, so we know
   // that nobody else can be writing to these particular stats.
   // We're optimistic, updating the stats before we successfully
@@ -237,7 +237,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
   PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-  if (!concurrent_prepare_) {
+  if (!two_write_queues_) {
     if (status.ok() && !write_options.disableWAL) {
       PERF_TIMER_GUARD(write_wal_time);
       status = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
@@ -246,13 +246,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   } else {
     if (status.ok() && !write_options.disableWAL) {
       PERF_TIMER_GUARD(write_wal_time);
-      // LastToBeWrittenSequence is increased inside WriteToWAL under
+      // LastAllocatedSequence is increased inside WriteToWAL under
       // wal_write_mutex_ to ensure ordered events in WAL
       status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
                                     seq_inc);
     } else {
       // Otherwise we inc seq number for memtable writes
-      last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
+      last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
     }
   }
   assert(last_sequence != kMaxSequenceNumber);
@@ -310,9 +310,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     mutex_.Lock();
     MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
     mutex_.Unlock();
-    // Requesting sync with concurrent_prepare_ is expected to be very rare. We
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
     // hance provide a simple implementation that is not necessarily efficient.
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
       if (manual_wal_flush_) {
         status = FlushWAL(true);
       } else {
@@ -332,7 +332,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       versions_->SetLastSequence(last_sequence);
     }
     MemTableInsertStatusCheck(w.status);
-    write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
+    write_thread_.ExitAsBatchGroupLeader(write_group, status);
   }
 
   if (status.ok()) {
@@ -532,7 +532,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
   PERF_TIMER_STOP(write_pre_and_post_process_time);
 
   PERF_TIMER_GUARD(write_wal_time);
-  // LastToBeWrittenSequence is increased inside WriteToWAL under
+  // LastAllocatedSequence is increased inside WriteToWAL under
   // wal_write_mutex_ to ensure ordered events in WAL
   size_t seq_inc = seq_per_batch_ ? write_group.size : 0 /*total_count*/;
   status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
@@ -548,7 +548,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
     }
   }
   if (status.ok() && write_options.sync) {
-    // Requesting sync with concurrent_prepare_ is expected to be very rare. We
+    // Requesting sync with two_write_queues_ is expected to be very rare. We
     // hance provide a simple implementation that is not necessarily efficient.
     if (manual_wal_flush_) {
       status = FlushWAL(true);
@@ -561,7 +561,7 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options,
   if (!w.CallbackFailed()) {
     WriteCallbackStatusCheck(status);
   }
-  nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, w.status);
+  nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status);
   if (status.ok()) {
     status = w.FinalStatus();
   }
@@ -719,7 +719,7 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
   return merged_batch;
 }
 
-// When concurrent_prepare_ is disabled, this function is called from the only
+// When two_write_queues_ is disabled, this function is called from the only
 // write thread. Otherwise this must be called holding log_write_mutex_.
 Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                           log::Writer* log_writer, uint64_t* log_used,
@@ -828,7 +828,7 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
       writer->log_used = logfile_number_;
     }
   }
-  *last_sequence = versions_->FetchAddLastToBeWrittenSequence(seq_inc);
+  *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
   auto sequence = *last_sequence + 1;
   WriteBatchInternal::SetSequence(merged_batch, sequence);
@@ -858,7 +858,7 @@ Status DBImpl::WriteRecoverableState() {
   if (!cached_recoverable_state_empty_) {
     bool dont_care_bool;
     SequenceNumber next_seq;
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
      log_write_mutex_.Lock();
    }
    SequenceNumber seq = versions_->LastSequence();
@@ -869,7 +869,7 @@ Status DBImpl::WriteRecoverableState() {
         false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool,
         seq_per_batch_);
     versions_->SetLastSequence(--next_seq);
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
       log_write_mutex_.Unlock();
     }
     if (status.ok()) {
@@ -1109,7 +1109,7 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd,
 Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
   mutex_.AssertHeld();
   WriteThread::Writer nonmem_w;
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     // SwitchMemtable is a rare event. To simply the reasoning, we make sure
     // that there is no concurrent thread writing to WAL.
     nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
@@ -1135,11 +1135,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
   // Attempt to switch to a new memtable and trigger flush of old.
   // Do this without holding the dbmutex lock.
   assert(versions_->prev_log_number() == 0);
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     log_write_mutex_.Lock();
   }
   bool creating_new_log = !log_empty_;
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     log_write_mutex_.Unlock();
   }
   uint64_t recycle_log_number = 0;
@@ -1224,7 +1224,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
     assert(creating_new_log);
     assert(!new_mem);
     assert(!new_log);
-    if (concurrent_prepare_) {
+    if (two_write_queues_) {
       nonmem_write_thread_.ExitUnbatched(&nonmem_w);
     }
     return s;
@@ -1264,7 +1264,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
   cfd->SetMemtable(new_mem);
   InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
                                      mutable_cf_options);
-  if (concurrent_prepare_) {
+  if (two_write_queues_) {
     nonmem_write_thread_.ExitUnbatched(&nonmem_w);
   }
   return s;
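A side note on the unchanged line `size_t seq_inc = seq_per_batch_ ? write_group.size : total_count;` that several of the hunks above revolve around: the sketch below (hypothetical types, not RocksDB code) spells out how far the sequence counter advances per write group in each mode.

```cpp
#include <cstddef>
#include <cstdint>

struct WriteGroupSummary {
  size_t num_batches;  // batches merged into this group (write_group.size)
  size_t total_count;  // total keys across all batches in the group
};

// With seq_per_batch every batch consumes exactly one sequence number, even
// when its memtable insert is disabled; otherwise every key consumes one.
uint64_t SequenceAdvance(const WriteGroupSummary& g, bool seq_per_batch) {
  return seq_per_batch ? g.num_batches : g.total_count;
}
```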
db/db_sst_test.cc
@@ -376,6 +376,30 @@ TEST_F(DBSSTTest, RateLimitedDelete) {
   rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+  Options options = CurrentOptions();
+
+  options.sst_file_manager.reset(
+      NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+  auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+  Destroy(last_options_);
+
+  // Add some trash files to the db directory so the DB can clean them up
+  env_->CreateDirIfMissing(dbname_);
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+  ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+  // Reopen the DB and verify that it deletes existing trash files
+  ASSERT_OK(TryReopen(options));
+  sfm->WaitForEmptyTrash();
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+  ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
 // Create a DB with 2 db_paths, and generate multiple files in the 2
 // db_paths using CompactRangeOptions, make sure that files that were
 // deleted from first db_path were deleted using DeleteScheduler and
db/db_test.cc
@@ -3354,11 +3354,23 @@ TEST_F(DBTest, DynamicMemtableOptions) {
       {"write_buffer_size", "131072"},
   }));
 
-  // The existing memtable is still 64KB in size, after it becomes immutable,
-  // the next memtable will be 128KB in size. Write 256KB total, we should
-  // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
-  gen_l0_kb(256);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 2);  // (A)
+  // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+  // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+  gen_l0_kb(192);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);  // (A)
   ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
   ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
 
+  // Decrease buffer size below current usage
+  ASSERT_OK(dbfull()->SetOptions({
+      {"write_buffer_size", "65536"},
+  }));
+  // The existing memtable became eligible for flush when we reduced its
+  // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+  // memtable to be marked full, second schedules the flush. Then we should have
+  // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+  gen_l0_kb(2);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
db/db_test_util.cc
@@ -486,7 +486,7 @@ Options DBTestBase::GetOptions(
     }
     case kConcurrentWALWrites: {
       // This options optimize 2PC commit path
-      options.concurrent_prepare = true;
+      options.two_write_queues = true;
       options.manual_wal_flush = true;
       break;
     }
db/db_wal_test.cc
@@ -730,7 +730,7 @@ class RecoveryTestHelper {
         batch.Put(key, value);
         WriteBatchInternal::SetSequence(&batch, seq);
         current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
-        versions->SetLastToBeWrittenSequence(seq);
+        versions->SetLastAllocatedSequence(seq);
         versions->SetLastSequence(seq);
       }
     }
db/db_write_test.cc
@@ -3,12 +3,18 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
+#include <atomic>
 #include <memory>
 #include <thread>
+#include <vector>
 #include "db/db_test_util.h"
 #include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
 #include "port/stack_trace.h"
+#include "util/fault_injection_test_env.h"
 #include "util/string_util.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
@@ -17,7 +23,9 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
  public:
   DBWriteTest() : DBTestBase("/db_write_test") {}
 
-  void Open() { DBTestBase::Reopen(GetOptions(GetParam())); }
+  Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+  void Open() { DBTestBase::Reopen(GetOptions()); }
 };
 
 // It is invalid to do sync write while disabling WAL.
@@ -77,6 +85,47 @@ TEST_P(DBWriteTest, ReturnSeuqneceNumberMultiThreaded) {
   }
 }
 
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+  constexpr int kNumThreads = 5;
+  std::unique_ptr<FaultInjectionTestEnv> mock_env(
+      new FaultInjectionTestEnv(Env::Default()));
+  Options options = GetOptions();
+  options.env = mock_env.get();
+  Reopen(options);
+  std::atomic<int> ready_count{0};
+  std::atomic<int> leader_count{0};
+  std::vector<port::Thread> threads;
+  mock_env->SetFilesystemActive(false);
+  // Wait until all threads linked to write threads, to make sure
+  // all threads join the same batch group.
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+        ready_count++;
+        auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+        if (w->state == WriteThread::STATE_GROUP_LEADER) {
+          leader_count++;
+          while (ready_count < kNumThreads) {
+            // busy waiting
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  for (int i = 0; i < kNumThreads; i++) {
+    threads.push_back(port::Thread(
+        [&](int index) {
+          // All threads should fail.
+          ASSERT_FALSE(Put("key" + ToString(index), "value").ok());
+        },
+        i));
+  }
+  for (int i = 0; i < kNumThreads; i++) {
+    threads[i].join();
+  }
+  ASSERT_EQ(1, leader_count);
+  // Close before mock_env destruct.
+  Close();
+}
+
 INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
                         testing::Values(DBTestBase::kDefault,
                                         DBTestBase::kConcurrentWALWrites,
db/external_sst_file_ingestion_job.cc
@@ -164,7 +164,7 @@ Status ExternalSstFileIngestionJob::Run() {
     // if the dont overlap with any ranges since we have snapshots
     force_global_seqno = true;
   }
-  // It is safe to use this instead of LastToBeWrittenSequence since we are
+  // It is safe to use this instead of LastAllocatedSequence since we are
   // the only active writer, and hence they are equal
   const SequenceNumber last_seqno = versions_->LastSequence();
   SuperVersion* super_version = cfd_->GetSuperVersion();
@@ -199,7 +199,7 @@ Status ExternalSstFileIngestionJob::Run() {
   }
 
   if (consumed_seqno) {
-    versions_->SetLastToBeWrittenSequence(last_seqno + 1);
+    versions_->SetLastAllocatedSequence(last_seqno + 1);
     versions_->SetLastSequence(last_seqno + 1);
   }
db/memtable.cc
@@ -39,10 +39,10 @@
 
 namespace rocksdb {
 
-MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions,
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+    const ImmutableCFOptions& ioptions,
     const MutableCFOptions& mutable_cf_options)
-    : write_buffer_size(mutable_cf_options.write_buffer_size),
-      arena_block_size(mutable_cf_options.arena_block_size),
+    : arena_block_size(mutable_cf_options.arena_block_size),
       memtable_prefix_bloom_bits(
           static_cast<uint32_t>(
               static_cast<double>(mutable_cf_options.write_buffer_size) *
@@ -83,6 +83,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       data_size_(0),
       num_entries_(0),
       num_deletes_(0),
+      write_buffer_size_(mutable_cf_options.write_buffer_size),
       flush_in_progress_(false),
       flush_completed_(false),
       file_number_(0),
@@ -136,6 +137,7 @@ size_t MemTable::ApproximateMemoryUsage() {
 }
 
 bool MemTable::ShouldFlushNow() const {
+  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
   // In a lot of times, we cannot allocate arena blocks that exactly matches the
   // buffer size. Thus we have to decide if we should over-allocate or
   // under-allocate.
@@ -153,16 +155,14 @@ bool MemTable::ShouldFlushNow() const {
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
   if (allocated_memory + kArenaBlockSize <
-      moptions_.write_buffer_size +
-          kArenaBlockSize * kAllowOverAllocationRatio) {
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
     return false;
   }
 
-  // if user keeps adding entries that exceeds moptions.write_buffer_size,
-  // we need to flush earlier even though we still have much available
-  // memory left.
-  if (allocated_memory > moptions_.write_buffer_size +
-                             kArenaBlockSize * kAllowOverAllocationRatio) {
+  // if user keeps adding entries that exceeds write_buffer_size, we need to
+  // flush earlier even though we still have much available memory left.
+  if (allocated_memory >
+      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }
@@ -265,7 +265,8 @@ class MemTableIterator : public InternalIterator {
         comparator_(mem.comparator_),
         valid_(false),
         arena_mode_(arena != nullptr),
-        value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) {
+        value_pinned_(
+            !mem.GetImmutableMemTableOptions()->inplace_update_support) {
     if (use_range_del_table) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
db/memtable.h
@@ -36,11 +36,9 @@ class MemTableIterator;
 class MergeContext;
 class InternalIterator;
 
-struct MemTableOptions {
-  explicit MemTableOptions(
-      const ImmutableCFOptions& ioptions,
+struct ImmutableMemTableOptions {
+  explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
                            const MutableCFOptions& mutable_cf_options);
-  size_t write_buffer_size;
   size_t arena_block_size;
   uint32_t memtable_prefix_bloom_bits;
   size_t memtable_huge_page_size;
@@ -262,6 +260,18 @@ class MemTable {
     return num_deletes_.load(std::memory_order_relaxed);
   }
 
+  // Dynamically change the memtable's capacity. If set below the current usage,
+  // the next key added will trigger a flush. Can only increase size when
+  // memtable prefix bloom is disabled, since we can't easily allocate more
+  // space.
+  void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+    if (prefix_bloom_ == nullptr ||
+        new_write_buffer_size < write_buffer_size_) {
+      write_buffer_size_.store(new_write_buffer_size,
+                               std::memory_order_relaxed);
+    }
+  }
+
   // Returns the edits area that is needed for flushing the memtable
   VersionEdit* GetEdits() { return &edit_; }
 
@@ -350,7 +360,9 @@ class MemTable {
     return comparator_.comparator;
   }
 
-  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
+  const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+    return &moptions_;
+  }
 
   uint64_t ApproximateOldestKeyTime() const {
     return oldest_key_time_.load(std::memory_order_relaxed);
@@ -364,7 +376,7 @@ class MemTable {
   friend class MemTableList;
 
   KeyComparator comparator_;
-  const MemTableOptions moptions_;
+  const ImmutableMemTableOptions moptions_;
   int refs_;
   const size_t kArenaBlockSize;
   AllocTracker mem_tracker_;
@@ -378,6 +390,9 @@ class MemTable {
   std::atomic<uint64_t> num_entries_;
   std::atomic<uint64_t> num_deletes_;
 
+  // Dynamically changeable memtable option
+  std::atomic<size_t> write_buffer_size_;
+
   // These are used to manage memtable flushes to storage
   bool flush_in_progress_;  // started the flush
   bool flush_completed_;    // finished the flush
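The UpdateWriteBufferSize/ShouldFlushNow pair above is the core of the dynamically adjustable memtable size. Below is a compile-only sketch of the same pattern using a simplified, hypothetical class (not the RocksDB MemTable): the capacity lives in a std::atomic so a SetOptions() call can shrink or grow it while writers concurrently check whether a flush is due.

```cpp
#include <atomic>
#include <cstddef>

class ResizableBuffer {
 public:
  explicit ResizableBuffer(size_t capacity) : capacity_(capacity), used_(0) {}

  void Add(size_t bytes) { used_.fetch_add(bytes, std::memory_order_relaxed); }

  // Analogue of MemTable::UpdateWriteBufferSize(): publishing a new capacity
  // at any time is safe because readers only do relaxed loads.
  void UpdateCapacity(size_t new_capacity) {
    capacity_.store(new_capacity, std::memory_order_relaxed);
  }

  // Analogue of ShouldFlushNow(): read the capacity once per decision so a
  // concurrent resize cannot be observed twice within one comparison.
  bool ShouldFlush() const {
    size_t capacity = capacity_.load(std::memory_order_relaxed);
    return used_.load(std::memory_order_relaxed) >= capacity;
  }

 private:
  std::atomic<size_t> capacity_;
  std::atomic<size_t> used_;
};
```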
db/repair.cc
@@ -546,7 +546,7 @@ class Repairer {
         max_sequence = tables_[i].max_sequence;
       }
     }
-    vset_.SetLastToBeWrittenSequence(max_sequence);
+    vset_.SetLastAllocatedSequence(max_sequence);
     vset_.SetLastSequence(max_sequence);
 
     for (const auto& cf_id_and_tables : cf_id_to_tables) {
db/snapshot_impl.h
@@ -108,6 +108,22 @@ class SnapshotList {
     return ret;
   }
 
+  // Whether there is an active snapshot in range [lower_bound, upper_bound).
+  bool HasSnapshotInRange(SequenceNumber lower_bound,
+                          SequenceNumber upper_bound) {
+    if (empty()) {
+      return false;
+    }
+    const SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      if (s->next_->number_ >= lower_bound) {
+        return s->next_->number_ < upper_bound;
+      }
+      s = s->next_;
+    }
+    return false;
+  }
+
   // get the sequence number of the most recent snapshot
   SequenceNumber GetNewest() {
     if (empty()) {
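To make the half-open semantics of the new helper concrete, here is a small self-contained example (a hypothetical vector-based helper, not the intrusive SnapshotList above) that answers the same question for [lower_bound, upper_bound):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

using SequenceNumber = uint64_t;

bool HasSnapshotInRange(const std::vector<SequenceNumber>& sorted_snapshots,
                        SequenceNumber lower_bound,
                        SequenceNumber upper_bound) {
  for (SequenceNumber s : sorted_snapshots) {
    if (s >= lower_bound) {
      // The first snapshot at or above the lower bound decides the answer,
      // exactly like the early return in the linked-list walk above.
      return s < upper_bound;
    }
  }
  return false;
}

int main() {
  std::vector<SequenceNumber> snaps = {10, 25, 40};  // oldest to newest
  assert(HasSnapshotInRange(snaps, 20, 30));    // 25 falls in [20, 30)
  assert(!HasSnapshotInRange(snaps, 26, 40));   // 40 excluded: upper bound open
  assert(!HasSnapshotInRange(snaps, 41, 100));  // nothing at or above 41
  return 0;
}
```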
db/transaction_log_impl.cc
@@ -19,7 +19,8 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
     const std::string& dir, const ImmutableDBOptions* options,
     const TransactionLogIterator::ReadOptions& read_options,
     const EnvOptions& soptions, const SequenceNumber seq,
-    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions)
+    std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+    const bool seq_per_batch)
     : dir_(dir),
       options_(options),
       read_options_(read_options),
@@ -31,7 +32,8 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
       currentFileIndex_(0),
       currentBatchSeq_(0),
       currentLastSeq_(0),
-      versions_(versions) {
+      versions_(versions),
+      seq_per_batch_(seq_per_batch) {
   assert(files_ != nullptr);
   assert(versions_ != nullptr);
@@ -241,12 +243,12 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
     }
     startingSequenceNumber_ = expectedSeq;
     // currentStatus_ will be set to Ok if reseek succeeds
-    // Note: this is still ok in seq_pre_batch_ && concurrent_preparep_ mode
+    // Note: this is still ok in seq_pre_batch_ && two_write_queuesp_ mode
     // that allows gaps in the WAL since it will still skip over the gap.
     currentStatus_ = Status::NotFound("Gap in sequence numbers");
-    // In seq_per_batch mode, gaps in the seq are possible so the strict mode
+    // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode
     // should be disabled
-    return SeekToStartSequence(currentFileIndex_, !options_->seq_per_batch);
+    return SeekToStartSequence(currentFileIndex_, !seq_per_batch_);
   }
 
   struct BatchCounter : public WriteBatch::Handler {
@@ -284,7 +286,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
   };
 
   currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
-  if (options_->seq_per_batch) {
+  if (seq_per_batch_) {
     BatchCounter counter(currentBatchSeq_);
     batch->Iterate(&counter);
     currentLastSeq_ = counter.sequence_;
@ -62,7 +62,8 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
|
||||
const std::string& dir, const ImmutableDBOptions* options,
|
||||
const TransactionLogIterator::ReadOptions& read_options,
|
||||
const EnvOptions& soptions, const SequenceNumber seqNum,
|
||||
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);
|
||||
std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
|
||||
const bool seq_per_batch);
|
||||
|
||||
virtual bool Valid() override;
|
||||
|
||||
@ -103,7 +104,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
|
||||
// Used only to get latest seq. num
|
||||
// TODO(icanadi) can this be just a callback?
|
||||
VersionSet const* const versions_;
|
||||
|
||||
const bool seq_per_batch_;
|
||||
// Reads from transaction log only if the writebatch record has been written
|
||||
bool RestrictedRead(Slice* record, std::string* scratch);
|
||||
// Seeks to startingSequenceNumber reading from startFileIndex in files_.
|
||||
|
@ -1851,27 +1851,33 @@ void VersionStorageInfo::GetOverlappingInputs(
|
||||
void VersionStorageInfo::GetCleanInputsWithinInterval(
|
||||
int level, const InternalKey* begin, const InternalKey* end,
|
||||
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
|
||||
if (level >= num_non_empty_levels_) {
|
||||
// this level is empty, no inputs within range
|
||||
return;
|
||||
}
|
||||
|
||||
inputs->clear();
|
||||
Slice user_begin, user_end;
|
||||
if (begin != nullptr) {
|
||||
user_begin = begin->user_key();
|
||||
}
|
||||
if (end != nullptr) {
|
||||
user_end = end->user_key();
|
||||
}
|
||||
if (file_index) {
|
||||
*file_index = -1;
|
||||
}
|
||||
if (begin != nullptr && end != nullptr && level > 0) {
|
||||
if (level >= num_non_empty_levels_ || level == 0 ||
|
||||
level_files_brief_[level].num_files == 0) {
|
||||
// this level is empty, no inputs within range
|
||||
// also don't support clean input interval within L0
|
||||
return;
|
||||
}
|
||||
|
||||
Slice user_begin, user_end;
|
||||
const auto& level_files = level_files_brief_[level];
|
||||
if (begin == nullptr) {
|
||||
user_begin = ExtractUserKey(level_files.files[0].smallest_key);
|
||||
} else {
|
||||
user_begin = begin->user_key();
|
||||
}
|
||||
if (end == nullptr) {
|
||||
user_end = ExtractUserKey(
|
||||
level_files.files[level_files.num_files - 1].largest_key);
|
||||
} else {
|
||||
user_end = end->user_key();
|
||||
}
|
||||
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
|
||||
hint_index, file_index,
|
||||
true /* within_interval */);
|
||||
}
|
||||
}
|
||||
|
||||
// Store in "*inputs" all files in "level" that overlap [begin,end]
|
||||
@ -1934,8 +1940,8 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
|
||||
} else {
|
||||
ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
|
||||
&start_index, &end_index);
|
||||
}
|
||||
assert(end_index >= start_index);
|
||||
}
|
||||
// insert overlapping files into vector
|
||||
for (int i = start_index; i <= end_index; i++) {
|
||||
inputs->push_back(files_[level][i]);
|
||||
@ -2414,7 +2420,7 @@ VersionSet::VersionSet(const std::string& dbname,
|
||||
manifest_file_number_(0), // Filled by Recover()
|
||||
pending_manifest_file_number_(0),
|
||||
last_sequence_(0),
|
||||
last_to_be_written_sequence_(0),
|
||||
last_allocated_sequence_(0),
|
||||
prev_log_number_(0),
|
||||
current_version_number_(0),
|
||||
manifest_file_size_(0),
|
||||
@ -2754,9 +2760,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
|
||||
// updated the last_sequence_ yet. It is also possible that the log has is
|
||||
// expecting some new data that is not written yet. Since LastSequence is an
|
||||
// upper bound on the sequence, it is ok to record
|
||||
// last_to_be_written_sequence_ as the last sequence.
|
||||
edit->SetLastSequence(db_options_->concurrent_prepare
|
||||
? last_to_be_written_sequence_
|
||||
// last_allocated_sequence_ as the last sequence.
|
||||
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
|
||||
: last_sequence_);
|
||||
if (edit->is_column_family_drop_) {
|
||||
// if we drop column family, we have to make sure to save max column family,
|
||||
@ -2784,9 +2789,8 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
|
||||
// updated the last_sequence_ yet. It is also possible that the log has is
|
||||
// expecting some new data that is not written yet. Since LastSequence is an
|
||||
// upper bound on the sequence, it is ok to record
|
||||
// last_to_be_written_sequence_ as the last sequence.
|
||||
edit->SetLastSequence(db_options_->concurrent_prepare
|
||||
? last_to_be_written_sequence_
|
||||
// last_allocated_sequence_ as the last sequence.
|
||||
edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
|
||||
: last_sequence_);
|
||||
|
||||
builder->Apply(edit);
|
||||
@ -3077,7 +3081,7 @@ Status VersionSet::Recover(
|
||||
|
||||
manifest_file_size_ = current_manifest_file_size;
|
||||
next_file_number_.store(next_file + 1);
|
||||
last_to_be_written_sequence_ = last_sequence;
|
||||
last_allocated_sequence_ = last_sequence;
|
||||
last_sequence_ = last_sequence;
|
||||
prev_log_number_ = previous_log_number;
|
||||
|
||||
@ -3448,7 +3452,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
|
||||
}
|
||||
|
||||
next_file_number_.store(next_file + 1);
|
||||
last_to_be_written_sequence_ = last_sequence;
|
||||
last_allocated_sequence_ = last_sequence;
|
||||
last_sequence_ = last_sequence;
|
||||
prev_log_number_ = previous_log_number;
|
||||
|
||||
|
@ -765,28 +765,27 @@ class VersionSet {
|
||||
}
|
||||
|
||||
// Note: memory_order_acquire must be sufficient.
|
||||
uint64_t LastToBeWrittenSequence() const {
|
||||
return last_to_be_written_sequence_.load(std::memory_order_seq_cst);
|
||||
uint64_t LastAllocatedSequence() const {
|
||||
return last_allocated_sequence_.load(std::memory_order_seq_cst);
|
||||
}
|
||||
|
||||
// Set the last sequence number to s.
|
||||
void SetLastSequence(uint64_t s) {
|
||||
assert(s >= last_sequence_);
|
||||
// Last visible sequence must always be less than last written seq
|
||||
assert(!db_options_->concurrent_prepare ||
|
||||
s <= last_to_be_written_sequence_);
|
||||
assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
|
||||
last_sequence_.store(s, std::memory_order_release);
|
||||
}
|
||||
|
||||
// Note: memory_order_release must be sufficient
|
||||
void SetLastToBeWrittenSequence(uint64_t s) {
|
||||
assert(s >= last_to_be_written_sequence_);
|
||||
last_to_be_written_sequence_.store(s, std::memory_order_seq_cst);
|
||||
void SetLastAllocatedSequence(uint64_t s) {
|
||||
assert(s >= last_allocated_sequence_);
|
||||
last_allocated_sequence_.store(s, std::memory_order_seq_cst);
|
||||
}
|
||||
|
||||
// Note: memory_order_release must be sufficient
|
||||
uint64_t FetchAddLastToBeWrittenSequence(uint64_t s) {
|
||||
return last_to_be_written_sequence_.fetch_add(s, std::memory_order_seq_cst);
|
||||
uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
|
||||
return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
|
||||
}
|
||||
|
||||
// Mark the specified file number as used.
|
||||
@ -894,8 +893,9 @@ class VersionSet {
|
||||
uint64_t pending_manifest_file_number_;
|
||||
// The last seq visible to reads
|
||||
std::atomic<uint64_t> last_sequence_;
|
||||
// The last seq with which a writer has written/will write.
|
||||
std::atomic<uint64_t> last_to_be_written_sequence_;
|
||||
// The last seq that is already allocated. The seq might or might not have
// appeared in memtable.
|
||||
std::atomic<uint64_t> last_allocated_sequence_;
|
||||
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
|
||||
|
||||
// Opened lazily
|
||||
|
@ -115,7 +115,7 @@ Status WalManager::GetUpdatesSince(
}
iter->reset(new TransactionLogIteratorImpl(
db_options_.wal_dir, &db_options_, read_options, env_options_, seq,
std::move(wal_files), version_set));
std::move(wal_files), version_set, seq_per_batch_));
return (*iter)->status();
}

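The seq_per_batch flag threaded through here ultimately serves DB::GetUpdatesSince. For orientation, a rough sketch of driving that public API follows; it assumes an already open rocksdb::DB* and an arbitrary starting sequence number, and error handling is trimmed:

// Sketch: replay write batches newer than a known sequence number.
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

void TailWal(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(100 /* hypothetical seq */, &iter);
  if (!s.ok()) {
    return;
  }
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    // batch.sequence is the starting sequence of batch.writeBatchPtr.
  }
}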
@ -31,11 +31,12 @@ namespace rocksdb {
|
||||
class WalManager {
|
||||
public:
|
||||
WalManager(const ImmutableDBOptions& db_options,
|
||||
const EnvOptions& env_options)
|
||||
const EnvOptions& env_options, const bool seq_per_batch = false)
|
||||
: db_options_(db_options),
|
||||
env_options_(env_options),
|
||||
env_(db_options.env),
|
||||
purge_wal_files_last_run_(0) {}
|
||||
purge_wal_files_last_run_(0),
|
||||
seq_per_batch_(seq_per_batch) {}
|
||||
|
||||
Status GetSortedWalFiles(VectorLogPtr& files);
|
||||
|
||||
@ -86,6 +87,8 @@ class WalManager {
|
||||
// last time when PurgeObsoleteWALFiles ran.
|
||||
uint64_t purge_wal_files_last_run_;
|
||||
|
||||
bool seq_per_batch_;
|
||||
|
||||
// obsolete files will be deleted every this seconds if ttl deletion is
|
||||
// enabled and archive size_limit is disabled.
|
||||
static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
|
||||
|
@ -67,7 +67,7 @@ class WalManagerTest : public testing::Test {
|
||||
batch.Put(key, value);
|
||||
WriteBatchInternal::SetSequence(&batch, seq);
|
||||
current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch));
|
||||
versions_->SetLastToBeWrittenSequence(seq);
|
||||
versions_->SetLastAllocatedSequence(seq);
|
||||
versions_->SetLastSequence(seq);
|
||||
}
|
||||
|
||||
|
@ -1035,7 +1035,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
}
|
||||
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
auto* moptions = mem->GetMemTableOptions();
|
||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||
if (!moptions->inplace_update_support) {
|
||||
mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
|
||||
get_post_process_info(mem));
|
||||
@ -1196,7 +1196,7 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
}
|
||||
|
||||
MemTable* mem = cf_mems_->GetMemTable();
|
||||
auto* moptions = mem->GetMemTableOptions();
|
||||
auto* moptions = mem->GetImmutableMemTableOptions();
|
||||
bool perform_merge = false;
|
||||
|
||||
// If we pass DB through and options.max_successive_merges is hit
|
||||
|
@ -136,9 +136,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) {
|
||||
options.create_if_missing = true;
|
||||
options.allow_concurrent_memtable_write = allow_parallel;
|
||||
options.enable_pipelined_write = enable_pipelined_write;
|
||||
options.concurrent_prepare = two_queues;
|
||||
if (options.enable_pipelined_write &&
|
||||
options.concurrent_prepare) {
|
||||
options.two_write_queues = two_queues;
|
||||
if (options.enable_pipelined_write && options.two_write_queues) {
|
||||
// This combination is not supported
|
||||
continue;
|
||||
}
|
||||
|
@ -533,6 +533,11 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
|
||||
Writer* last_writer = write_group.last_writer;
|
||||
assert(leader->link_older == nullptr);
|
||||
|
||||
// Propagate memtable write error to the whole group.
|
||||
if (status.ok() && !write_group.status.ok()) {
|
||||
status = write_group.status;
|
||||
}
|
||||
|
||||
if (enable_pipelined_write_) {
|
||||
// Notify writers don't write to memtable to exit.
|
||||
for (Writer* w = last_writer; w != leader;) {
|
||||
|
@ -36,6 +36,7 @@ class CompactionFilter {
|
||||
enum ValueType {
|
||||
kValue,
|
||||
kMergeOperand,
|
||||
kBlobIndex, // used internally by BlobDB.
|
||||
};
|
||||
|
||||
enum class Decision {
|
||||
@ -171,6 +172,8 @@ class CompactionFilter {
|
||||
bool rv = FilterMergeOperand(level, key, existing_value);
|
||||
return rv ? Decision::kRemove : Decision::kKeep;
|
||||
}
|
||||
case ValueType::kBlobIndex:
|
||||
return Decision::kKeep;
|
||||
}
|
||||
assert(false);
|
||||
return Decision::kKeep;
|
||||
|
@ -325,7 +325,8 @@ void CancelAllBackgroundWork(DB* db, bool wait = false);

// Delete files which are entirely in the given range
// Could leave some keys in the range which are in files which are not
// entirely in the range.
// entirely in the range. Also leaves L0 files regardless of whether they're
// in the range.
// Snapshots before the delete might not see the data in the given range.
Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
const Slice* begin, const Slice* end);

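The comment change above narrows what callers may assume about DeleteFilesInRange. A minimal usage sketch of the API as declared here (the database handle and key range are hypothetical):

// Sketch: drop whole SST files fully contained in ["a", "m").
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

rocksdb::Status DropRange(rocksdb::DB* db) {
  rocksdb::Slice begin("a");
  rocksdb::Slice end("m");
  // Keys in files straddling the boundaries, and keys in L0 files,
  // may survive this call, as the updated comment states.
  return rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(),
                                     &begin, &end);
}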
@ -905,22 +905,12 @@ struct DBOptions {
// allows the memtable writes not to lag behind other writes. It can be used
// to optimize MySQL 2PC in which only the commits, which are serial, write to
// memtable.
bool concurrent_prepare = false;
bool two_write_queues = false;

// If true WAL is not flushed automatically after each write. Instead it
// relies on manual invocation of FlushWAL to write the WAL buffer to its
// file.
bool manual_wal_flush = false;

// Increase the sequence number after writing each batch, whether memtable is
// disabled for that or not. Otherwise the sequence number is increased after
// writing each key into memtable. This implies that when memtable_disable is
// set, the seq is not increased at all.
//
// Default: false
// Note: This option is experimental and meant to be used only for internal
// projects.
bool seq_per_batch = false;
};

// Options to control the behavior of a database (passed to DB::Open)

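Callers that previously set concurrent_prepare switch to the new name; a minimal sketch, assuming an otherwise default configuration (the options_helper change later in this diff keeps "concurrent_prepare" parseable but deprecated):

// Sketch: opt into the renamed two-write-queues mode.
#include "rocksdb/options.h"

rocksdb::Options MakeOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.two_write_queues = true;   // was: options.concurrent_prepare = true;
  // seq_per_batch is no longer a public DBOptions knob in this branch.
  return options;
}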
@ -131,6 +131,11 @@ struct PerfContext {
// total number of SST table bloom misses
uint64_t bloom_sst_miss_count;

// Time spent waiting on key locks in transaction lock manager.
uint64_t key_lock_wait_time;
// number of times acquiring a lock was blocked by another transaction.
uint64_t key_lock_wait_count;

// Total time spent in Env filesystem operations. These are only populated
// when TimedEnv is used.
uint64_t env_new_sequential_file_nanos;

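The new counters are read like the other PerfContext fields; a rough sketch, assuming the perf level was raised on this thread and transactional work ran in between (the accessor names should be checked against the installed headers):

// Sketch: surface the new transaction lock-wait counters.
#include <cstdio>
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void ReportLockWaits() {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  // ... run pessimistic-transaction work on this thread ...
  rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  std::printf("key_lock_wait_time=%llu key_lock_wait_count=%llu\n",
              static_cast<unsigned long long>(ctx->key_lock_wait_time),
              static_cast<unsigned long long>(ctx->key_lock_wait_count));
}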
@ -133,27 +133,6 @@ class PinnableSlice : public Slice, public Cleanable {
|
||||
PinnableSlice(PinnableSlice&) = delete;
|
||||
PinnableSlice& operator=(PinnableSlice&) = delete;
|
||||
|
||||
PinnableSlice(PinnableSlice&& other) { *this = std::move(other); }
|
||||
|
||||
PinnableSlice& operator=(PinnableSlice&& other) {
|
||||
if (this != &other) {
|
||||
// cleanup itself.
|
||||
Reset();
|
||||
|
||||
Slice::operator=(other);
|
||||
Cleanable::operator=(std::move(other));
|
||||
pinned_ = other.pinned_;
|
||||
if (!pinned_ && other.buf_ == &other.self_space_) {
|
||||
self_space_ = std::move(other.self_space_);
|
||||
buf_ = &self_space_;
|
||||
data_ = buf_->data();
|
||||
} else {
|
||||
buf_ = other.buf_;
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1,
|
||||
void* arg2) {
|
||||
assert(!pinned_);
|
||||
|
@ -223,6 +223,76 @@ enum Tickers : uint32_t {
|
||||
// Number of refill intervals where rate limiter's bytes are fully consumed.
|
||||
NUMBER_RATE_LIMITER_DRAINS,
|
||||
|
||||
// Number of internal keys skipped by Iterator
|
||||
NUMBER_ITER_SKIP,
|
||||
|
||||
// BlobDB specific stats
|
||||
// # of Put/PutTTL/PutUntil to BlobDB.
|
||||
BLOB_DB_NUM_PUT,
|
||||
// # of Write to BlobDB.
|
||||
BLOB_DB_NUM_WRITE,
|
||||
// # of Get to BlobDB.
|
||||
BLOB_DB_NUM_GET,
|
||||
// # of MultiGet to BlobDB.
|
||||
BLOB_DB_NUM_MULTIGET,
|
||||
// # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
|
||||
BLOB_DB_NUM_SEEK,
|
||||
// # of Next to BlobDB iterator.
|
||||
BLOB_DB_NUM_NEXT,
|
||||
// # of Prev to BlobDB iterator.
|
||||
BLOB_DB_NUM_PREV,
|
||||
// # of keys written to BlobDB.
|
||||
BLOB_DB_NUM_KEYS_WRITTEN,
|
||||
// # of keys read from BlobDB.
|
||||
BLOB_DB_NUM_KEYS_READ,
|
||||
// # of bytes (key + value) written to BlobDB.
|
||||
BLOB_DB_BYTES_WRITTEN,
|
||||
// # of bytes (keys + value) read from BlobDB.
|
||||
BLOB_DB_BYTES_READ,
|
||||
// # of keys written by BlobDB as non-TTL inlined value.
|
||||
BLOB_DB_WRITE_INLINED,
|
||||
// # of keys written by BlobDB as TTL inlined value.
|
||||
BLOB_DB_WRITE_INLINED_TTL,
|
||||
// # of keys written by BlobDB as non-TTL blob value.
|
||||
BLOB_DB_WRITE_BLOB,
|
||||
// # of keys written by BlobDB as TTL blob value.
|
||||
BLOB_DB_WRITE_BLOB_TTL,
|
||||
// # of bytes written to blob file.
|
||||
BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
|
||||
// # of bytes read from blob file.
|
||||
BLOB_DB_BLOB_FILE_BYTES_READ,
|
||||
// # of times a blob files being synced.
|
||||
BLOB_DB_BLOB_FILE_SYNCED,
|
||||
// # of blob index evicted from base DB by BlobDB compaction filter because
|
||||
// of expiration.
|
||||
BLOB_DB_BLOB_INDEX_EXPIRED,
|
||||
// # of blob files being garbage collected.
|
||||
BLOB_DB_GC_NUM_FILES,
|
||||
// # of blob files generated by garbage collection.
|
||||
BLOB_DB_GC_NUM_NEW_FILES,
|
||||
// # of BlobDB garbage collection failures.
|
||||
BLOB_DB_GC_FAILURES,
|
||||
// # of keys drop by BlobDB garbage collection because they had been
|
||||
// overwritten.
|
||||
BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
|
||||
// # of keys drop by BlobDB garbage collection because of expiration.
|
||||
BLOB_DB_GC_NUM_KEYS_EXPIRED,
|
||||
// # of keys relocated to new blob file by garbage collection.
|
||||
BLOB_DB_GC_NUM_KEYS_RELOCATED,
|
||||
// # of bytes drop by BlobDB garbage collection because they had been
|
||||
// overwritten.
|
||||
BLOB_DB_GC_BYTES_OVERWRITTEN,
|
||||
// # of bytes drop by BlobDB garbage collection because of expiration.
|
||||
BLOB_DB_GC_BYTES_EXPIRED,
|
||||
// # of bytes relocated to new blob file by garbage collection.
|
||||
BLOB_DB_GC_BYTES_RELOCATED,
|
||||
// # of blob files evicted because of BlobDB is full.
|
||||
BLOB_DB_FIFO_NUM_FILES_EVICTED,
|
||||
// # of keys in the blob files evicted because of BlobDB is full.
|
||||
BLOB_DB_FIFO_NUM_KEYS_EVICTED,
|
||||
// # of bytes in the blob files evicted because of BlobDB is full.
|
||||
BLOB_DB_FIFO_BYTES_EVICTED,
|
||||
|
||||
TICKER_ENUM_MAX
|
||||
};
|
||||
|
||||
@ -328,6 +398,38 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
||||
{READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
|
||||
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
|
||||
{NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"},
|
||||
{NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"},
|
||||
{BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"},
|
||||
{BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"},
|
||||
{BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"},
|
||||
{BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"},
|
||||
{BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"},
|
||||
{BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"},
|
||||
{BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"},
|
||||
{BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"},
|
||||
{BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
|
||||
{BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
|
||||
{BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
|
||||
{BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
|
||||
{BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
|
||||
{BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
|
||||
{BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
|
||||
{BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
|
||||
{BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file,bytes.read"},
|
||||
{BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"},
|
||||
{BLOB_DB_BLOB_INDEX_EXPIRED, "rocksdb.blobdb.blob.index.expired"},
|
||||
{BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"},
|
||||
{BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"},
|
||||
{BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"},
|
||||
{BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"},
|
||||
{BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"},
|
||||
{BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"},
|
||||
{BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"},
|
||||
{BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"},
|
||||
{BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"},
|
||||
{BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
|
||||
{BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
|
||||
{BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
|
||||
};
|
||||
|
||||
/**
|
||||
@ -379,6 +481,36 @@ enum Histograms : uint32_t {
|
||||
// requests.
|
||||
READ_NUM_MERGE_OPERANDS,
|
||||
|
||||
// BlobDB specific stats
|
||||
// Size of keys written to BlobDB.
|
||||
BLOB_DB_KEY_SIZE,
|
||||
// Size of values written to BlobDB.
|
||||
BLOB_DB_VALUE_SIZE,
|
||||
// BlobDB Put/PutWithTTL/PutUntil/Write latency.
|
||||
BLOB_DB_WRITE_MICROS,
|
||||
// BlobDB Get latency.
|
||||
BLOB_DB_GET_MICROS,
|
||||
// BlobDB MultiGet latency.
|
||||
BLOB_DB_MULTIGET_MICROS,
|
||||
// BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency.
|
||||
BLOB_DB_SEEK_MICROS,
|
||||
// BlobDB Next latency.
|
||||
BLOB_DB_NEXT_MICROS,
|
||||
// BlobDB Prev latency.
|
||||
BLOB_DB_PREV_MICROS,
|
||||
// Blob file write latency.
|
||||
BLOB_DB_BLOB_FILE_WRITE_MICROS,
|
||||
// Blob file read latency.
|
||||
BLOB_DB_BLOB_FILE_READ_MICROS,
|
||||
// Blob file sync latency.
|
||||
BLOB_DB_BLOB_FILE_SYNC_MICROS,
|
||||
// BlobDB garbage collection time.
|
||||
BLOB_DB_GC_MICROS,
|
||||
// BlobDB compression time.
|
||||
BLOB_DB_COMPRESSION_MICROS,
|
||||
// BlobDB decompression time.
|
||||
BLOB_DB_DECOMPRESSION_MICROS,
|
||||
|
||||
HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match
|
||||
};
|
||||
|
||||
@ -414,6 +546,20 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
||||
{COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"},
|
||||
{DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"},
|
||||
{READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"},
|
||||
{BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"},
|
||||
{BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"},
|
||||
{BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"},
|
||||
{BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"},
|
||||
{BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"},
|
||||
{BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"},
|
||||
{BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"},
|
||||
{BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"},
|
||||
{BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"},
|
||||
{BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"},
|
||||
{BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"},
|
||||
{BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"},
|
||||
{BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"},
|
||||
{BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"},
|
||||
};
|
||||
|
||||
struct HistogramData {
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
#define ROCKSDB_MAJOR 5
|
||||
#define ROCKSDB_MINOR 9
|
||||
#define ROCKSDB_PATCH 0
|
||||
#define ROCKSDB_PATCH 1
|
||||
|
||||
// Do not use these. We made the mistake of declaring macros starting with
|
||||
// double underscore. Now we have to live with our choice. We'll deprecate these
|
||||
|
@ -79,6 +79,8 @@ void PerfContext::Reset() {
|
||||
bloom_memtable_miss_count = 0;
|
||||
bloom_sst_hit_count = 0;
|
||||
bloom_sst_miss_count = 0;
|
||||
key_lock_wait_time = 0;
|
||||
key_lock_wait_count = 0;
|
||||
|
||||
env_new_sequential_file_nanos = 0;
|
||||
env_new_random_access_file_nanos = 0;
|
||||
@ -158,6 +160,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const {
|
||||
PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count);
|
||||
PERF_CONTEXT_OUTPUT(bloom_sst_hit_count);
|
||||
PERF_CONTEXT_OUTPUT(bloom_sst_miss_count);
|
||||
PERF_CONTEXT_OUTPUT(key_lock_wait_time);
|
||||
PERF_CONTEXT_OUTPUT(key_lock_wait_count);
|
||||
PERF_CONTEXT_OUTPUT(env_new_sequential_file_nanos);
|
||||
PERF_CONTEXT_OUTPUT(env_new_random_access_file_nanos);
|
||||
PERF_CONTEXT_OUTPUT(env_new_writable_file_nanos);
|
||||
|
@ -85,9 +85,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
||||
avoid_flush_during_recovery(options.avoid_flush_during_recovery),
|
||||
allow_ingest_behind(options.allow_ingest_behind),
|
||||
preserve_deletes(options.preserve_deletes),
|
||||
concurrent_prepare(options.concurrent_prepare),
|
||||
manual_wal_flush(options.manual_wal_flush),
|
||||
seq_per_batch(options.seq_per_batch) {
|
||||
two_write_queues(options.two_write_queues),
|
||||
manual_wal_flush(options.manual_wal_flush) {
|
||||
}
|
||||
|
||||
void ImmutableDBOptions::Dump(Logger* log) const {
|
||||
@ -217,11 +216,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
|
||||
allow_ingest_behind);
|
||||
ROCKS_LOG_HEADER(log, " Options.preserve_deletes: %d",
|
||||
preserve_deletes);
|
||||
ROCKS_LOG_HEADER(log, " Options.concurrent_prepare: %d",
|
||||
concurrent_prepare);
|
||||
ROCKS_LOG_HEADER(log, " Options.two_write_queues: %d",
|
||||
two_write_queues);
|
||||
ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d",
|
||||
manual_wal_flush);
|
||||
ROCKS_LOG_HEADER(log, " Options.seq_per_batch: %d", seq_per_batch);
|
||||
}
|
||||
|
||||
MutableDBOptions::MutableDBOptions()
|
||||
|
@ -77,9 +77,8 @@ struct ImmutableDBOptions {
|
||||
bool avoid_flush_during_recovery;
|
||||
bool allow_ingest_behind;
|
||||
bool preserve_deletes;
|
||||
bool concurrent_prepare;
|
||||
bool two_write_queues;
|
||||
bool manual_wal_flush;
|
||||
bool seq_per_batch;
|
||||
};
|
||||
|
||||
struct MutableDBOptions {
|
||||
|
@ -360,18 +360,18 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
|
||||
{offsetof(struct DBOptions, preserve_deletes), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, false,
|
||||
offsetof(struct ImmutableDBOptions, preserve_deletes)}},
|
||||
{"concurrent_prepare",
|
||||
{offsetof(struct DBOptions, concurrent_prepare), OptionType::kBoolean,
|
||||
{"concurrent_prepare", // Deprecated by two_write_queues
|
||||
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}},
|
||||
{"two_write_queues",
|
||||
{offsetof(struct DBOptions, two_write_queues), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, false,
|
||||
offsetof(struct ImmutableDBOptions, concurrent_prepare)}},
|
||||
offsetof(struct ImmutableDBOptions, two_write_queues)}},
|
||||
{"manual_wal_flush",
|
||||
{offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, false,
|
||||
offsetof(struct ImmutableDBOptions, manual_wal_flush)}},
|
||||
{"seq_per_batch",
|
||||
{offsetof(struct DBOptions, seq_per_batch), OptionType::kBoolean,
|
||||
OptionVerificationType::kNormal, false,
|
||||
offsetof(struct ImmutableDBOptions, seq_per_batch)}}};
|
||||
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}};
|
||||
|
||||
// offset_of is used to get the offset of a class data member
|
||||
// ex: offset_of(&ColumnFamilyOptions::num_levels)
|
||||
|
@ -284,6 +284,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
||||
"allow_ingest_behind=false;"
|
||||
"preserve_deletes=false;"
|
||||
"concurrent_prepare=false;"
|
||||
"two_write_queues=false;"
|
||||
"manual_wal_flush=false;"
|
||||
"seq_per_batch=false;",
|
||||
new_options));
|
||||
|
src.mk
@ -15,13 +15,13 @@ LIB_SOURCES = \
|
||||
db/convenience.cc \
|
||||
db/db_filesnapshot.cc \
|
||||
db/db_impl.cc \
|
||||
db/db_impl_write.cc \
|
||||
db/db_impl_compaction_flush.cc \
|
||||
db/db_impl_files.cc \
|
||||
db/db_impl_open.cc \
|
||||
db/db_impl_debug.cc \
|
||||
db/db_impl_experimental.cc \
|
||||
db/db_impl_files.cc \
|
||||
db/db_impl_open.cc \
|
||||
db/db_impl_readonly.cc \
|
||||
db/db_impl_write.cc \
|
||||
db/db_info_dumper.cc \
|
||||
db/db_iter.cc \
|
||||
db/dbformat.cc \
|
||||
@ -155,9 +155,9 @@ LIB_SOURCES = \
|
||||
utilities/blob_db/blob_db.cc \
|
||||
utilities/blob_db/blob_db_impl.cc \
|
||||
utilities/blob_db/blob_file.cc \
|
||||
utilities/blob_db/blob_log_format.cc \
|
||||
utilities/blob_db/blob_log_reader.cc \
|
||||
utilities/blob_db/blob_log_writer.cc \
|
||||
utilities/blob_db/blob_log_format.cc \
|
||||
utilities/blob_db/ttl_extractor.cc \
|
||||
utilities/cassandra/cassandra_compaction_filter.cc \
|
||||
utilities/cassandra/format.cc \
|
||||
@ -192,8 +192,8 @@ LIB_SOURCES = \
|
||||
utilities/simulator_cache/sim_cache.cc \
|
||||
utilities/spatialdb/spatial_db.cc \
|
||||
utilities/table_properties_collectors/compact_on_deletion_collector.cc \
|
||||
utilities/transactions/optimistic_transaction_db_impl.cc \
|
||||
utilities/transactions/optimistic_transaction.cc \
|
||||
utilities/transactions/optimistic_transaction_db_impl.cc \
|
||||
utilities/transactions/pessimistic_transaction.cc \
|
||||
utilities/transactions/pessimistic_transaction_db.cc \
|
||||
utilities/transactions/snapshot_checker.cc \
|
||||
@ -231,14 +231,14 @@ BENCH_LIB_SOURCES = \
|
||||
tools/db_bench_tool.cc \
|
||||
|
||||
EXP_LIB_SOURCES = \
|
||||
utilities/col_buf_encoder.cc \
|
||||
utilities/col_buf_decoder.cc \
|
||||
utilities/col_buf_encoder.cc \
|
||||
utilities/column_aware_encoding_util.cc
|
||||
|
||||
TEST_LIB_SOURCES = \
|
||||
db/db_test_util.cc \
|
||||
util/testharness.cc \
|
||||
util/testutil.cc \
|
||||
db/db_test_util.cc \
|
||||
utilities/cassandra/test_utils.cc \
|
||||
|
||||
MAIN_SOURCES = \
|
||||
@ -338,7 +338,6 @@ MAIN_SOURCES = \
|
||||
util/filelock_test.cc \
|
||||
util/log_write_bench.cc \
|
||||
util/rate_limiter_test.cc \
|
||||
util/slice_test.cc \
|
||||
util/slice_transform_test.cc \
|
||||
util/timer_queue_test.cc \
|
||||
util/thread_list_test.cc \
|
||||
|
@ -78,8 +78,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
// as partition size.
assert(table_opt.block_size_deviation <= 100);
auto partition_size = static_cast<uint32_t>(
table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation));
((table_opt.metadata_block_size *
(100 - table_opt.block_size_deviation)) + 99) / 100);
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
return new PartitionedFilterBlockBuilder(
opt.prefix_extractor, table_opt.whole_key_filtering,
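The fix above adds the missing division by 100 and rounds up. With illustrative numbers, metadata_block_size = 4096 and block_size_deviation = 10, the old expression produced 4096 * 90 = 368640, while the intended target is ceil(4096 * 0.90) = 3687. A standalone check of the corrected formula:

// Standalone sketch of the corrected filter-partition target size math.
#include <algorithm>
#include <cassert>
#include <cstdint>

uint32_t FilterPartitionTargetSize(uint64_t metadata_block_size,
                                   uint64_t block_size_deviation) {
  assert(block_size_deviation <= 100);
  // Ceiling of metadata_block_size * (100 - deviation) percent.
  auto partition_size = static_cast<uint32_t>(
      ((metadata_block_size * (100 - block_size_deviation)) + 99) / 100);
  return std::max(partition_size, static_cast<uint32_t>(1));
}

int main() {
  assert(FilterPartitionTargetSize(4096, 10) == 3687);  // ceil(4096 * 0.9)
  assert(FilterPartitionTargetSize(0, 0) == 1);         // clamped to at least 1
  return 0;
}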
@ -296,7 +296,7 @@ struct BlockBasedTableBuilder::Rep {
|
||||
file(f),
|
||||
data_block(table_options.block_restart_interval,
|
||||
table_options.use_delta_encoding),
|
||||
range_del_block(port::kMaxInt32),
|
||||
range_del_block(1), // TODO(andrewkr): restart_interval unnecessary
|
||||
internal_prefix_transform(_ioptions.prefix_extractor),
|
||||
compression_type(_compression_type),
|
||||
compression_opts(_compression_opts),
|
||||
|
@ -21,8 +21,7 @@
|
||||
namespace rocksdb {
|
||||
|
||||
MetaIndexBuilder::MetaIndexBuilder()
|
||||
: meta_index_block_(
|
||||
new BlockBuilder(port::kMaxInt32 /* restart interval */)) {}
|
||||
: meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
|
||||
|
||||
void MetaIndexBuilder::Add(const std::string& key,
|
||||
const BlockHandle& handle) {
|
||||
@ -39,8 +38,7 @@ Slice MetaIndexBuilder::Finish() {
|
||||
}
|
||||
|
||||
PropertyBlockBuilder::PropertyBlockBuilder()
|
||||
: properties_block_(
|
||||
new BlockBuilder(port::kMaxInt32 /* restart interval */)) {}
|
||||
: properties_block_(new BlockBuilder(1 /* restart interval */)) {}
|
||||
|
||||
void PropertyBlockBuilder::Add(const std::string& name,
|
||||
const std::string& val) {
|
||||
|
@ -75,7 +75,8 @@ class PartitionedFilterBlockTest : public testing::Test {
|
||||
auto partition_size =
|
||||
filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2);
|
||||
delete filter_bits_reader;
|
||||
return partition_size + table_options_.block_size_deviation;
|
||||
return partition_size +
|
||||
partition_size * table_options_.block_size_deviation / 100;
|
||||
}
|
||||
|
||||
int last_offset = 10;
|
||||
@ -94,8 +95,10 @@ class PartitionedFilterBlockTest : public testing::Test {
|
||||
PartitionedIndexBuilder* const p_index_builder) {
|
||||
assert(table_options_.block_size_deviation <= 100);
|
||||
auto partition_size = static_cast<uint32_t>(
|
||||
table_options_.metadata_block_size *
|
||||
( 100 - table_options_.block_size_deviation));
|
||||
((table_options_.metadata_block_size *
|
||||
(100 - table_options_.block_size_deviation)) +
|
||||
99) /
|
||||
100);
|
||||
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
|
||||
return new PartitionedFilterBlockBuilder(
|
||||
nullptr, table_options_.whole_key_filtering,
|
||||
|
@ -148,6 +148,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
|
||||
Status s;
|
||||
if (DeleteScheduler::IsTrashFile(file_path)) {
|
||||
// This is already a trash file
|
||||
*trash_file = file_path;
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -1,70 +0,0 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#include "port/stack_trace.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "util/testharness.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class SliceTest : public testing::Test {};
|
||||
|
||||
namespace {
|
||||
void BumpCounter(void* arg1, void* arg2) {
|
||||
(*reinterpret_cast<int*>(arg1))++;
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F(SliceTest, PinnableSliceMoveConstruct) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
int orig_cleanup = 0;
|
||||
int moved_cleanup = 0;
|
||||
PinnableSlice* s1 = nullptr;
|
||||
std::string external_storage;
|
||||
switch (i) {
|
||||
case 0:
|
||||
s1 = new PinnableSlice();
|
||||
*(s1->GetSelf()) = "foo";
|
||||
s1->PinSelf();
|
||||
s1->RegisterCleanup(BumpCounter, &moved_cleanup, nullptr);
|
||||
break;
|
||||
case 1:
|
||||
s1 = new PinnableSlice(&external_storage);
|
||||
*(s1->GetSelf()) = "foo";
|
||||
s1->PinSelf();
|
||||
s1->RegisterCleanup(BumpCounter, &moved_cleanup, nullptr);
|
||||
break;
|
||||
case 2:
|
||||
s1 = new PinnableSlice();
|
||||
s1->PinSlice("foo", BumpCounter, &moved_cleanup, nullptr);
|
||||
break;
|
||||
}
|
||||
ASSERT_EQ("foo", s1->ToString());
|
||||
PinnableSlice* s2 = new PinnableSlice();
|
||||
s2->PinSelf("bar");
|
||||
ASSERT_EQ("bar", s2->ToString());
|
||||
s2->RegisterCleanup(BumpCounter, &orig_cleanup, nullptr);
|
||||
*s2 = std::move(*s1);
|
||||
ASSERT_EQ("foo", s2->ToString());
|
||||
ASSERT_EQ(1, orig_cleanup);
|
||||
ASSERT_EQ(0, moved_cleanup);
|
||||
delete s1;
|
||||
// ASAN will check if it will access storage of s1, which is deleted.
|
||||
ASSERT_EQ("foo", s2->ToString());
|
||||
ASSERT_EQ(1, orig_cleanup);
|
||||
ASSERT_EQ(0, moved_cleanup);
|
||||
delete s2;
|
||||
ASSERT_EQ(1, orig_cleanup);
|
||||
ASSERT_EQ(1, moved_cleanup);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
rocksdb::port::InstallStackTraceHandler();
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
utilities/blob_db/blob_compaction_filter.h (new file)
@ -0,0 +1,90 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "monitoring/statistics.h"
|
||||
#include "rocksdb/compaction_filter.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "utilities/blob_db/blob_index.h"
|
||||
|
||||
namespace rocksdb {
|
||||
namespace blob_db {
|
||||
|
||||
// CompactionFilter to delete expired blob index from base DB.
|
||||
class BlobIndexCompactionFilter : public CompactionFilter {
|
||||
public:
|
||||
BlobIndexCompactionFilter(uint64_t current_time, Statistics* statistics)
|
||||
: current_time_(current_time), statistics_(statistics) {}
|
||||
|
||||
virtual ~BlobIndexCompactionFilter() {
|
||||
RecordTick(statistics_, BLOB_DB_BLOB_INDEX_EXPIRED, expired_count_);
|
||||
}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "BlobIndexCompactionFilter";
|
||||
}
|
||||
|
||||
// Filter expired blob indexes regardless of snapshots.
|
||||
virtual bool IgnoreSnapshots() const override { return true; }
|
||||
|
||||
virtual Decision FilterV2(int /*level*/, const Slice& /*key*/,
|
||||
ValueType value_type, const Slice& value,
|
||||
std::string* /*new_value*/,
|
||||
std::string* /*skip_until*/) const override {
|
||||
if (value_type != kBlobIndex) {
|
||||
return Decision::kKeep;
|
||||
}
|
||||
BlobIndex blob_index;
|
||||
Status s = blob_index.DecodeFrom(value);
|
||||
if (!s.ok()) {
|
||||
// Unable to decode blob index. Keeping the value.
|
||||
return Decision::kKeep;
|
||||
}
|
||||
if (blob_index.HasTTL() && blob_index.expiration() <= current_time_) {
|
||||
// Expired
|
||||
expired_count_++;
|
||||
return Decision::kRemove;
|
||||
}
|
||||
return Decision::kKeep;
|
||||
}
|
||||
|
||||
private:
|
||||
const uint64_t current_time_;
|
||||
Statistics* statistics_;
|
||||
// It is safe to not use std::atomic since the compaction filter, created
// from a compaction filter factory, will not be called from multiple threads.
|
||||
mutable uint64_t expired_count_ = 0;
|
||||
};
|
||||
|
||||
class BlobIndexCompactionFilterFactory : public CompactionFilterFactory {
|
||||
public:
|
||||
BlobIndexCompactionFilterFactory(Env* env, Statistics* statistics)
|
||||
: env_(env), statistics_(statistics) {}
|
||||
|
||||
virtual const char* Name() const override {
|
||||
return "BlobIndexCompactionFilterFactory";
|
||||
}
|
||||
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context& /*context*/) override {
|
||||
int64_t current_time = 0;
|
||||
Status s = env_->GetCurrentTime(&current_time);
|
||||
if (!s.ok()) {
|
||||
return nullptr;
|
||||
}
|
||||
assert(current_time >= 0);
|
||||
return std::unique_ptr<CompactionFilter>(new BlobIndexCompactionFilter(
|
||||
static_cast<uint64_t>(current_time), statistics_));
|
||||
}
|
||||
|
||||
private:
|
||||
Env* env_;
|
||||
Statistics* statistics_;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
@ -26,6 +26,7 @@
|
||||
#include "table/block_builder.h"
|
||||
#include "util/file_reader_writer.h"
|
||||
#include "util/filename.h"
|
||||
#include "utilities/blob_db/blob_compaction_filter.h"
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -45,6 +46,11 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
const BlobDBOptions& bdb_options,
|
||||
const std::string& dbname, BlobDB** blob_db,
|
||||
Options* changed_options) {
|
||||
if (options.compaction_filter != nullptr ||
|
||||
options.compaction_filter_factory != nullptr) {
|
||||
return Status::NotSupported("Blob DB doesn't support compaction filter.");
|
||||
}
|
||||
|
||||
*changed_options = options;
|
||||
*blob_db = nullptr;
|
||||
|
||||
@ -57,12 +63,19 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
{
|
||||
MutexLock l(&listener_mutex);
|
||||
all_blobdb_listeners.push_back(fblistener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
}
|
||||
all_wal_filters.push_back(rw_filter);
|
||||
}
|
||||
|
||||
changed_options->compaction_filter_factory.reset(
|
||||
new BlobIndexCompactionFilterFactory(options.env,
|
||||
options.statistics.get()));
|
||||
changed_options->listeners.emplace_back(fblistener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
changed_options->listeners.emplace_back(ce_listener);
|
||||
}
|
||||
changed_options->wal_filter = rw_filter.get();
|
||||
|
||||
DBOptions db_options(*changed_options);
|
||||
@ -71,7 +84,9 @@ Status BlobDB::OpenAndLoad(const Options& options,
|
||||
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
|
||||
|
||||
fblistener->SetImplPtr(bdb);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
}
|
||||
rw_filter->SetImplPtr(bdb);
|
||||
|
||||
Status s = bdb->OpenPhase1();
|
||||
@ -106,6 +121,11 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
std::vector<ColumnFamilyHandle*>* handles, BlobDB** blob_db,
|
||||
bool no_base_db) {
|
||||
if (column_families.size() != 1 ||
|
||||
column_families[0].name != kDefaultColumnFamilyName) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
}
|
||||
*blob_db = nullptr;
|
||||
Status s;
|
||||
|
||||
@ -124,20 +144,36 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
ReconcileWalFilter_t rw_filter = std::make_shared<BlobReconcileWalFilter>();
|
||||
|
||||
db_options.listeners.emplace_back(fblistener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
db_options.listeners.emplace_back(ce_listener);
|
||||
}
|
||||
db_options.wal_filter = rw_filter.get();
|
||||
|
||||
{
|
||||
MutexLock l(&listener_mutex);
|
||||
all_blobdb_listeners.push_back(fblistener);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
all_blobdb_listeners.push_back(ce_listener);
|
||||
}
|
||||
all_wal_filters.push_back(rw_filter);
|
||||
}
|
||||
|
||||
ColumnFamilyOptions cf_options(column_families[0].options);
|
||||
if (cf_options.compaction_filter != nullptr ||
|
||||
cf_options.compaction_filter_factory != nullptr) {
|
||||
return Status::NotSupported("Blob DB doesn't support compaction filter.");
|
||||
}
|
||||
cf_options.compaction_filter_factory.reset(
|
||||
new BlobIndexCompactionFilterFactory(db_options.env,
|
||||
db_options.statistics.get()));
|
||||
ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options);
|
||||
|
||||
// we need to open blob db first so that recovery can happen
|
||||
BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options);
|
||||
fblistener->SetImplPtr(bdb);
|
||||
if (bdb_options.enable_garbage_collection) {
|
||||
ce_listener->SetImplPtr(bdb);
|
||||
}
|
||||
rw_filter->SetImplPtr(bdb);
|
||||
|
||||
s = bdb->OpenPhase1();
|
||||
@ -152,7 +188,7 @@ Status BlobDB::Open(const DBOptions& db_options_input,
|
||||
}
|
||||
|
||||
DB* db = nullptr;
|
||||
s = DB::Open(db_options, dbname, column_families, handles, &db);
|
||||
s = DB::Open(db_options, dbname, {cf_descriptor}, handles, &db);
|
||||
if (!s.ok()) {
|
||||
delete bdb;
|
||||
return s;
|
||||
@ -190,6 +226,8 @@ void BlobDBOptions::Dump(Logger* log) const {
|
||||
ttl_extractor.get());
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.compression: %d",
|
||||
static_cast<int>(compression));
|
||||
ROCKS_LOG_HEADER(log, "blob_db_options.enable_garbage_collection: %d",
|
||||
enable_garbage_collection);
|
||||
ROCKS_LOG_HEADER(log, " blob_db_options.disable_background_tasks: %d",
|
||||
disable_background_tasks);
|
||||
}
|
||||
|
@ -71,7 +71,12 @@ struct BlobDBOptions {
// what compression to use for Blob's
CompressionType compression = kNoCompression;

// Disable all background job.
// If enabled, blob DB periodically cleanup stale data by rewriting remaining
// live data in blob files to new files. If garbage collection is not enabled,
// blob files will be cleanup based on TTL.
bool enable_garbage_collection = false;

// Disable all background job. Used for test only.
bool disable_background_tasks = false;

void Dump(Logger* log) const;

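For context, a rough sketch of opening BlobDB with the options touched here; the path and size cap are made up, and the exact BlobDB::Open overload and header path should be checked against this branch:

// Sketch: open BlobDB with GC enabled and a bounded blob directory.
#include "utilities/blob_db/blob_db.h"

rocksdb::Status OpenBlobDb(rocksdb::blob_db::BlobDB** blob_db) {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::blob_db::BlobDBOptions bdb_options;
  bdb_options.enable_garbage_collection = true;  // new option in this branch
  bdb_options.is_fifo = true;                    // evict oldest file when full
  bdb_options.blob_dir_size = 1ULL << 30;        // hypothetical 1 GB cap

  return rocksdb::blob_db::BlobDB::Open(options, bdb_options,
                                        "/tmp/blobdb_example", blob_db);
}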
@ -14,6 +14,7 @@
|
||||
#include "db/db_impl.h"
|
||||
#include "db/write_batch_internal.h"
|
||||
#include "monitoring/instrumented_mutex.h"
|
||||
#include "monitoring/statistics.h"
|
||||
#include "rocksdb/convenience.h"
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
@ -30,6 +31,7 @@
|
||||
#include "util/logging.h"
|
||||
#include "util/mutexlock.h"
|
||||
#include "util/random.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "util/sync_point.h"
|
||||
#include "util/timer_queue.h"
|
||||
#include "utilities/blob_db/blob_db_iterator.h"
|
||||
@ -62,13 +64,14 @@ bool blobf_compare_ttl::operator()(const std::shared_ptr<BlobFile>& lhs,
|
||||
if (lhs->expiration_range_.first > rhs->expiration_range_.first) {
|
||||
return false;
|
||||
}
|
||||
return lhs->BlobFileNumber() > rhs->BlobFileNumber();
|
||||
return lhs->BlobFileNumber() < rhs->BlobFileNumber();
|
||||
}
|
||||
|
||||
void EvictAllVersionsCompactionListener::InternalListener::OnCompaction(
|
||||
int level, const Slice& key,
|
||||
CompactionEventListener::CompactionListenerValueType value_type,
|
||||
const Slice& existing_value, const SequenceNumber& sn, bool is_new) {
|
||||
assert(impl_->bdb_options_.enable_garbage_collection);
|
||||
if (!is_new &&
|
||||
value_type ==
|
||||
CompactionEventListener::CompactionListenerValueType::kValue) {
|
||||
@ -105,19 +108,17 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
|
||||
bdb_options_(blob_db_options),
|
||||
db_options_(db_options),
|
||||
env_options_(db_options),
|
||||
statistics_(db_options_.statistics.get()),
|
||||
dir_change_(false),
|
||||
next_file_number_(1),
|
||||
epoch_of_(0),
|
||||
shutdown_(false),
|
||||
current_epoch_(0),
|
||||
open_file_count_(0),
|
||||
last_period_write_(0),
|
||||
last_period_ampl_(0),
|
||||
total_periods_write_(0),
|
||||
total_periods_ampl_(0),
|
||||
total_blob_space_(0),
|
||||
open_p1_done_(false),
|
||||
debug_level_(0) {
|
||||
debug_level_(0),
|
||||
oldest_file_evicted_(false) {
|
||||
blob_dir_ = (bdb_options_.path_relative)
|
||||
? dbname + "/" + bdb_options_.blob_dir
|
||||
: bdb_options_.blob_dir;
|
||||
@ -161,17 +162,15 @@ BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
|
||||
bdb_options_(blob_db_options),
|
||||
db_options_(db->GetOptions()),
|
||||
env_options_(db_->GetOptions()),
|
||||
statistics_(db_options_.statistics.get()),
|
||||
dir_change_(false),
|
||||
next_file_number_(1),
|
||||
epoch_of_(0),
|
||||
shutdown_(false),
|
||||
current_epoch_(0),
|
||||
open_file_count_(0),
|
||||
last_period_write_(0),
|
||||
last_period_ampl_(0),
|
||||
total_periods_write_(0),
|
||||
total_periods_ampl_(0),
|
||||
total_blob_space_(0) {
|
||||
total_blob_space_(0),
|
||||
oldest_file_evicted_(false) {
|
||||
if (!bdb_options_.blob_dir.empty())
|
||||
blob_dir_ = (bdb_options_.path_relative)
|
||||
? db_->GetName() + "/" + bdb_options_.blob_dir
|
||||
@ -211,19 +210,19 @@ void BlobDBImpl::StartBackgroundTasks() {
|
||||
std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1));
|
||||
tqueue_.add(kGCCheckPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1));
|
||||
if (bdb_options_.enable_garbage_collection) {
|
||||
tqueue_.add(
|
||||
kDeleteCheckPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1));
|
||||
tqueue_.add(
|
||||
kDeleteCheckPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1));
|
||||
}
|
||||
tqueue_.add(
|
||||
kDeleteObsoleteFilesPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::DeleteObsoleteFiles, this, std::placeholders::_1));
|
||||
tqueue_.add(kSanityCheckPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1));
|
||||
tqueue_.add(kWriteAmplificationStatsPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1));
|
||||
tqueue_.add(kFSyncFilesPeriodMillisecs,
|
||||
std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1));
|
||||
tqueue_.add(
|
||||
@ -325,6 +324,7 @@ Status BlobDBImpl::OpenAllFiles() {
|
||||
continue;
|
||||
}
|
||||
bfptr->SetHasTTL(bfptr->header_.has_ttl);
|
||||
bfptr->SetCompression(bfptr->header_.compression);
|
||||
bfptr->header_valid_ = true;
|
||||
|
||||
std::shared_ptr<RandomAccessFileReader> ra_reader =
|
||||
@ -484,8 +484,8 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
|
||||
}
|
||||
|
||||
bfile->log_writer_ = std::make_shared<Writer>(
|
||||
std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync,
|
||||
db_options_.use_fsync, boffset);
|
||||
std::move(fwriter), env_, statistics_, bfile->file_number_,
|
||||
bdb_options_.bytes_per_sync, db_options_.use_fsync, boffset);
|
||||
bfile->log_writer_->last_elem_type_ = et;
|
||||
|
||||
return s;
|
||||
@ -562,6 +562,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFile() {
|
||||
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
|
||||
bfile->header_valid_ = true;
|
||||
bfile->SetHasTTL(false);
|
||||
bfile->SetCompression(bdb_options_.compression);
|
||||
|
||||
Status s = writer->WriteHeader(bfile->header_);
|
||||
if (!s.ok()) {
|
||||
@ -622,6 +623,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::SelectBlobFileTTL(uint64_t expiration) {
|
||||
;
|
||||
bfile->header_valid_ = true;
|
||||
bfile->SetHasTTL(true);
|
||||
bfile->SetCompression(bdb_options_.compression);
|
||||
bfile->file_size_ = BlobLogHeader::kSize;
|
||||
|
||||
// set the first value of the range, since that is
|
||||
@ -657,8 +659,10 @@ Status BlobDBImpl::Delete(const WriteOptions& options, const Slice& key) {
|
||||
SequenceNumber lsn = db_impl_->GetLatestSequenceNumber();
|
||||
Status s = db_->Delete(options, key);
|
||||
|
||||
if (bdb_options_.enable_garbage_collection) {
|
||||
// add deleted key to list of keys that have been deleted for book-keeping
|
||||
delete_keys_q_.enqueue({DefaultColumnFamily(), key.ToString(), lsn});
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -735,13 +739,23 @@ class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
|
||||
};
|
||||
|
||||
Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
MutexLock l(&write_mutex_);
|
||||
|
||||
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_WRITE);
|
||||
uint32_t default_cf_id =
|
||||
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
|
||||
// TODO(yiwu): In case there are multiple writers the latest sequence would
|
||||
// not be the actually sequence we are writting. Need to get the sequence
|
||||
// from write batch after DB write instead.
|
||||
SequenceNumber current_seq = GetLatestSequenceNumber() + 1;
|
||||
Status s;
|
||||
BlobInserter blob_inserter(options, this, default_cf_id, current_seq);
|
||||
Status s = updates->Iterate(&blob_inserter);
|
||||
{
|
||||
// Release write_mutex_ before DB write to avoid race condition with
|
||||
// flush begin listener, which also require write_mutex_ to sync
|
||||
// blob files.
|
||||
MutexLock l(&write_mutex_);
|
||||
s = updates->Iterate(&blob_inserter);
|
||||
}
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
@ -749,7 +763,6 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
assert(blob_inserter.sequence() == GetLatestSequenceNumber() + 1);
|
||||
|
||||
// add deleted key to list of keys that have been deleted for book-keeping
|
||||
class DeleteBookkeeper : public WriteBatch::Handler {
|
||||
@ -778,11 +791,13 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
|
||||
SequenceNumber sequence_;
|
||||
};
|
||||
|
||||
if (bdb_options_.enable_garbage_collection) {
|
||||
// add deleted key to list of keys that have been deleted for book-keeping
|
||||
DeleteBookkeeper delete_bookkeeper(this, current_seq);
|
||||
updates->Iterate(&delete_bookkeeper);
|
||||
s = updates->Iterate(&delete_bookkeeper);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
return s;
|
||||
}
|
||||
|
||||
Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
@ -836,20 +851,32 @@ Status BlobDBImpl::PutWithTTL(const WriteOptions& options,
|
||||
|
||||
Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration) {
|
||||
MutexLock l(&write_mutex_);
|
||||
SequenceNumber sequence = GetLatestSequenceNumber() + 1;
|
||||
StopWatch write_sw(env_, statistics_, BLOB_DB_WRITE_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_PUT);
|
||||
TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start");
|
||||
Status s;
|
||||
WriteBatch batch;
|
||||
Status s = PutBlobValue(options, key, value, expiration, sequence, &batch);
|
||||
{
|
||||
// Release write_mutex_ before DB write to avoid race condition with
|
||||
// flush begin listener, which also require write_mutex_ to sync
|
||||
// blob files.
|
||||
MutexLock l(&write_mutex_);
|
||||
// TODO(yiwu): In case there are multiple writers the latest sequence would
|
||||
// not be the actually sequence we are writting. Need to get the sequence
|
||||
// from write batch after DB write instead.
|
||||
SequenceNumber sequence = GetLatestSequenceNumber() + 1;
|
||||
s = PutBlobValue(options, key, value, expiration, sequence, &batch);
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = db_->Write(options, &batch);
|
||||
}
|
||||
TEST_SYNC_POINT("BlobDBImpl::PutUntil:Finish");
|
||||
return s;
|
||||
}
|
||||
|
||||
Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value, uint64_t expiration,
|
||||
SequenceNumber sequence, WriteBatch* batch) {
|
||||
TEST_SYNC_POINT("BlobDBImpl::PutBlobValue:Start");
|
||||
Status s;
|
||||
std::string index_entry;
|
||||
uint32_t column_family_id =
|
||||
@ -858,11 +885,13 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
if (expiration == kNoExpiration) {
|
||||
// Put as normal value
|
||||
s = batch->Put(key, value);
|
||||
RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
|
||||
} else {
|
||||
// Inlined with TTL
|
||||
BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
|
||||
s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
|
||||
index_entry);
|
||||
RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
|
||||
}
|
||||
} else {
|
||||
std::shared_ptr<BlobFile> bfile = (expiration != kNoExpiration)
|
||||
@ -872,6 +901,7 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
return Status::NotFound("Blob file not found");
|
||||
}
|
||||
|
||||
assert(bfile->compression() == bdb_options_.compression);
|
||||
std::string compression_output;
|
||||
Slice value_compressed = GetCompressedSlice(value, &compression_output);
|
||||
|
||||
@ -880,6 +910,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
|
||||
s = AppendBlob(bfile, headerbuf, key, value_compressed, expiration,
|
||||
&index_entry);
|
||||
if (expiration == kNoExpiration) {
|
||||
RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
|
||||
} else {
|
||||
RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
|
||||
}
|
||||
|
||||
if (s.ok()) {
|
||||
bfile->ExtendSequenceRange(sequence);
|
||||
@ -901,7 +936,11 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& options, const Slice& key,
|
||||
}
|
||||
}
|
||||
|
||||
TEST_SYNC_POINT("BlobDBImpl::PutBlobValue:Finish");
|
||||
RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
|
||||
RecordTick(statistics_, BLOB_DB_BYTES_WRITTEN, key.size() + value.size());
|
||||
MeasureTime(statistics_, BLOB_DB_KEY_SIZE, key.size());
|
||||
MeasureTime(statistics_, BLOB_DB_VALUE_SIZE, value.size());
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -910,6 +949,7 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
|
||||
if (bdb_options_.compression == kNoCompression) {
|
||||
return raw;
|
||||
}
|
||||
StopWatch compression_sw(env_, statistics_, BLOB_DB_COMPRESSION_MICROS);
|
||||
CompressionType ct = bdb_options_.compression;
|
||||
CompressionOptions compression_opts;
|
||||
CompressBlock(raw, compression_opts, &ct, kBlockBasedTableVersionFormat,
|
||||
@ -931,19 +971,74 @@ uint64_t BlobDBImpl::ExtractExpiration(const Slice& key, const Slice& value,
|
||||
return has_expiration ? expiration : kNoExpiration;
|
||||
}
|
||||
|
||||
std::shared_ptr<BlobFile> BlobDBImpl::GetOldestBlobFile() {
|
||||
std::vector<std::shared_ptr<BlobFile>> blob_files;
|
||||
CopyBlobFiles(&blob_files, [](const std::shared_ptr<BlobFile>& f) {
|
||||
return !f->Obsolete() && f->Immutable();
|
||||
});
|
||||
blobf_compare_ttl compare;
|
||||
return *std::min_element(blob_files.begin(), blob_files.end(), compare);
|
||||
}
|
||||
|
||||
bool BlobDBImpl::EvictOldestBlobFile() {
|
||||
auto oldest_file = GetOldestBlobFile();
|
||||
if (oldest_file == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
WriteLock wl(&mutex_);
|
||||
// Double check the file is not obsolete by others
|
||||
if (oldest_file_evicted_ == false && !oldest_file->Obsolete()) {
|
||||
auto expiration_range = oldest_file->GetExpirationRange();
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"Evict oldest blob file since DB out of space. Current "
|
||||
"space used: %" PRIu64 ", blob dir size: %" PRIu64
|
||||
", evicted blob file #%" PRIu64
|
||||
" with expiration range (%" PRIu64 ", %" PRIu64 ").",
|
||||
total_blob_space_.load(), bdb_options_.blob_dir_size,
|
||||
oldest_file->BlobFileNumber(), expiration_range.first,
|
||||
expiration_range.second);
|
||||
oldest_file->MarkObsolete(oldest_file->GetSequenceRange().second);
|
||||
obsolete_files_.push_back(oldest_file);
|
||||
oldest_file_evicted_.store(true);
|
||||
RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
|
||||
RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
|
||||
oldest_file->BlobCount());
|
||||
RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
|
||||
oldest_file->GetFileSize());
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
Status BlobDBImpl::CheckSize(size_t blob_size) {
|
||||
uint64_t new_space_util = total_blob_space_.load() + blob_size;
|
||||
if (bdb_options_.blob_dir_size > 0) {
|
||||
if (!bdb_options_.is_fifo &&
|
||||
(new_space_util > bdb_options_.blob_dir_size)) {
|
||||
return Status::NoSpace(
|
||||
"Write failed, as writing it would exceed blob_dir_size limit.");
|
||||
}
|
||||
if (bdb_options_.is_fifo && !oldest_file_evicted_.load() &&
|
||||
(new_space_util >
|
||||
kEvictOldestFileAtSize * bdb_options_.blob_dir_size)) {
|
||||
EvictOldestBlobFile();
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
|
||||
const std::string& headerbuf, const Slice& key,
|
||||
const Slice& value, uint64_t expiration,
|
||||
std::string* index_entry) {
|
||||
auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size();
|
||||
if (bdb_options_.blob_dir_size > 0 &&
|
||||
(total_blob_space_.load() + size_put) > bdb_options_.blob_dir_size) {
|
||||
if (!bdb_options_.is_fifo) {
|
||||
return Status::NoSpace("Blob DB reached the maximum configured size.");
|
||||
Status s = CheckSize(size_put);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
Status s;
|
||||
|
||||
uint64_t blob_offset = 0;
|
||||
uint64_t key_offset = 0;
|
||||
@ -968,7 +1063,6 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
|
||||
bfile->blob_count_++;
|
||||
|
||||
bfile->file_size_ += size_put;
|
||||
last_period_write_ += size_put;
|
||||
total_blob_space_ += size_put;
|
||||
|
||||
if (expiration == kNoExpiration) {
|
||||
@ -986,6 +1080,8 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr<BlobFile>& bfile,
|
||||
std::vector<Status> BlobDBImpl::MultiGet(
|
||||
const ReadOptions& read_options,
|
||||
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
||||
StopWatch multiget_sw(env_, statistics_, BLOB_DB_MULTIGET_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_MULTIGET);
|
||||
// Get a snapshot to avoid blob file get deleted between we
|
||||
// fetch and index entry and reading from the file.
|
||||
ReadOptions ro(read_options);
|
||||
@ -1089,7 +1185,12 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
char* buffer = &(*valueptr)[0];
|
||||
|
||||
Slice blob_value;
|
||||
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value, buffer);
|
||||
{
|
||||
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
|
||||
s = reader->Read(blob_index.offset(), blob_index.size(), &blob_value,
|
||||
buffer);
|
||||
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_value.size());
|
||||
}
|
||||
if (!s.ok() || blob_value.size() != blob_index.size()) {
|
||||
if (debug_level_ >= 2) {
|
||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||
@ -1135,15 +1236,17 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
return Status::Corruption("Corruption. Blob CRC mismatch");
|
||||
}
|
||||
|
||||
// TODO(yiwu): Should use compression flag in the blob file instead of
|
||||
// current compression option.
|
||||
if (bdb_options_.compression != kNoCompression) {
|
||||
if (bfile->compression() != kNoCompression) {
|
||||
BlockContents contents;
|
||||
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
|
||||
{
|
||||
StopWatch decompression_sw(env_, statistics_,
|
||||
BLOB_DB_DECOMPRESSION_MICROS);
|
||||
s = UncompressBlockContentsForCompressionType(
|
||||
blob_value.data(), blob_value.size(), &contents,
|
||||
kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
|
||||
kBlockBasedTableVersionFormat, Slice(), bfile->compression(),
|
||||
*(cfh->cfd()->ioptions()));
|
||||
}
|
||||
*(value->GetSelf()) = contents.data.ToString();
|
||||
}
|
||||
|
||||
@ -1155,6 +1258,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
Status BlobDBImpl::Get(const ReadOptions& read_options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
PinnableSlice* value) {
|
||||
StopWatch get_sw(env_, statistics_, BLOB_DB_GET_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_GET);
|
||||
return GetImpl(read_options, column_family, key, value);
|
||||
}
|
||||
|
||||
Status BlobDBImpl::GetImpl(const ReadOptions& read_options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
PinnableSlice* value) {
|
||||
if (column_family != DefaultColumnFamily()) {
|
||||
return Status::NotSupported(
|
||||
"Blob DB doesn't support non-default column family.");
|
||||
@ -1167,19 +1278,21 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
|
||||
|
||||
Status s;
|
||||
bool is_blob_index = false;
|
||||
s = db_impl_->GetImpl(ro, column_family, key, value, nullptr /*value_found*/,
|
||||
nullptr /*read_callback*/, &is_blob_index);
|
||||
s = db_impl_->GetImpl(ro, column_family, key, value,
|
||||
nullptr /*value_found*/, nullptr /*read_callback*/,
|
||||
&is_blob_index);
|
||||
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
|
||||
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
|
||||
if (s.ok()) {
|
||||
if (is_blob_index) {
|
||||
PinnableSlice index_entry = std::move(*value);
|
||||
if (s.ok() && is_blob_index) {
|
||||
std::string index_entry = value->ToString();
|
||||
value->Reset();
|
||||
s = GetBlobValue(key, index_entry, value);
|
||||
}
|
||||
}
|
||||
if (snapshot_created) {
|
||||
db_->ReleaseSnapshot(ro.snapshot);
|
||||
}
|
||||
RecordTick(statistics_, BLOB_DB_NUM_KEYS_READ);
|
||||
RecordTick(statistics_, BLOB_DB_BYTES_READ, value->size());
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -1254,32 +1367,18 @@ Status BlobDBImpl::CloseBlobFileIfNeeded(std::shared_ptr<BlobFile>& bfile) {
|
||||
return CloseBlobFile(bfile);
|
||||
}
|
||||
|
||||
bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked(
|
||||
bool BlobDBImpl::VisibleToActiveSnapshot(
|
||||
const std::shared_ptr<BlobFile>& bfile) {
|
||||
assert(bfile->Obsolete());
|
||||
|
||||
SequenceNumber esn = bfile->GetSequenceRange().first;
|
||||
|
||||
// TODO(yiwu): Here we should check instead if there is an active snapshot
|
||||
// lies between the first sequence in the file, and the last sequence by
|
||||
// the time the file finished being garbage collect.
|
||||
bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn);
|
||||
if (notok) {
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"Could not delete file due to snapshot failure %s",
|
||||
bfile->PathName().c_str());
|
||||
return false;
|
||||
} else {
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"Will delete file due to snapshot success %s",
|
||||
bfile->PathName().c_str());
|
||||
return true;
|
||||
}
|
||||
SequenceNumber first_sequence = bfile->GetSequenceRange().first;
|
||||
SequenceNumber obsolete_sequence = bfile->GetObsoleteSequence();
|
||||
return db_impl_->HasActiveSnapshotInRange(first_sequence, obsolete_sequence);
|
||||
}
|
||||
|
||||
bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||
uint64_t blob_offset,
|
||||
uint64_t blob_size) {
|
||||
assert(bdb_options_.enable_garbage_collection);
|
||||
(void)blob_offset;
|
||||
std::shared_ptr<BlobFile> bfile;
|
||||
{
|
||||
@ -1302,6 +1401,7 @@ bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||
}
|
||||
|
||||
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
|
||||
assert(bdb_options_.enable_garbage_collection);
|
||||
BlobIndex blob_index;
|
||||
Status s = blob_index.DecodeFrom(index_entry);
|
||||
if (!s.ok()) {
|
||||
@ -1316,6 +1416,7 @@ bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
|
||||
}
|
||||
|
||||
std::pair<bool, int64_t> BlobDBImpl::EvictCompacted(bool aborted) {
|
||||
assert(bdb_options_.enable_garbage_collection);
|
||||
if (aborted) return std::make_pair(false, -1);
|
||||
|
||||
override_packet_t packet;
|
||||
@ -1339,6 +1440,7 @@ std::pair<bool, int64_t> BlobDBImpl::EvictCompacted(bool aborted) {
|
||||
}
|
||||
|
||||
std::pair<bool, int64_t> BlobDBImpl::EvictDeletions(bool aborted) {
|
||||
assert(bdb_options_.enable_garbage_collection);
|
||||
if (aborted) return std::make_pair(false, -1);
|
||||
|
||||
ColumnFamilyHandle* last_cfh = nullptr;
|
||||
@ -1478,35 +1580,6 @@ std::pair<bool, int64_t> BlobDBImpl::ReclaimOpenFiles(bool aborted) {
|
||||
return std::make_pair(true, -1);
|
||||
}
|
||||
|
||||
// TODO(yiwu): correct the stats and expose it.
|
||||
std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
|
||||
if (aborted) return std::make_pair(false, -1);
|
||||
|
||||
WriteLock wl(&mutex_);
|
||||
|
||||
if (all_periods_write_.size() >= kWriteAmplificationStatsPeriods) {
|
||||
total_periods_write_ -= (*all_periods_write_.begin());
|
||||
total_periods_ampl_ = (*all_periods_ampl_.begin());
|
||||
|
||||
all_periods_write_.pop_front();
|
||||
all_periods_ampl_.pop_front();
|
||||
}
|
||||
|
||||
uint64_t val1 = last_period_write_.load();
|
||||
uint64_t val2 = last_period_ampl_.load();
|
||||
|
||||
all_periods_write_.push_back(val1);
|
||||
all_periods_ampl_.push_back(val2);
|
||||
|
||||
last_period_write_ = 0;
|
||||
last_period_ampl_ = 0;
|
||||
|
||||
total_periods_write_ += val1;
|
||||
total_periods_ampl_ += val2;
|
||||
|
||||
return std::make_pair(true, -1);
|
||||
}
|
||||
|
||||
// Write callback for garbage collection to check if key has been updated
|
||||
// since last read. Similar to how OptimisticTransaction works. See inline
|
||||
// comment in GCFileAndUpdateLSM().
|
||||
@ -1567,6 +1640,7 @@ class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
|
||||
// DELETED in the LSM
|
||||
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
GCStats* gc_stats) {
|
||||
StopWatch gc_sw(env_, statistics_, BLOB_DB_GC_MICROS);
|
||||
uint64_t now = EpochNow();
|
||||
|
||||
std::shared_ptr<Reader> reader =
|
||||
@ -1648,7 +1722,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
ReadOptions(), cfh, record.key, &index_entry, nullptr /*value_found*/,
|
||||
nullptr /*read_callback*/, &is_blob_index);
|
||||
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB");
|
||||
if (!get_status.ok() && !get_status.ok()) {
|
||||
if (!get_status.ok() && !get_status.IsNotFound()) {
|
||||
// error
|
||||
s = get_status;
|
||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||
@ -1659,6 +1733,8 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
if (get_status.IsNotFound() || !is_blob_index) {
|
||||
// Either the key is deleted or updated with a newer version whish is
|
||||
// inlined in LSM.
|
||||
gc_stats->num_keys_overwritten++;
|
||||
gc_stats->bytes_overwritten += record.record_size();
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1670,18 +1746,23 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
s.ToString().c_str());
|
||||
break;
|
||||
}
|
||||
if (blob_index.file_number() != bfptr->BlobFileNumber() ||
|
||||
if (blob_index.IsInlined() ||
|
||||
blob_index.file_number() != bfptr->BlobFileNumber() ||
|
||||
blob_index.offset() != blob_offset) {
|
||||
// Key has been overwritten. Drop the blob record.
|
||||
gc_stats->num_keys_overwritten++;
|
||||
gc_stats->bytes_overwritten += record.record_size();
|
||||
continue;
|
||||
}
|
||||
|
||||
GarbageCollectionWriteCallback callback(cfd, record.key, latest_seq);
|
||||
|
||||
// If key has expired, remove it from base DB.
|
||||
// TODO(yiwu): Blob indexes will be remove by BlobIndexCompactionFilter.
|
||||
// We can just drop the blob record.
|
||||
if (no_relocation_ttl || (has_ttl && now >= record.expiration)) {
|
||||
gc_stats->num_deletes++;
|
||||
gc_stats->deleted_size += record.value_size;
|
||||
gc_stats->num_keys_expired++;
|
||||
gc_stats->bytes_expired += record.record_size();
|
||||
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
|
||||
WriteBatch delete_batch;
|
||||
Status delete_status = delete_batch.Delete(record.key);
|
||||
@ -1689,12 +1770,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
delete_status = db_impl_->WriteWithCallback(WriteOptions(),
|
||||
&delete_batch, &callback);
|
||||
}
|
||||
if (delete_status.ok()) {
|
||||
gc_stats->delete_succeeded++;
|
||||
} else if (delete_status.IsBusy()) {
|
||||
// The key is overwritten in the meanwhile. Drop the blob record.
|
||||
gc_stats->overwritten_while_delete++;
|
||||
} else {
|
||||
if (!delete_status.ok() && !delete_status.IsBusy()) {
|
||||
// We hit an error.
|
||||
s = delete_status;
|
||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||
@ -1717,7 +1793,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
std::string reason("GC of ");
|
||||
reason += bfptr->PathName();
|
||||
newfile = NewBlobFile(reason);
|
||||
gc_stats->newfile = newfile;
|
||||
|
||||
new_writer = CheckOrCreateWriterLocked(newfile);
|
||||
newfile->header_ = std::move(header);
|
||||
@ -1739,9 +1814,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile));
|
||||
}
|
||||
|
||||
gc_stats->num_relocate++;
|
||||
std::string new_index_entry;
|
||||
|
||||
uint64_t new_blob_offset = 0;
|
||||
uint64_t new_key_offset = 0;
|
||||
// write the blob to the blob log.
|
||||
@ -1765,10 +1838,14 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
&rewrite_batch, &callback);
|
||||
}
|
||||
if (rewrite_status.ok()) {
|
||||
gc_stats->relocate_succeeded++;
|
||||
newfile->ExtendSequenceRange(
|
||||
WriteBatchInternal::Sequence(&rewrite_batch));
|
||||
gc_stats->num_keys_relocated++;
|
||||
gc_stats->bytes_relocated += record.record_size();
|
||||
} else if (rewrite_status.IsBusy()) {
|
||||
// The key is overwritten in the meanwhile. Drop the blob record.
|
||||
gc_stats->overwritten_while_relocate++;
|
||||
gc_stats->num_keys_overwritten++;
|
||||
gc_stats->bytes_overwritten += record.record_size();
|
||||
} else {
|
||||
// We hit an error.
|
||||
s = rewrite_status;
|
||||
@ -1778,19 +1855,47 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||
}
|
||||
} // end of ReadRecord loop
|
||||
|
||||
if (s.ok()) {
|
||||
SequenceNumber obsolete_sequence =
|
||||
newfile == nullptr ? bfptr->GetSequenceRange().second + 1
|
||||
: newfile->GetSequenceRange().second;
|
||||
bfptr->MarkObsolete(obsolete_sequence);
|
||||
if (!first_gc) {
|
||||
WriteLock wl(&mutex_);
|
||||
obsolete_files_.push_back(bfptr);
|
||||
}
|
||||
}
|
||||
|
||||
ROCKS_LOG_INFO(
|
||||
db_options_.info_log,
|
||||
"%s blob file %" PRIu64
|
||||
". Total blob records: %" PRIu64 ", Deletes: %" PRIu64 "/%" PRIu64
|
||||
" succeeded, Relocates: %" PRIu64 "/%" PRIu64 " succeeded.",
|
||||
"%s blob file %" PRIu64 ". Total blob records: %" PRIu64
|
||||
", Expired: %" PRIu64 " keys/%" PRIu64 " bytes, Overwritten: %" PRIu64
|
||||
" keys/%" PRIu64 " bytes.",
|
||||
s.ok() ? "Successfully garbage collected" : "Failed to garbage collect",
|
||||
bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->delete_succeeded,
|
||||
gc_stats->num_deletes, gc_stats->relocate_succeeded,
|
||||
gc_stats->num_relocate);
|
||||
bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->num_keys_expired,
|
||||
gc_stats->bytes_expired, gc_stats->num_keys_overwritten,
|
||||
gc_stats->bytes_overwritten, gc_stats->num_keys_relocated,
|
||||
gc_stats->bytes_relocated);
|
||||
RecordTick(statistics_, BLOB_DB_GC_NUM_FILES);
|
||||
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_OVERWRITTEN,
|
||||
gc_stats->num_keys_overwritten);
|
||||
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_EXPIRED,
|
||||
gc_stats->num_keys_expired);
|
||||
RecordTick(statistics_, BLOB_DB_GC_BYTES_OVERWRITTEN,
|
||||
gc_stats->bytes_overwritten);
|
||||
RecordTick(statistics_, BLOB_DB_GC_BYTES_EXPIRED, gc_stats->bytes_expired);
|
||||
if (newfile != nullptr) {
|
||||
total_blob_space_ += newfile->file_size_;
|
||||
ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".",
|
||||
newfile->BlobFileNumber());
|
||||
RecordTick(statistics_, BLOB_DB_GC_NUM_NEW_FILES);
|
||||
RecordTick(statistics_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
|
||||
gc_stats->num_keys_relocated);
|
||||
RecordTick(statistics_, BLOB_DB_GC_BYTES_RELOCATED,
|
||||
gc_stats->bytes_relocated);
|
||||
}
|
||||
if (!s.ok()) {
|
||||
RecordTick(statistics_, BLOB_DB_GC_FAILURES);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
@ -1844,11 +1949,13 @@ bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, uint64_t now,
|
||||
|
||||
ReadLock lockbfile_r(&bfile->mutex_);
|
||||
|
||||
if (bdb_options_.enable_garbage_collection) {
|
||||
if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
|
||||
kPartialExpirationPercentage) {
|
||||
*reason = "deleted simple blobs beyond threshold";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// if we haven't reached limits of disk space, don't DELETE
|
||||
if (bdb_options_.blob_dir_size == 0 ||
|
||||
@ -1884,11 +1991,17 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
|
||||
auto bfile = *iter;
|
||||
{
|
||||
ReadLock lockbfile_r(&bfile->mutex_);
|
||||
if (!FileDeleteOk_SnapshotCheckLocked(bfile)) {
|
||||
if (VisibleToActiveSnapshot(bfile)) {
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"Could not delete file due to snapshot failure %s",
|
||||
bfile->PathName().c_str());
|
||||
++iter;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"Will delete file due to snapshot success %s",
|
||||
bfile->PathName().c_str());
|
||||
|
||||
blob_files_.erase(bfile->BlobFileNumber());
|
||||
Status s = env_->DeleteFile(bfile->PathName());
|
||||
@ -1910,7 +2023,12 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
|
||||
}
|
||||
|
||||
// directory change. Fsync
|
||||
if (file_deleted) dir_ent_->Fsync();
|
||||
if (file_deleted) {
|
||||
dir_ent_->Fsync();
|
||||
|
||||
// reset oldest_file_evicted flag
|
||||
oldest_file_evicted_.store(false);
|
||||
}
|
||||
|
||||
// put files back into obsolete if for some reason, delete failed
|
||||
if (!tobsolete.empty()) {
|
||||
@ -1924,14 +2042,19 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
|
||||
}
|
||||
|
||||
void BlobDBImpl::CopyBlobFiles(
|
||||
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
|
||||
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
|
||||
std::function<bool(const std::shared_ptr<BlobFile>&)> predicate) {
|
||||
ReadLock rl(&mutex_);
|
||||
|
||||
// take a copy
|
||||
bfiles_copy->reserve(blob_files_.size());
|
||||
for (auto const& p : blob_files_) {
|
||||
bool pred_value = true;
|
||||
if (predicate) {
|
||||
pred_value = predicate(p.second);
|
||||
}
|
||||
if (pred_value) {
|
||||
bfiles_copy->push_back(p.second);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void BlobDBImpl::FilterSubsetOfFiles(
|
||||
@ -2008,8 +2131,6 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
|
||||
FilterSubsetOfFiles(blob_files, &to_process, current_epoch_,
|
||||
files_to_collect);
|
||||
|
||||
// in this collect the set of files, which became obsolete
|
||||
std::vector<std::shared_ptr<BlobFile>> obsoletes;
|
||||
for (auto bfile : to_process) {
|
||||
GCStats gc_stats;
|
||||
Status s = GCFileAndUpdateLSM(bfile, &gc_stats);
|
||||
@ -2020,19 +2141,11 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
|
||||
if (bfile->gc_once_after_open_.load()) {
|
||||
WriteLock lockbfile_w(&bfile->mutex_);
|
||||
|
||||
bfile->deleted_size_ = gc_stats.deleted_size;
|
||||
bfile->deleted_count_ = gc_stats.num_deletes;
|
||||
bfile->deleted_size_ =
|
||||
gc_stats.bytes_overwritten + gc_stats.bytes_expired;
|
||||
bfile->deleted_count_ =
|
||||
gc_stats.num_keys_overwritten + gc_stats.num_keys_expired;
|
||||
bfile->gc_once_after_open_ = false;
|
||||
} else {
|
||||
obsoletes.push_back(bfile);
|
||||
}
|
||||
}
|
||||
|
||||
if (!obsoletes.empty()) {
|
||||
WriteLock wl(&mutex_);
|
||||
for (auto bfile : obsoletes) {
|
||||
bfile->SetCanBeDeleted();
|
||||
obsolete_files_.push_front(bfile);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2054,7 +2167,7 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
|
||||
auto* iter = db_impl_->NewIteratorImpl(
|
||||
read_options, cfd, snapshot->GetSequenceNumber(),
|
||||
nullptr /*read_callback*/, true /*allow_blob*/);
|
||||
return new BlobDBIterator(own_snapshot, iter, this);
|
||||
return new BlobDBIterator(own_snapshot, iter, this, env_, statistics_);
|
||||
}
|
||||
|
||||
Status DestroyBlobDB(const std::string& dbname, const Options& options,
|
||||
@ -2129,16 +2242,6 @@ Status BlobDBImpl::TEST_GCFileAndUpdateLSM(std::shared_ptr<BlobFile>& bfile,
|
||||
}
|
||||
|
||||
void BlobDBImpl::TEST_RunGC() { RunGC(false /*abort*/); }
|
||||
|
||||
void BlobDBImpl::TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile) {
|
||||
uint64_t number = bfile->BlobFileNumber();
|
||||
assert(blob_files_.count(number) > 0);
|
||||
bfile->SetCanBeDeleted();
|
||||
{
|
||||
WriteLock l(&mutex_);
|
||||
obsolete_files_.push_back(bfile);
|
||||
}
|
||||
}
|
||||
#endif // !NDEBUG
|
||||
|
||||
} // namespace blob_db
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "rocksdb/db.h"
|
||||
#include "rocksdb/listener.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/wal_filter.h"
|
||||
#include "util/mpsc.h"
|
||||
#include "util/mutexlock.h"
|
||||
@ -135,16 +136,12 @@ struct blobf_compare_ttl {
|
||||
|
||||
struct GCStats {
|
||||
uint64_t blob_count = 0;
|
||||
uint64_t num_deletes = 0;
|
||||
uint64_t deleted_size = 0;
|
||||
uint64_t retry_delete = 0;
|
||||
uint64_t delete_succeeded = 0;
|
||||
uint64_t overwritten_while_delete = 0;
|
||||
uint64_t num_relocate = 0;
|
||||
uint64_t retry_relocate = 0;
|
||||
uint64_t relocate_succeeded = 0;
|
||||
uint64_t overwritten_while_relocate = 0;
|
||||
std::shared_ptr<BlobFile> newfile = nullptr;
|
||||
uint64_t num_keys_overwritten = 0;
|
||||
uint64_t num_keys_expired = 0;
|
||||
uint64_t num_keys_relocated = 0;
|
||||
uint64_t bytes_overwritten = 0;
|
||||
uint64_t bytes_expired = 0;
|
||||
uint64_t bytes_relocated = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -178,10 +175,6 @@ class BlobDBImpl : public BlobDB {
|
||||
// how many periods of stats do we keep.
|
||||
static constexpr uint32_t kWriteAmplificationStatsPeriods = 24;
|
||||
|
||||
// what is the length of any period
|
||||
static constexpr uint32_t kWriteAmplificationStatsPeriodMillisecs =
|
||||
3600 * 1000;
|
||||
|
||||
// we will garbage collect blob files in
|
||||
// which entire files have expired. However if the
|
||||
// ttl_range of files is very large say a day, we
|
||||
@ -205,6 +198,10 @@ class BlobDBImpl : public BlobDB {
|
||||
// how often to schedule check seq files period
|
||||
static constexpr uint32_t kCheckSeqFilesPeriodMillisecs = 10 * 1000;
|
||||
|
||||
// when should oldest file be evicted:
|
||||
// on reaching 90% of blob_dir_size
|
||||
static constexpr double kEvictOldestFileAtSize = 0.9;
|
||||
|
||||
using BlobDB::Put;
|
||||
Status Put(const WriteOptions& options, const Slice& key,
|
||||
const Slice& value) override;
|
||||
@ -275,8 +272,6 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
void TEST_RunGC();
|
||||
|
||||
void TEST_ObsoleteFile(std::shared_ptr<BlobFile>& bfile);
|
||||
|
||||
void TEST_DeleteObsoleteFiles();
|
||||
#endif // !NDEBUG
|
||||
|
||||
@ -290,6 +285,10 @@ class BlobDBImpl : public BlobDB {
|
||||
// Return true if a snapshot is created.
|
||||
bool SetSnapshotIfNeeded(ReadOptions* read_options);
|
||||
|
||||
Status GetImpl(const ReadOptions& read_options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
PinnableSlice* value);
|
||||
|
||||
Status GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||
PinnableSlice* value);
|
||||
|
||||
@ -362,9 +361,6 @@ class BlobDBImpl : public BlobDB {
|
||||
// efficiency
|
||||
std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
|
||||
|
||||
// periodically print write amplification statistics
|
||||
std::pair<bool, int64_t> WaStats(bool aborted);
|
||||
|
||||
// background task to do book-keeping of deleted keys
|
||||
std::pair<bool, int64_t> EvictDeletions(bool aborted);
|
||||
|
||||
@ -407,6 +403,7 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
// checks if there is no snapshot which is referencing the
|
||||
// blobs
|
||||
bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
|
||||
bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
|
||||
|
||||
bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
|
||||
@ -414,7 +411,9 @@ class BlobDBImpl : public BlobDB {
|
||||
bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||
uint64_t blob_offset, uint64_t blob_size);
|
||||
|
||||
void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
|
||||
void CopyBlobFiles(
|
||||
std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
|
||||
std::function<bool(const std::shared_ptr<BlobFile>&)> predicate = {});
|
||||
|
||||
void FilterSubsetOfFiles(
|
||||
const std::vector<std::shared_ptr<BlobFile>>& blob_files,
|
||||
@ -423,6 +422,12 @@ class BlobDBImpl : public BlobDB {
|
||||
|
||||
uint64_t EpochNow() { return env_->NowMicros() / 1000000; }
|
||||
|
||||
Status CheckSize(size_t blob_size);
|
||||
|
||||
std::shared_ptr<BlobFile> GetOldestBlobFile();
|
||||
|
||||
bool EvictOldestBlobFile();
|
||||
|
||||
// the base DB
|
||||
DBImpl* db_impl_;
|
||||
Env* env_;
|
||||
@ -433,6 +438,9 @@ class BlobDBImpl : public BlobDB {
|
||||
DBOptions db_options_;
|
||||
EnvOptions env_options_;
|
||||
|
||||
// Raw pointer of statistic. db_options_ has a shared_ptr to hold ownership.
|
||||
Statistics* statistics_;
|
||||
|
||||
// name of the database directory
|
||||
std::string dbname_;
|
||||
|
||||
@ -508,24 +516,14 @@ class BlobDBImpl : public BlobDB {
|
||||
// counter is used to monitor and close excess RA files.
|
||||
std::atomic<uint32_t> open_file_count_;
|
||||
|
||||
// should hold mutex to modify
|
||||
// STATISTICS for WA of Blob Files due to GC
|
||||
// collect by default 24 hourly periods
|
||||
std::list<uint64_t> all_periods_write_;
|
||||
std::list<uint64_t> all_periods_ampl_;
|
||||
|
||||
std::atomic<uint64_t> last_period_write_;
|
||||
std::atomic<uint64_t> last_period_ampl_;
|
||||
|
||||
uint64_t total_periods_write_;
|
||||
uint64_t total_periods_ampl_;
|
||||
|
||||
// total size of all blob files at a given time
|
||||
std::atomic<uint64_t> total_blob_space_;
|
||||
std::list<std::shared_ptr<BlobFile>> obsolete_files_;
|
||||
bool open_p1_done_;
|
||||
|
||||
uint32_t debug_level_;
|
||||
|
||||
std::atomic<bool> oldest_file_evicted_;
|
||||
};
|
||||
|
||||
} // namespace blob_db
|
||||
|
@ -6,7 +6,9 @@
|
||||
#pragma once
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "monitoring/statistics.h"
|
||||
#include "rocksdb/iterator.h"
|
||||
#include "util/stop_watch.h"
|
||||
#include "utilities/blob_db/blob_db_impl.h"
|
||||
|
||||
namespace rocksdb {
|
||||
@ -17,8 +19,12 @@ using rocksdb::ManagedSnapshot;
|
||||
class BlobDBIterator : public Iterator {
|
||||
public:
|
||||
BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
|
||||
BlobDBImpl* blob_db)
|
||||
: snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {}
|
||||
BlobDBImpl* blob_db, Env* env, Statistics* statistics)
|
||||
: snapshot_(snapshot),
|
||||
iter_(iter),
|
||||
blob_db_(blob_db),
|
||||
env_(env),
|
||||
statistics_(statistics) {}
|
||||
|
||||
virtual ~BlobDBIterator() = default;
|
||||
|
||||
@ -37,33 +43,45 @@ class BlobDBIterator : public Iterator {
|
||||
}
|
||||
|
||||
void SeekToFirst() override {
|
||||
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
|
||||
iter_->SeekToFirst();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void SeekToLast() override {
|
||||
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
|
||||
iter_->SeekToLast();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Seek(const Slice& target) override {
|
||||
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
|
||||
iter_->Seek(target);
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void SeekForPrev(const Slice& target) override {
|
||||
StopWatch seek_sw(env_, statistics_, BLOB_DB_SEEK_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_SEEK);
|
||||
iter_->SeekForPrev(target);
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Next() override {
|
||||
assert(Valid());
|
||||
StopWatch next_sw(env_, statistics_, BLOB_DB_NEXT_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_NEXT);
|
||||
iter_->Next();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
|
||||
void Prev() override {
|
||||
assert(Valid());
|
||||
StopWatch prev_sw(env_, statistics_, BLOB_DB_PREV_MICROS);
|
||||
RecordTick(statistics_, BLOB_DB_NUM_PREV);
|
||||
iter_->Prev();
|
||||
UpdateBlobValue();
|
||||
}
|
||||
@ -96,6 +114,8 @@ class BlobDBIterator : public Iterator {
|
||||
std::unique_ptr<ManagedSnapshot> snapshot_;
|
||||
std::unique_ptr<ArenaWrappedDBIter> iter_;
|
||||
BlobDBImpl* blob_db_;
|
||||
Env* env_;
|
||||
Statistics* statistics_;
|
||||
Status status_;
|
||||
PinnableSlice value_;
|
||||
};
|
||||
|
@ -47,10 +47,23 @@ class BlobDBTest : public testing::Test {
|
||||
|
||||
~BlobDBTest() { Destroy(); }
|
||||
|
||||
void Open(BlobDBOptions bdb_options = BlobDBOptions(),
|
||||
Status TryOpen(BlobDBOptions bdb_options = BlobDBOptions(),
|
||||
Options options = Options()) {
|
||||
options.create_if_missing = true;
|
||||
ASSERT_OK(BlobDB::Open(options, bdb_options, dbname_, &blob_db_));
|
||||
return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
|
||||
}
|
||||
|
||||
void Open(BlobDBOptions bdb_options = BlobDBOptions(),
|
||||
Options options = Options()) {
|
||||
ASSERT_OK(TryOpen(bdb_options, options));
|
||||
}
|
||||
|
||||
void Reopen(BlobDBOptions bdb_options = BlobDBOptions(),
|
||||
Options options = Options()) {
|
||||
assert(blob_db_ != nullptr);
|
||||
delete blob_db_;
|
||||
blob_db_ = nullptr;
|
||||
Open(bdb_options, options);
|
||||
}
|
||||
|
||||
void Destroy() {
|
||||
@ -63,6 +76,26 @@ class BlobDBTest : public testing::Test {
|
||||
}
|
||||
}
|
||||
|
||||
BlobDBImpl *blob_db_impl() {
|
||||
return reinterpret_cast<BlobDBImpl *>(blob_db_);
|
||||
}
|
||||
|
||||
Status Put(const Slice &key, const Slice &value) {
|
||||
return blob_db_->Put(WriteOptions(), key, value);
|
||||
}
|
||||
|
||||
void Delete(const std::string &key,
|
||||
std::map<std::string, std::string> *data = nullptr) {
|
||||
ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
|
||||
if (data != nullptr) {
|
||||
data->erase(key);
|
||||
}
|
||||
}
|
||||
|
||||
Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
|
||||
return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
|
||||
}
|
||||
|
||||
void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
|
||||
std::map<std::string, std::string> *data = nullptr) {
|
||||
int len = rnd->Next() % kMaxBlobSize + 1;
|
||||
@ -111,20 +144,24 @@ class BlobDBTest : public testing::Test {
|
||||
}
|
||||
}
|
||||
|
||||
void Delete(const std::string &key,
|
||||
std::map<std::string, std::string> *data = nullptr) {
|
||||
ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
|
||||
if (data != nullptr) {
|
||||
data->erase(key);
|
||||
}
|
||||
}
|
||||
|
||||
// Verify blob db contain expected data and nothing more.
|
||||
void VerifyDB(const std::map<std::string, std::string> &data) {
|
||||
VerifyDB(blob_db_, data);
|
||||
}
|
||||
|
||||
void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
|
||||
// Verify normal Get
|
||||
auto* cfh = db->DefaultColumnFamily();
|
||||
for (auto &p : data) {
|
||||
PinnableSlice value_slice;
|
||||
ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
|
||||
ASSERT_EQ(p.second, value_slice.ToString());
|
||||
std::string value;
|
||||
ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value));
|
||||
ASSERT_EQ(p.second, value);
|
||||
}
|
||||
|
||||
// Verify iterators
|
||||
Iterator *iter = db->NewIterator(ReadOptions());
|
||||
iter->SeekToFirst();
|
||||
for (auto &p : data) {
|
||||
@ -223,8 +260,8 @@ TEST_F(BlobDBTest, PutWithTTL) {
|
||||
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_relocate);
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -253,8 +290,8 @@ TEST_F(BlobDBTest, PutUntil) {
|
||||
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_relocate);
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -286,8 +323,8 @@ TEST_F(BlobDBTest, TTLExtrator_NoTTL) {
|
||||
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(0, gc_stats.num_deletes);
|
||||
ASSERT_EQ(100, gc_stats.num_relocate);
|
||||
ASSERT_EQ(0, gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(100, gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -333,8 +370,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractTTL) {
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_relocate);
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -381,8 +418,8 @@ TEST_F(BlobDBTest, TTLExtractor_ExtractExpiration) {
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
auto &data = static_cast<TestTTLExtractor *>(ttl_extractor_.get())->data;
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_relocate);
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -438,8 +475,8 @@ TEST_F(BlobDBTest, TTLExtractor_ChangeValue) {
|
||||
ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(bdb_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_deletes);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_relocate);
|
||||
ASSERT_EQ(100 - data.size(), gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(data.size(), gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -556,6 +593,24 @@ TEST_F(BlobDBTest, Compression) {
|
||||
}
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, DecompressAfterReopen) {
|
||||
Random rnd(301);
|
||||
BlobDBOptions bdb_options;
|
||||
bdb_options.min_blob_size = 0;
|
||||
bdb_options.disable_background_tasks = true;
|
||||
bdb_options.compression = CompressionType::kSnappyCompression;
|
||||
Open(bdb_options);
|
||||
std::map<std::string, std::string> data;
|
||||
for (size_t i = 0; i < 100; i++) {
|
||||
PutRandom("put-key" + ToString(i), &rnd, &data);
|
||||
}
|
||||
VerifyDB(data);
|
||||
bdb_options.compression = CompressionType::kNoCompression;
|
||||
Reopen(bdb_options);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
TEST_F(BlobDBTest, MultipleWriters) {
|
||||
@ -593,16 +648,14 @@ TEST_F(BlobDBTest, GCAfterOverwriteKeys) {
|
||||
bdb_options.min_blob_size = 0;
|
||||
bdb_options.disable_background_tasks = true;
|
||||
Open(bdb_options);
|
||||
BlobDBImpl *blob_db_impl =
|
||||
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
|
||||
DBImpl *db_impl = static_cast_with_check<DBImpl, DB>(blob_db_->GetBaseDB());
|
||||
std::map<std::string, std::string> data;
|
||||
for (int i = 0; i < 200; i++) {
|
||||
PutRandom("key" + ToString(i), &rnd, &data);
|
||||
}
|
||||
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
|
||||
// Test for data in SST
|
||||
size_t new_keys = 0;
|
||||
for (int i = 0; i < 100; i++) {
|
||||
@ -620,10 +673,10 @@ TEST_F(BlobDBTest, GCAfterOverwriteKeys) {
|
||||
}
|
||||
}
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(200, gc_stats.blob_count);
|
||||
ASSERT_EQ(0, gc_stats.num_deletes);
|
||||
ASSERT_EQ(200 - new_keys, gc_stats.num_relocate);
|
||||
ASSERT_EQ(0, gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(200 - new_keys, gc_stats.num_keys_relocated);
|
||||
VerifyDB(data);
|
||||
}
|
||||
|
||||
@ -634,16 +687,14 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
|
||||
bdb_options.disable_background_tasks = true;
|
||||
Open(bdb_options);
|
||||
ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v1"));
|
||||
BlobDBImpl *blob_db_impl =
|
||||
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
|
||||
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
|
||||
|
||||
SyncPoint::GetInstance()->LoadDependency(
|
||||
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
|
||||
"BlobDBImpl::PutBlobValue:Start"},
|
||||
{"BlobDBImpl::PutBlobValue:Finish",
|
||||
"BlobDBImpl::PutUntil:Start"},
|
||||
{"BlobDBImpl::PutUntil:Finish",
|
||||
"BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"}});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
@ -651,12 +702,11 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
|
||||
[this]() { ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "v2")); });
|
||||
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(1, gc_stats.blob_count);
|
||||
ASSERT_EQ(0, gc_stats.num_deletes);
|
||||
ASSERT_EQ(1, gc_stats.num_relocate);
|
||||
ASSERT_EQ(0, gc_stats.relocate_succeeded);
|
||||
ASSERT_EQ(1, gc_stats.overwritten_while_relocate);
|
||||
ASSERT_EQ(0, gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(1, gc_stats.num_keys_overwritten);
|
||||
ASSERT_EQ(0, gc_stats.num_keys_relocated);
|
||||
writer.join();
|
||||
VerifyDB({{"foo", "v2"}});
|
||||
}
|
||||
@ -671,17 +721,15 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
|
||||
Open(bdb_options, options);
|
||||
mock_env_->set_current_time(100);
|
||||
ASSERT_OK(blob_db_->PutUntil(WriteOptions(), "foo", "v1", 200));
|
||||
BlobDBImpl *blob_db_impl =
|
||||
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
|
||||
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
|
||||
mock_env_->set_current_time(300);
|
||||
|
||||
SyncPoint::GetInstance()->LoadDependency(
|
||||
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
|
||||
"BlobDBImpl::PutBlobValue:Start"},
|
||||
{"BlobDBImpl::PutBlobValue:Finish",
|
||||
"BlobDBImpl::PutUntil:Start"},
|
||||
{"BlobDBImpl::PutUntil:Finish",
|
||||
"BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"}});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
||||
@ -690,22 +738,23 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
|
||||
});
|
||||
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(blob_files[0], &gc_stats));
|
||||
ASSERT_EQ(1, gc_stats.blob_count);
|
||||
ASSERT_EQ(1, gc_stats.num_deletes);
|
||||
ASSERT_EQ(0, gc_stats.delete_succeeded);
|
||||
ASSERT_EQ(1, gc_stats.overwritten_while_delete);
|
||||
ASSERT_EQ(0, gc_stats.num_relocate);
|
||||
ASSERT_EQ(1, gc_stats.num_keys_expired);
|
||||
ASSERT_EQ(0, gc_stats.num_keys_relocated);
|
||||
writer.join();
|
||||
VerifyDB({{"foo", "v2"}});
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
|
||||
// This test is no longer valid since we now return an error when we go
|
||||
// over the configured blob_dir_size.
|
||||
// The test needs to be re-written later in such a way that writes continue
|
||||
// after a GC happens.
|
||||
TEST_F(BlobDBTest, DISABLED_GCOldestSimpleBlobFileWhenOutOfSpace) {
|
||||
// Use mock env to stop wall clock.
|
||||
Options options;
|
||||
options.env = mock_env_.get();
|
||||
BlobDBOptions bdb_options;
|
||||
bdb_options.is_fifo = true;
|
||||
bdb_options.blob_dir_size = 100;
|
||||
bdb_options.blob_file_size = 100;
|
||||
bdb_options.min_blob_size = 0;
|
||||
@ -716,9 +765,7 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ASSERT_OK(blob_db_->Put(WriteOptions(), "key" + ToString(i), value));
|
||||
}
|
||||
BlobDBImpl *blob_db_impl =
|
||||
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
|
||||
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(11, blob_files.size());
|
||||
ASSERT_TRUE(blob_files[0]->HasTTL());
|
||||
ASSERT_TRUE(blob_files[0]->Immutable());
|
||||
@ -728,9 +775,9 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
|
||||
ASSERT_TRUE(blob_files[i]->Immutable());
|
||||
}
|
||||
}
|
||||
blob_db_impl->TEST_RunGC();
|
||||
blob_db_impl()->TEST_RunGC();
|
||||
// The oldest simple blob file (i.e. blob_files[1]) has been selected for GC.
|
||||
auto obsolete_files = blob_db_impl->TEST_GetObsoleteFiles();
|
||||
auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
|
||||
ASSERT_EQ(1, obsolete_files.size());
|
||||
ASSERT_EQ(blob_files[1]->BlobFileNumber(),
|
||||
obsolete_files[0]->BlobFileNumber());
|
||||
@ -744,13 +791,11 @@ TEST_F(BlobDBTest, ReadWhileGC) {
|
||||
bdb_options.disable_background_tasks = true;
|
||||
Open(bdb_options);
|
||||
blob_db_->Put(WriteOptions(), "foo", "bar");
|
||||
BlobDBImpl *blob_db_impl =
|
||||
static_cast_with_check<BlobDBImpl, BlobDB>(blob_db_);
|
||||
auto blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
std::shared_ptr<BlobFile> bfile = blob_files[0];
|
||||
uint64_t bfile_number = bfile->BlobFileNumber();
|
||||
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(bfile));
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
|
||||
|
||||
switch (i) {
|
||||
case 0:
|
||||
@ -788,17 +833,15 @@ TEST_F(BlobDBTest, ReadWhileGC) {
|
||||
|
||||
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:1");
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(blob_db_impl->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
|
||||
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
|
||||
ASSERT_EQ(1, gc_stats.blob_count);
|
||||
ASSERT_EQ(1, gc_stats.num_relocate);
|
||||
ASSERT_EQ(1, gc_stats.relocate_succeeded);
|
||||
blob_db_impl->TEST_ObsoleteFile(blob_files[0]);
|
||||
blob_db_impl->TEST_DeleteObsoleteFiles();
|
||||
ASSERT_EQ(1, gc_stats.num_keys_relocated);
|
||||
blob_db_impl()->TEST_DeleteObsoleteFiles();
|
||||
// The file shouln't be deleted
|
||||
blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(2, blob_files.size());
|
||||
ASSERT_EQ(bfile_number, blob_files[0]->BlobFileNumber());
|
||||
auto obsolete_files = blob_db_impl->TEST_GetObsoleteFiles();
|
||||
auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
|
||||
ASSERT_EQ(1, obsolete_files.size());
|
||||
ASSERT_EQ(bfile_number, obsolete_files[0]->BlobFileNumber());
|
||||
TEST_SYNC_POINT("BlobDBTest::ReadWhileGC:2");
|
||||
@ -806,16 +849,86 @@ TEST_F(BlobDBTest, ReadWhileGC) {
|
||||
SyncPoint::GetInstance()->DisableProcessing();
|
||||
|
||||
// The file is deleted this time
|
||||
blob_db_impl->TEST_DeleteObsoleteFiles();
|
||||
blob_files = blob_db_impl->TEST_GetBlobFiles();
|
||||
blob_db_impl()->TEST_DeleteObsoleteFiles();
|
||||
blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
ASSERT_NE(bfile_number, blob_files[0]->BlobFileNumber());
|
||||
ASSERT_EQ(0, blob_db_impl->TEST_GetObsoleteFiles().size());
|
||||
ASSERT_EQ(0, blob_db_impl()->TEST_GetObsoleteFiles().size());
|
||||
VerifyDB({{"foo", "bar"}});
|
||||
Destroy();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
|
||||
BlobDBOptions bdb_options;
|
||||
bdb_options.min_blob_size = 0;
|
||||
bdb_options.disable_background_tasks = true;
|
||||
// i = when to take snapshot
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (bool delete_key : {true, false}) {
|
||||
const Snapshot *snapshot = nullptr;
|
||||
Destroy();
|
||||
Open(bdb_options);
|
||||
// First file
|
||||
ASSERT_OK(Put("key1", "value"));
|
||||
if (i == 0) {
|
||||
snapshot = blob_db_->GetSnapshot();
|
||||
}
|
||||
auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
|
||||
// Second file
|
||||
ASSERT_OK(Put("key2", "value"));
|
||||
if (i == 1) {
|
||||
snapshot = blob_db_->GetSnapshot();
|
||||
}
|
||||
blob_files = blob_db_impl()->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(2, blob_files.size());
|
||||
auto bfile = blob_files[1];
|
||||
ASSERT_FALSE(bfile->Immutable());
|
||||
ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
|
||||
// Third file
|
||||
ASSERT_OK(Put("key3", "value"));
|
||||
if (i == 2) {
|
||||
snapshot = blob_db_->GetSnapshot();
|
||||
}
|
||||
if (delete_key) {
|
||||
Delete("key2");
|
||||
}
|
||||
GCStats gc_stats;
|
||||
ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats));
|
||||
ASSERT_TRUE(bfile->Obsolete());
|
||||
ASSERT_EQ(1, gc_stats.blob_count);
|
||||
if (delete_key) {
|
||||
ASSERT_EQ(0, gc_stats.num_keys_relocated);
|
||||
ASSERT_EQ(bfile->GetSequenceRange().second + 1,
|
||||
bfile->GetObsoleteSequence());
|
||||
} else {
|
||||
ASSERT_EQ(1, gc_stats.num_keys_relocated);
|
||||
ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
|
||||
bfile->GetObsoleteSequence());
|
||||
}
|
||||
if (i == 3) {
|
||||
snapshot = blob_db_->GetSnapshot();
|
||||
}
|
||||
size_t num_files = delete_key ? 3 : 4;
|
||||
ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size());
|
||||
blob_db_impl()->TEST_DeleteObsoleteFiles();
|
||||
if (i == 0 || i == 3 || (i == 2 && delete_key)) {
|
||||
// The snapshot shouldn't see data in bfile
|
||||
ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size());
|
||||
blob_db_->ReleaseSnapshot(snapshot);
|
||||
} else {
|
||||
// The snapshot will see data in bfile, so the file shouldn't be deleted
|
||||
ASSERT_EQ(num_files, blob_db_impl()->TEST_GetBlobFiles().size());
|
||||
blob_db_->ReleaseSnapshot(snapshot);
|
||||
blob_db_impl()->TEST_DeleteObsoleteFiles();
|
||||
ASSERT_EQ(num_files - 1, blob_db_impl()->TEST_GetBlobFiles().size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
|
||||
Options options;
|
||||
options.env = mock_env_.get();
|
||||
@ -949,6 +1062,41 @@ TEST_F(BlobDBTest, OutOfSpace) {
|
||||
ASSERT_TRUE(s.IsNoSpace());
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, EvictOldestFileWhenCloseToSpaceLimit) {
|
||||
// Use mock env to stop wall clock.
|
||||
Options options;
|
||||
BlobDBOptions bdb_options;
|
||||
bdb_options.blob_dir_size = 270;
|
||||
bdb_options.blob_file_size = 100;
|
||||
bdb_options.disable_background_tasks = true;
|
||||
bdb_options.is_fifo = true;
|
||||
Open(bdb_options);
|
||||
|
||||
// Each stored blob has an overhead of 32 bytes currently.
|
||||
// So a 100 byte blob should take up 132 bytes.
|
||||
std::string value(100, 'v');
|
||||
ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
|
||||
|
||||
auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
|
||||
auto blob_files = bdb_impl->TEST_GetBlobFiles();
|
||||
ASSERT_EQ(1, blob_files.size());
|
||||
|
||||
// Adding another 100 byte blob would take the total size to 264 bytes
|
||||
// (2*132), which is more than 90% of blob_dir_size. So, the oldest file
|
||||
// should be evicted and put in obsolete files list.
|
||||
ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
|
||||
|
||||
auto obsolete_files = bdb_impl->TEST_GetObsoleteFiles();
|
||||
ASSERT_EQ(1, obsolete_files.size());
|
||||
ASSERT_TRUE(obsolete_files[0]->Immutable());
|
||||
ASSERT_EQ(blob_files[0]->BlobFileNumber(),
|
||||
obsolete_files[0]->BlobFileNumber());
|
||||
|
||||
bdb_impl->TEST_DeleteObsoleteFiles();
|
||||
obsolete_files = bdb_impl->TEST_GetObsoleteFiles();
|
||||
ASSERT_TRUE(obsolete_files.empty());
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, InlineSmallValues) {
|
||||
constexpr uint64_t kMaxExpiration = 1000;
|
||||
Random rnd(301);
|
||||
@ -1018,6 +1166,95 @@ TEST_F(BlobDBTest, InlineSmallValues) {
|
||||
ASSERT_EQ(last_ttl_seq, ttl_file->GetSequenceRange().second);
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, CompactionFilterNotSupported) {
|
||||
class TestCompactionFilter : public CompactionFilter {
|
||||
virtual const char *Name() const { return "TestCompactionFilter"; }
|
||||
};
|
||||
class TestCompactionFilterFactory : public CompactionFilterFactory {
|
||||
virtual const char *Name() const { return "TestCompactionFilterFactory"; }
|
||||
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
|
||||
const CompactionFilter::Context & /*context*/) {
|
||||
return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
|
||||
}
|
||||
};
|
||||
for (int i = 0; i < 2; i++) {
|
||||
Options options;
|
||||
if (i == 0) {
|
||||
options.compaction_filter = new TestCompactionFilter();
|
||||
} else {
|
||||
options.compaction_filter_factory.reset(
|
||||
new TestCompactionFilterFactory());
|
||||
}
|
||||
ASSERT_TRUE(TryOpen(BlobDBOptions(), options).IsNotSupported());
|
||||
delete options.compaction_filter;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
|
||||
constexpr size_t kNumKeys = 100;
|
||||
constexpr size_t kNumPuts = 1000;
|
||||
constexpr uint64_t kMaxExpiration = 1000;
|
||||
constexpr uint64_t kCompactTime = 500;
|
||||
constexpr uint64_t kMinBlobSize = 100;
|
||||
Random rnd(301);
|
||||
mock_env_->set_current_time(0);
|
||||
BlobDBOptions bdb_options;
|
||||
bdb_options.min_blob_size = kMinBlobSize;
|
||||
bdb_options.disable_background_tasks = true;
|
||||
Options options;
|
||||
options.env = mock_env_.get();
|
||||
Open(bdb_options, options);
|
||||
|
||||
std::map<std::string, std::string> data;
|
||||
std::map<std::string, std::string> data_after_compact;
|
||||
for (size_t i = 0; i < kNumPuts; i++) {
|
||||
bool is_small_value = rnd.Next() % 2;
|
||||
bool has_ttl = rnd.Next() % 2;
|
||||
uint64_t expiration = rnd.Next() % kMaxExpiration;
|
||||
int len = is_small_value ? 10 : 200;
|
||||
std::string key = "key" + ToString(rnd.Next() % kNumKeys);
|
||||
std::string value = test::RandomHumanReadableString(&rnd, len);
|
||||
if (!has_ttl) {
|
||||
if (is_small_value) {
|
||||
std::string blob_entry;
|
||||
BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
|
||||
// Fake blob index with TTL. See what it will do.
|
||||
ASSERT_GT(kMinBlobSize, blob_entry.size());
|
||||
value = blob_entry;
|
||||
}
|
||||
ASSERT_OK(Put(key, value));
|
||||
data_after_compact[key] = value;
|
||||
} else {
|
||||
ASSERT_OK(PutUntil(key, value, expiration));
|
||||
if (expiration <= kCompactTime) {
|
||||
data_after_compact.erase(key);
|
||||
} else {
|
||||
data_after_compact[key] = value;
|
||||
}
|
||||
}
|
||||
data[key] = value;
|
||||
}
|
||||
VerifyDB(data);
|
||||
|
||||
mock_env_->set_current_time(kCompactTime);
|
||||
// Take a snapshot before compaction. Make sure expired blob indexes is
|
||||
// filtered regardless of snapshot.
|
||||
const Snapshot *snapshot = blob_db_->GetSnapshot();
|
||||
// Issue manual compaction to trigger compaction filter.
|
||||
ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(),
|
||||
blob_db_->DefaultColumnFamily(), nullptr,
|
||||
nullptr));
|
||||
blob_db_->ReleaseSnapshot(snapshot);
|
||||
// Verify expired blob index are filtered.
|
||||
std::vector<KeyVersion> versions;
|
||||
GetAllKeyVersions(blob_db_, "", "", &versions);
|
||||
ASSERT_EQ(data_after_compact.size(), versions.size());
|
||||
for (auto &version : versions) {
|
||||
ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
|
||||
}
|
||||
VerifyDB(data_after_compact);
|
||||
}
|
||||
|
||||
} // namespace blob_db
|
||||
} // namespace rocksdb
|
||||
|
||||
|
@ -30,13 +30,14 @@ BlobFile::BlobFile()
: parent_(nullptr),
file_number_(0),
has_ttl_(false),
compression_(kNoCompression),
blob_count_(0),
gc_epoch_(-1),
file_size_(0),
deleted_count_(0),
deleted_size_(0),
closed_(false),
can_be_deleted_(false),
obsolete_(false),
gc_once_after_open_(false),
expiration_range_({0, 0}),
sequence_range_({kMaxSequenceNumber, 0}),
@ -49,13 +50,14 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
path_to_dir_(bdir),
file_number_(fn),
has_ttl_(false),
compression_(kNoCompression),
blob_count_(0),
gc_epoch_(-1),
file_size_(0),
deleted_count_(0),
deleted_size_(0),
closed_(false),
can_be_deleted_(false),
obsolete_(false),
gc_once_after_open_(false),
expiration_range_({0, 0}),
sequence_range_({kMaxSequenceNumber, 0}),
@ -64,7 +66,7 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
header_valid_(false) {}

BlobFile::~BlobFile() {
if (can_be_deleted_) {
if (obsolete_) {
std::string pn(PathName());
Status s = Env::Default()->DeleteFile(PathName());
if (!s.ok()) {
@ -98,8 +100,8 @@ std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
std::unique_ptr<SequentialFileReader> sfile_reader;
sfile_reader.reset(new SequentialFileReader(std::move(sfile)));

std::shared_ptr<Reader> log_reader =
std::make_shared<Reader>(db_options.info_log, std::move(sfile_reader));
std::shared_ptr<Reader> log_reader = std::make_shared<Reader>(
std::move(sfile_reader), db_options.env, db_options.statistics.get());

return log_reader;
}
@ -110,17 +112,21 @@ std::string BlobFile::DumpState() const {
"path: %s fn: %" PRIu64 " blob_count: %" PRIu64 " gc_epoch: %" PRIu64
" file_size: %" PRIu64 " deleted_count: %" PRIu64
" deleted_size: %" PRIu64
" closed: %d can_be_deleted: %d expiration_range: (%" PRIu64
", %" PRIu64 ") sequence_range: (%" PRIu64 " %" PRIu64
"), writer: %d reader: %d",
" closed: %d obsolete: %d expiration_range: (%" PRIu64 ", %" PRIu64
") sequence_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
path_to_dir_.c_str(), file_number_, blob_count_.load(),
gc_epoch_.load(), file_size_.load(), deleted_count_, deleted_size_,
closed_.load(), can_be_deleted_.load(), expiration_range_.first,
closed_.load(), obsolete_.load(), expiration_range_.first,
expiration_range_.second, sequence_range_.first,
sequence_range_.second, (!!log_writer_), (!!ra_file_reader_));
return str;
}

void BlobFile::MarkObsolete(SequenceNumber sequence) {
obsolete_sequence_ = sequence;
obsolete_.store(true);
}

bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
assert(last_fsync_ <= file_size_);
return (hard) ? file_size_ > last_fsync_

@ -41,6 +41,9 @@ class BlobFile {
// have TTL.
bool has_ttl_;

// Compression type of blobs in the file
CompressionType compression_;

// number of blobs in the file
std::atomic<uint64_t> blob_count_;

@ -63,8 +66,12 @@ class BlobFile {
std::atomic<bool> closed_;

// has a pass of garbage collection successfully finished on this file
// can_be_deleted_ still needs to do iterator/snapshot checks
std::atomic<bool> can_be_deleted_;
// obsolete_ still needs to do iterator/snapshot checks
std::atomic<bool> obsolete_;

// The last sequence number by the time the file is marked as obsolete.
// Data in this file is visible to a snapshot taken before the sequence.
SequenceNumber obsolete_sequence_;

// should this file be gc'd once to reconcile lost deletes/compactions
std::atomic<bool> gc_once_after_open_;
@ -91,6 +98,8 @@ class BlobFile {

bool header_valid_;

SequenceNumber garbage_collection_finish_sequence_;

public:
BlobFile();

@ -117,7 +126,19 @@ class BlobFile {
std::string DumpState() const;

// if the file has gone through GC and blobs have been relocated
bool Obsolete() const { return can_be_deleted_.load(); }
bool Obsolete() const {
assert(Immutable() || !obsolete_.load());
return obsolete_.load();
}

// Mark file as obsolete by garbage collection. The file is not visible to
// snapshots with sequence greater or equal to the given sequence.
void MarkObsolete(SequenceNumber sequence);

SequenceNumber GetObsoleteSequence() const {
assert(Obsolete());
return obsolete_sequence_;
}

// if the file is not taking any more appends.
bool Immutable() const { return closed_.load(); }
@ -125,6 +146,8 @@ class BlobFile {
// we will assume this is atomic
bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;

void Fsync();

uint64_t GetFileSize() const {
return file_size_.load(std::memory_order_acquire);
}
@ -153,9 +176,13 @@ class BlobFile {

void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }

std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
CompressionType compression() const { return compression_; }

void Fsync();
void SetCompression(CompressionType c) {
compression_ = c;
}

std::shared_ptr<Writer> GetWriter() const { return log_writer_; }

private:
std::shared_ptr<Reader> OpenSequentialReader(
@ -183,8 +210,6 @@ class BlobFile {
void SetFileSize(uint64_t fs) { file_size_ = fs; }

void SetBlobCount(uint64_t bc) { blob_count_ = bc; }

void SetCanBeDeleted() { can_be_deleted_ = true; }
};
} // namespace blob_db
} // namespace rocksdb

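The new obsolete_ / obsolete_sequence_ pair encodes a visibility rule: a file marked obsolete at sequence S still has to serve snapshots taken before S, and can only be reclaimed once no such snapshot remains. A small illustrative sketch of that check (standalone code, not the BlobDB implementation itself):

#include <cassert>
#include <cstdint>

// Sketch: a blob file marked obsolete at `obsolete_sequence` is still visible
// to a snapshot whose sequence number is smaller than that sequence.
bool VisibleToSnapshot(uint64_t obsolete_sequence, uint64_t snapshot_sequence) {
  return snapshot_sequence < obsolete_sequence;
}

int main() {
  const uint64_t kObsoleteSeq = 100;
  assert(VisibleToSnapshot(kObsoleteSeq, 99));    // older snapshot still sees the file
  assert(!VisibleToSnapshot(kObsoleteSeq, 100));  // snapshot at/after: not visible
  return 0;
}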
@ -111,6 +111,8 @@ struct BlobLogRecord {
std::string key_buf;
std::string value_buf;

uint64_t record_size() const { return kHeaderSize + key_size + value_size; }

void EncodeHeaderTo(std::string* dst);

Status DecodeHeaderFrom(Slice src);

@ -9,22 +9,30 @@

#include <algorithm>

#include "monitoring/statistics.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"

namespace rocksdb {
namespace blob_db {

Reader::Reader(std::shared_ptr<Logger> info_log,
unique_ptr<SequentialFileReader>&& _file)
: info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) {}
Reader::Reader(unique_ptr<SequentialFileReader>&& file_reader, Env* env,
Statistics* statistics)
: file_(std::move(file_reader)),
env_(env),
statistics_(statistics),
buffer_(),
next_byte_(0) {}

Status Reader::ReadSlice(uint64_t size, Slice* slice, std::string* buf) {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
buf->reserve(size);
Status s = file_->Read(size, slice, &(*buf)[0]);
next_byte_ += size;
if (!s.ok()) {
return s;
}
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
if (slice->size() != size) {
return Status::Corruption("EOF reached while reading record");
}

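The reader now wraps each blob file read in a StopWatch and records the bytes read with RecordTick. The same scope-timer-plus-counter pattern can be sketched with standard-library pieces (the Timer and counters below are stand-ins, not RocksDB's Statistics API):

#include <chrono>
#include <cstdint>
#include <cstdio>

// Stand-in for RocksDB's StopWatch: accumulate the scope's elapsed microseconds.
struct ScopedTimer {
  std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
  uint64_t* total_micros;
  explicit ScopedTimer(uint64_t* total) : total_micros(total) {}
  ~ScopedTimer() {
    auto elapsed = std::chrono::steady_clock::now() - start;
    *total_micros +=
        std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
  }
};

uint64_t read_micros = 0;
uint64_t bytes_read = 0;

void InstrumentedRead(const char* buf, uint64_t size) {
  ScopedTimer timer(&read_micros);  // like StopWatch(env_, statistics_, ...)
  (void)buf;                        // the actual read would happen here
  bytes_read += size;               // like RecordTick(statistics_, ..., size)
}

int main() {
  char buf[64] = {};
  InstrumentedRead(buf, sizeof(buf));
  std::printf("bytes=%llu micros=%llu\n",
              (unsigned long long)bytes_read, (unsigned long long)read_micros);
  return 0;
}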
@ -10,7 +10,9 @@
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "utilities/blob_db/blob_log_format.h"

@ -37,17 +39,8 @@ class Reader {

// Create a reader that will return log records from "*file".
// "*file" must remain live while this Reader is in use.
//
// If "reporter" is non-nullptr, it is notified whenever some data is
// dropped due to a detected corruption. "*reporter" must remain
// live while this Reader is in use.
//
// If "checksum" is true, verify checksums if available.
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(std::shared_ptr<Logger> info_log,
std::unique_ptr<SequentialFileReader>&& file);
Reader(std::unique_ptr<SequentialFileReader>&& file_reader, Env* env,
Statistics* statistics);

~Reader() = default;

@ -68,17 +61,14 @@ class Reader {

Status ReadSlice(uint64_t size, Slice* slice, std::string* buf);

SequentialFileReader* file() { return file_.get(); }

void ResetNextByte() { next_byte_ = 0; }

uint64_t GetNextByte() const { return next_byte_; }

const SequentialFileReader* file_reader() const { return file_.get(); }

private:
std::shared_ptr<Logger> info_log_;
const std::unique_ptr<SequentialFileReader> file_;
Env* env_;
Statistics* statistics_;

std::string backing_store_;
Slice buffer_;

@ -8,17 +8,23 @@

#include <cstdint>
#include <string>

#include "monitoring/statistics.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/file_reader_writer.h"
#include "util/stop_watch.h"
#include "utilities/blob_db/blob_log_format.h"

namespace rocksdb {
namespace blob_db {

Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
uint64_t bpsync, bool use_fs, uint64_t boffset)
Writer::Writer(unique_ptr<WritableFileWriter>&& dest, Env* env,
Statistics* statistics, uint64_t log_number, uint64_t bpsync,
bool use_fs, uint64_t boffset)
: dest_(std::move(dest)),
env_(env),
statistics_(statistics),
log_number_(log_number),
block_offset_(boffset),
bytes_per_sync_(bpsync),
@ -26,7 +32,11 @@ Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
use_fsync_(use_fs),
last_elem_type_(kEtNone) {}

void Writer::Sync() { dest_->Sync(use_fsync_); }
void Writer::Sync() {
StopWatch sync_sw(env_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
dest_->Sync(use_fsync_);
RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
}

Status Writer::WriteHeader(BlobLogHeader& header) {
assert(block_offset_ == 0);
@ -40,6 +50,8 @@ Status Writer::WriteHeader(BlobLogHeader& header) {
s = dest_->Flush();
}
last_elem_type_ = kEtFileHdr;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogHeader::kSize);
return s;
}

@ -58,6 +70,8 @@ Status Writer::AppendFooter(BlobLogFooter& footer) {
}

last_elem_type_ = kEtFileFooter;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogFooter::kSize);
return s;
}

@ -98,6 +112,7 @@ void Writer::ConstructBlobHeader(std::string* buf, const Slice& key,
Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
const Slice& key, const Slice& val,
uint64_t* key_offset, uint64_t* blob_offset) {
StopWatch write_sw(env_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
Status s = dest_->Append(Slice(headerbuf));
if (s.ok()) {
s = dest_->Append(key);
@ -113,6 +128,8 @@ Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
*blob_offset = *key_offset + key.size();
block_offset_ = *blob_offset + val.size();
last_elem_type_ = kEtRecord;
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
BlobLogRecord::kHeaderSize + key.size() + val.size());
return s;
}

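EmitPhysicalRecord lays each record out as header, then key, then value, and reports where the key and blob start. A tiny sketch of that offset arithmetic (illustrative names, assuming a fixed header size):

#include <cassert>
#include <cstdint>

// Sketch of the blob log writer's offset bookkeeping for one record:
// [header | key | value] appended at the current block offset.
struct RecordOffsets {
  uint64_t key_offset;
  uint64_t blob_offset;
  uint64_t next_block_offset;
};

RecordOffsets LayoutRecord(uint64_t block_offset, uint64_t header_size,
                           uint64_t key_size, uint64_t value_size) {
  RecordOffsets r;
  r.key_offset = block_offset + header_size;
  r.blob_offset = r.key_offset + key_size;
  r.next_block_offset = r.blob_offset + value_size;
  return r;
}

int main() {
  // e.g. a 32-byte header, 3-byte key, and 10-byte value appended at offset 0
  RecordOffsets r = LayoutRecord(0, 32, 3, 10);
  assert(r.key_offset == 32 && r.blob_offset == 35 && r.next_block_offset == 45);
  return 0;
}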
@ -10,7 +10,9 @@
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "utilities/blob_db/blob_log_format.h"
@ -34,9 +36,9 @@ class Writer {
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
uint64_t log_number, uint64_t bpsync, bool use_fsync,
uint64_t boffset = 0);
Writer(std::unique_ptr<WritableFileWriter>&& dest, Env* env,
Statistics* statistics, uint64_t log_number, uint64_t bpsync,
bool use_fsync, uint64_t boffset = 0);

~Writer() = default;

@ -75,6 +77,8 @@ class Writer {

private:
std::unique_ptr<WritableFileWriter> dest_;
Env* env_;
Statistics* statistics_;
uint64_t log_number_;
uint64_t block_offset_; // Current offset in block
uint64_t bytes_per_sync_;

@ -189,12 +189,11 @@ Status TransactionDB::Open(
std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
std::vector<size_t> compaction_enabled_cf_indices;
DBOptions db_options_2pc = db_options;
if (txn_db_options.write_policy == WRITE_PREPARED) {
db_options_2pc.seq_per_batch = true;
}
PrepareWrap(&db_options_2pc, &column_families_copy,
&compaction_enabled_cf_indices);
s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db);
const bool use_seq_per_batch = txn_db_options.write_policy == WRITE_PREPARED;
s = DBImpl::Open(db_options_2pc, dbname, column_families_copy, handles, &db,
use_seq_per_batch);
if (s.ok()) {
s = WrapDB(db, txn_db_options, compaction_enabled_cf_indices, *handles,
dbptr);

@ -20,6 +20,7 @@
#include <string>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "rocksdb/slice.h"
#include "rocksdb/utilities/transaction_db_mutex.h"
#include "util/cast_util.h"
@ -347,6 +348,8 @@ Status TransactionLockMgr::AcquireWithTimeout(
&expire_time_hint, &wait_ids);

if (!result.ok() && timeout != 0) {
PERF_TIMER_GUARD(key_lock_wait_time);
PERF_COUNTER_ADD(key_lock_wait_count, 1);
// If we weren't able to acquire the lock, we will keep retrying as long
// as the timeout allows.
bool timed_out = false;

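The lock-manager change only starts the key_lock_wait timer and bumps key_lock_wait_count when the first acquisition attempt fails and a non-zero timeout allows a retry, so uncontended locks never touch the counters. A standalone sketch of that gating (simple stand-ins, not the perf-context macros):

#include <cstdint>
#include <cstdio>

uint64_t key_lock_wait_count = 0;

// Sketch: only record a lock wait when the fast path failed and we are
// actually going to block/retry (timeout != 0), mirroring the diff's
// placement of PERF_COUNTER_ADD inside `if (!result.ok() && timeout != 0)`.
bool AcquireLock(bool fast_path_ok, int64_t timeout_ms) {
  if (fast_path_ok) return true;
  if (timeout_ms == 0) return false;  // no waiting allowed, nothing recorded
  ++key_lock_wait_count;              // like PERF_COUNTER_ADD(key_lock_wait_count, 1)
  // ... the retry loop bounded by the timeout would go here ...
  return false;
}

int main() {
  AcquireLock(true, 1000);   // uncontended: not counted
  AcquireLock(false, 0);     // contended but no timeout: not counted
  AcquireLock(false, 1000);  // contended with timeout: counted
  std::printf("key_lock_wait_count=%llu\n",
              (unsigned long long)key_lock_wait_count);
  return 0;
}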
@ -19,6 +19,7 @@
#include "db/db_impl.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"
#include "table/mock_table.h"
@ -187,15 +188,18 @@ TEST_P(TransactionTest, WaitingTxn) {
ASSERT_EQ(cf_id, 0);
});

get_perf_context()->Reset();
// lock key in default cf
s = txn1->GetForUpdate(read_options, "foo", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "bar");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);

// lock key in cfa
s = txn1->GetForUpdate(read_options, cfa, "foo", &value);
ASSERT_OK(s);
ASSERT_EQ(value, "bar");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 0);

auto lock_data = db->GetLockStatusData();
// Locked keys exist in both column families.
@ -231,6 +235,8 @@ TEST_P(TransactionTest, WaitingTxn) {
s = txn2->GetForUpdate(read_options, "foo", &value);
ASSERT_TRUE(s.IsTimedOut());
ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
ASSERT_EQ(get_perf_context()->key_lock_wait_count, 1);
ASSERT_GE(get_perf_context()->key_lock_wait_time, 0);

rocksdb::SyncPoint::GetInstance()->DisableProcessing();
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
@ -4856,12 +4862,12 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
auto seq = db_impl->GetLatestSequenceNumber();
exp_seq = seq;
txn_t0(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4874,16 +4880,16 @@ TEST_P(TransactionTest, SeqAdvanceTest) {

// Doing it twice might detect some bugs
txn_t0(1);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

txn_t1(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4895,12 +4901,12 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
}

txn_t3(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {
@ -4912,16 +4918,16 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
}

txn_t0(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

txn_t2(0);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

if (branch_do(n, &branch)) {
db_impl->Flush(fopt);
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);
}
if (branch_do(n, &branch)) {

@ -54,7 +54,7 @@ class TransactionTest : public ::testing::TestWithParam<
options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
env = new FaultInjectionTestEnv(Env::Default());
options.env = env;
options.concurrent_prepare = std::get<1>(GetParam());
options.two_write_queues = std::get<1>(GetParam());
dbname = test::TmpDir() + "/transaction_testdb";

DestroyDB(dbname, options);
@ -113,11 +113,10 @@ class TransactionTest : public ::testing::TestWithParam<
std::vector<ColumnFamilyHandle*> handles;
DB* root_db;
Options options_copy(options);
if (txn_db_options.write_policy == WRITE_PREPARED) {
options_copy.seq_per_batch = true;
}
Status s =
DB::Open(options_copy, dbname, column_families, &handles, &root_db);
const bool use_seq_per_batch =
txn_db_options.write_policy == WRITE_PREPARED;
Status s = DBImpl::Open(options_copy, dbname, column_families, &handles,
&root_db, use_seq_per_batch);
if (s.ok()) {
assert(handles.size() == 1);
s = TransactionDB::WrapStackableDB(
@ -144,7 +143,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}
@ -169,7 +168,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}
@ -197,7 +196,7 @@ class TransactionTest : public ::testing::TestWithParam<
} else {
// Consume one seq per batch
exp_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// Consume one seq for commit
exp_seq++;
}

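The test helper's exp_seq bookkeeping encodes the seq_per_batch rule used throughout these hunks: each write batch consumes one sequence number, and with two_write_queues the commit marker consumes one more. A small sketch of that arithmetic (illustrative only, not the test code itself):

#include <cassert>
#include <cstdint>

// Sketch: expected sequence advance for committed transactions under
// seq_per_batch, with one extra number per commit marker when the DB
// runs with two write queues.
uint64_t ExpectedSeqAfterCommits(uint64_t start_seq, uint64_t num_txns,
                                 bool two_write_queues) {
  uint64_t per_txn = 1 + (two_write_queues ? 1 : 0);
  return start_seq + num_txns * per_txn;
}

int main() {
  assert(ExpectedSeqAfterCommits(10, 3, false) == 13);  // one seq per batch
  assert(ExpectedSeqAfterCommits(10, 3, true) == 16);   // plus one per commit
  return 0;
}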
@ -625,7 +625,7 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
printf("Tested %" ROCKSDB_PRIszt " cases so far\n", n);
}
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
auto seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
auto seq = db_impl->TEST_GetLastVisibleSequence();
exp_seq = seq;
// This is increased before writing the batch for commit
commit_writes = 0;
@ -693,17 +693,17 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
for (auto& t : threads) {
t.join();
}
if (options.concurrent_prepare) {
if (options.two_write_queues) {
// In this case none of the above scheduling tricks to deterministically
// form merged batches works because the writes go to separate queues.
// This would result in different write groups in each run of the test. We
// still keep the test since although non-deterministic and hard to debug,
// it is still useful to have.
// TODO(myabandeh): Add a deterministic unit test for concurrent_prepare
// TODO(myabandeh): Add a deterministic unit test for two_write_queues
}

// Check if memtable inserts advanced seq number as expected
seq = db_impl->TEST_GetLatestVisibleSequenceNumber();
seq = db_impl->TEST_GetLastVisibleSequence();
ASSERT_EQ(exp_seq, seq);

rocksdb::SyncPoint::GetInstance()->DisableProcessing();
@ -1258,7 +1258,7 @@ TEST_P(WritePreparedTransactionTest, DisableGCDuringRecoveryTest) {
VerifyKeys({{"foo", v}});
seq++; // one for the key/value
KeyVersion kv = {"foo", v, seq, kTypeValue};
if (options.concurrent_prepare) {
if (options.two_write_queues) {
seq++; // one for the commit
}
versions.emplace_back(kv);
@ -1306,10 +1306,10 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepUncommittedKeys) {
auto add_key = [&](std::function<Status()> func) {
ASSERT_OK(func());
expected_seq++;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
snapshots.push_back(db->GetSnapshot());
};

@ -1397,7 +1397,7 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
ASSERT_EQ(++expected_seq, db->GetLatestSequenceNumber());
ASSERT_OK(txn1->Commit());
DBImpl* db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
delete txn1;
// Take a snapshot to avoid keys getting evicted before compaction.
const Snapshot* snapshot1 = db->GetSnapshot();
@ -1410,24 +1410,24 @@ TEST_P(WritePreparedTransactionTest, CompactionShouldKeepSnapshotVisibleKeys) {
// txn2 commits after snapshot2 and is not visible.
const Snapshot* snapshot2 = db->GetSnapshot();
ASSERT_OK(txn2->Commit());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(++expected_seq, db_impl->TEST_GetLastVisibleSequence());
delete txn2;
// Take a snapshot to avoid keys getting evicted before compaction.
const Snapshot* snapshot3 = db->GetSnapshot();
ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_2"));
expected_seq++; // 1 for write
SequenceNumber seq1 = expected_seq;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
ASSERT_OK(db->Put(WriteOptions(), "key2", "value2_2"));
expected_seq++; // 1 for write
SequenceNumber seq2 = expected_seq;
if (options.concurrent_prepare) {
if (options.two_write_queues) {
expected_seq++; // 1 for commit
}
ASSERT_EQ(expected_seq, db_impl->TEST_GetLatestVisibleSequenceNumber());
ASSERT_EQ(expected_seq, db_impl->TEST_GetLastVisibleSequence());
ASSERT_OK(db->Flush(FlushOptions()));
db->ReleaseSnapshot(snapshot1);
db->ReleaseSnapshot(snapshot3);

@ -90,7 +90,7 @@ Status WritePreparedTxn::CommitWithoutPrepareInternal() {
}

SequenceNumber WritePreparedTxn::GetACommitSeqNumber(SequenceNumber prep_seq) {
if (db_impl_->immutable_db_options().concurrent_prepare) {
if (db_impl_->immutable_db_options().two_write_queues) {
return db_impl_->IncAndFetchSequenceNumber();
} else {
return prep_seq;

@ -46,7 +46,7 @@ class WritePreparedTxn : public PessimisticTransaction {
virtual ~WritePreparedTxn() {}

// To make WAL commit markers visible, the snapshot will be based on the last
// seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq in the
// seq in the WAL, LastAllocatedSequence, as opposed to the last seq in the
// memtable.
using Transaction::Get;
virtual Status Get(const ReadOptions& options,
@ -54,7 +54,7 @@ class WritePreparedTxn : public PessimisticTransaction {
PinnableSlice* value) override;

// To make WAL commit markers visible, the snapshot will be based on the last
// seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq in the
// seq in the WAL, LastAllocatedSequence, as opposed to the last seq in the
// memtable.
using Transaction::GetIterator;
virtual Iterator* GetIterator(const ReadOptions& options) override;
@ -76,7 +76,7 @@ class WritePreparedTxn : public PessimisticTransaction {
// commit entails writing only a commit marker in the WAL. The sequence number
// of the commit marker is then the commit timestamp of the transaction. To
// make the commit timestamp visible to readers, their snapshot is based on
// the last seq in the WAL, LastToBeWrittenSquence, as opposed to the last seq
// the last seq in the WAL, LastAllocatedSequence, as opposed to the last seq
// in the memtable.
Status CommitInternal() override;