Avoid to go through every CF for every ReleaseSnapshot() (#5090)

Summary:
With https://github.com/facebook/rocksdb/pull/3009 we go through every CF
to check whether a bottommost compaction is needed to be triggered. This is done
within DB mutex. What we do within DB mutex may heavily influece the write throughput
we can achieve, so we always want to minimize work there.

Here we try to avoid this for-loop by first check a global threshold. In most of
the time, the CF loop can be avoided.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5090

Differential Revision: D14582684

Pulled By: siying

fbshipit-source-id: 968f6d9bb6affe1a5ebc4910b418300b076f166f
This commit is contained in:
Siying Dong 2019-03-25 19:14:04 -07:00 committed by Facebook Github Bot
parent 52e6404e0f
commit 48e7effa79
5 changed files with 63 additions and 9 deletions

View File

@ -3348,7 +3348,7 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
options.level0_file_num_compaction_trigger = kNumLevelFiles;
// inflate it a bit to account for key/metadata overhead
options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
Reopen(options);
CreateAndReopenWithCF({"one"}, options);
Random rnd(301);
const Snapshot* snapshot = nullptr;
@ -3379,10 +3379,12 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
// just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
// files does not need to be preserved in case of a future snapshot.
ASSERT_OK(Put(Key(0), "val"));
ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
// release snapshot and wait for compactions to finish. Single-file
// compactions should be triggered, which reduce the size of each bottom-level
// file without changing file count.
db_->ReleaseSnapshot(snapshot);
ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
Compaction* compaction = reinterpret_cast<Compaction*>(arg);

View File

@ -2113,6 +2113,18 @@ SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
return snapshot;
}
namespace {
typedef autovector<ColumnFamilyData*, 2> CfdList;
bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
for (const ColumnFamilyData* t : list) {
if (t == cfd) {
return true;
}
}
return false;
}
} // namespace
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
{
@ -2126,6 +2138,10 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
} else {
oldest_snapshot = snapshots_.oldest()->number_;
}
// Avoid to go through every column family by checking a global threshold
// first.
if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
if (!cfd->current()
@ -2134,8 +2150,24 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
.empty()) {
SchedulePendingCompaction(cfd);
MaybeScheduleFlushOrCompaction();
cf_scheduled.push_back(cfd);
}
}
// Calculate a new threshold, skipping those CFs where compactions are
// scheduled. We do not do the same pass as the previous loop because
// mutex might be unlocked during the loop, making the result inaccurate.
SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (CfdListContains(cf_scheduled, cfd)) {
continue;
}
new_bottommost_files_mark_threshold = std::min(
new_bottommost_files_mark_threshold,
cfd->current()->storage_info()->bottommost_files_mark_threshold());
}
bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
}
}
delete casted_s;
}

View File

@ -865,6 +865,7 @@ class DBImpl : public DB {
friend class CompactedDBImpl;
friend class DBTest_ConcurrentFlushWAL_Test;
friend class DBTest_MixedSlowdownOptionsStop_Test;
friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
#ifndef NDEBUG
friend class DBTest2_ReadCallbackTest_Test;
friend class WriteCallbackTest_WriteWithCallbackTest_Test;
@ -1573,6 +1574,10 @@ class DBImpl : public DB {
// Indicate DB was opened successfully
bool opened_successfully_;
// The min threshold to triggere bottommost compaction for removing
// garbages, among all column families.
SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
LogsWithPrepTracker logs_with_prep_tracker_;
// Callback for compaction to check if a key is visible to a snapshot.

View File

@ -2883,6 +2883,17 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
}
cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
// There may be a small data race here. The snapshot tricking bottommost
// compaction may already be released here. But assuming there will always be
// newer snapshot created and released frequently, the compaction will be
// triggered soon anyway.
bottommost_files_mark_threshold_ = kMaxSequenceNumber;
for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
bottommost_files_mark_threshold_ = std::min(
bottommost_files_mark_threshold_,
my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
}
// Whenever we install new SuperVersion, we might need to issue new flushes or
// compactions.
SchedulePendingCompaction(cfd);

View File

@ -402,6 +402,10 @@ class VersionStorageInfo {
bool force_consistency_checks() const { return force_consistency_checks_; }
SequenceNumber bottommost_files_mark_threshold() const {
return bottommost_files_mark_threshold_;
}
// Returns whether any key in [`smallest_key`, `largest_key`] could appear in
// an older L0 file than `last_l0_idx` or in a greater level than `last_level`
//