From 3e1bf771a3c7324306fc31317c9ea275e1a8dd61 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Mon, 11 Oct 2021 18:00:44 -0700 Subject: [PATCH] Make it possible to force the garbage collection of the oldest blob files (#8994) Summary: The current BlobDB garbage collection logic works by relocating the valid blobs from the oldest blob files as they are encountered during compaction, and cleaning up blob files once they contain nothing but garbage. However, with sufficiently skewed workloads, it is theoretically possible to end up in a situation when few or no compactions get scheduled for the SST files that contain references to the oldest blob files, which can lead to increased space amp due to the lack of GC. In order to efficiently handle such workloads, the patch adds a new BlobDB configuration option called `blob_garbage_collection_force_threshold`, which signals to BlobDB to schedule targeted compactions for the SST files that keep alive the oldest batch of blob files if the overall ratio of garbage in the given blob files meets the threshold *and* all the given blob files are eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example, if the new option is set to 0.9, targeted compactions will get scheduled if the sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the oldest blob files, assuming all affected blob files are below the age-based cutoff.) The net result of these targeted compactions is that the valid blobs in the oldest blob files are relocated and the oldest blob files themselves cleaned up (since *all* SST files that rely on them get compacted away). These targeted compactions are similar to periodic compactions in the sense that they force certain SST files that otherwise would not get picked up to undergo compaction and also in the sense that instead of merging files from multiple levels, they target a single file. (Note: such compactions might still include neighboring files from the same level due to the need of having a "clean cut" boundary but they never include any files from any other level.) This functionality is currently only supported with the leveled compaction style and is inactive by default (since the default value is set to 1.0, i.e. 100%). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994 Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests. Reviewed By: riversand963 Differential Revision: D31489850 Pulled By: ltamasi fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab --- db/c.cc | 9 + db/c_test.c | 7 +- db/column_family.cc | 19 +- db/column_family_test.cc | 24 +++ db/compaction/compaction_job.cc | 2 + db/compaction/compaction_picker_level.cc | 10 ++ db/version_set.cc | 109 ++++++++++++ db/version_set.h | 18 ++ db/version_set_test.cc | 215 +++++++++++++++++++++-- db_stress_tool/db_stress_common.h | 1 + db_stress_tool/db_stress_gflags.cc | 6 + db_stress_tool/db_stress_test_base.cc | 11 +- include/rocksdb/advanced_options.h | 13 ++ include/rocksdb/c.h | 5 + include/rocksdb/listener.h | 2 + options/cf_options.cc | 7 + options/cf_options.h | 4 + options/options.cc | 26 +-- options/options_helper.cc | 2 + options/options_settable_test.cc | 1 + options/options_test.cc | 4 + test_util/testutil.cc | 2 + tools/db_bench_tool.cc | 8 + tools/db_bench_tool_test.cc | 3 +- tools/db_crashtest.py | 1 + 25 files changed, 475 insertions(+), 34 deletions(-) diff --git a/db/c.cc b/db/c.cc index 72274e68f..89dfa5303 100644 --- a/db/c.cc +++ b/db/c.cc @@ -2751,6 +2751,15 @@ double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { return opt->rep.blob_garbage_collection_age_cutoff; } +void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_force_threshold = val; +} + +double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_force_threshold; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } diff --git a/db/c_test.c b/db/c_test.c index bbc4e9db9..fb8f65635 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -1793,8 +1793,11 @@ int main(int argc, char** argv) { rocksdb_options_set_enable_blob_gc(o, 1); CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); - rocksdb_options_set_blob_gc_age_cutoff(o, 0.75); - CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o)); + rocksdb_options_set_blob_gc_age_cutoff(o, 0.5); + CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o)); + + rocksdb_options_set_blob_gc_force_threshold(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o)); // Create a copy that should be equal to the original. rocksdb_options_t* copy; diff --git a/db/column_family.cc b/db/column_family.cc index b1cfa24f0..2aa1fbd6d 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1358,12 +1358,19 @@ Status ColumnFamilyData::ValidateOptions( } } - if (cf_options.enable_blob_garbage_collection && - (cf_options.blob_garbage_collection_age_cutoff < 0.0 || - cf_options.blob_garbage_collection_age_cutoff > 1.0)) { - return Status::InvalidArgument( - "The age cutoff for blob garbage collection should be in the range " - "[0.0, 1.0]."); + if (cf_options.enable_blob_garbage_collection) { + if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + if (cf_options.blob_garbage_collection_force_threshold < 0.0 || + cf_options.blob_garbage_collection_force_threshold > 1.0) { + return Status::InvalidArgument( + "The garbage ratio threshold for forcing blob garbage collection " + "should be in the range [0.0, 1.0]."); + } } if (cf_options.compaction_style == kCompactionStyleFIFO && diff --git a/db/column_family_test.cc b/db/column_family_test.cc index b772fb1ce..cdec7bac9 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -3407,6 +3407,30 @@ TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { .IsInvalidArgument()); } +TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_force_threshold = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_force_threshold = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_force_threshold = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 8b67d3323..d36cf8ab5 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -109,6 +109,8 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) { return "PeriodicCompaction"; case CompactionReason::kChangeTemperature: return "ChangeTemperature"; + case CompactionReason::kForcedBlobGC: + return "ForcedBlobGC"; case CompactionReason::kNumOfReasons: // fall through default: diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 0a70c89c0..52a3d5c35 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -31,6 +31,9 @@ bool LevelCompactionPicker::NeedsCompaction( if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } + if (!vstorage->FilesMarkedForForcedBlobGC().empty()) { + return true; + } for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { if (vstorage->CompactionScore(i) >= 1) { return true; @@ -248,6 +251,13 @@ void LevelCompactionBuilder::SetupInitialFiles() { compaction_reason_ = CompactionReason::kPeriodicCompaction; return; } + + // Forced blob garbage collection + PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kForcedBlobGC; + return; + } } bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { diff --git a/db/version_set.cc b/db/version_set.cc index c8cbe3bb4..e99f44501 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2817,6 +2817,15 @@ void VersionStorageInfo::ComputeCompactionScore( ComputeFilesMarkedForPeriodicCompaction( immutable_options, mutable_cf_options.periodic_compaction_seconds); } + + if (mutable_cf_options.enable_blob_garbage_collection && + mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && + mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold); + } + EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2926,6 +2935,106 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( } } +void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold) { + files_marked_for_forced_blob_gc_.clear(); + + if (blob_files_.empty()) { + return; + } + + // Number of blob files eligible for GC based on age + const size_t cutoff_count = static_cast( + blob_garbage_collection_age_cutoff * blob_files_.size()); + if (!cutoff_count) { + return; + } + + // Compute the sum of total and garbage bytes over the oldest batch of blob + // files. The oldest batch is defined as the set of blob files which are + // kept alive by the same SSTs as the very oldest one. Here is a toy example. + // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11, + // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and + // potentially some higher-numbered ones, while SST 3 relies on blob file 12 + // and potentially some higher-numbered ones. Then, the SST to oldest blob + // file mapping is as follows: + // + // SST file number Oldest blob file number + // 1 10 + // 2 10 + // 3 12 + // + // This is what the same thing looks like from the blob files' POV. (Note that + // the linked SSTs simply denote the inverse mapping of the above.) + // + // Blob file number Linked SST set + // 10 {1, 2} + // 11 {} + // 12 {3} + // 13 {} + // + // Then, the oldest batch of blob files consists of blob files 10 and 11, + // and we can get rid of them by forcing the compaction of SSTs 1 and 2. + // + // Note that the overall ratio of garbage computed for the batch has to exceed + // blob_garbage_collection_force_threshold and the entire batch has to be + // eligible for GC according to blob_garbage_collection_age_cutoff in order + // for us to schedule any compactions. + const auto oldest_it = blob_files_.begin(); + + const auto& oldest_meta = oldest_it->second; + assert(oldest_meta); + + const auto& linked_ssts = oldest_meta->GetLinkedSsts(); + assert(!linked_ssts.empty()); + + size_t count = 1; + uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes(); + uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes(); + + auto it = oldest_it; + for (++it; it != blob_files_.end(); ++it) { + const auto& meta = it->second; + assert(meta); + + if (!meta->GetLinkedSsts().empty()) { + break; + } + + if (++count > cutoff_count) { + return; + } + + sum_total_blob_bytes += meta->GetTotalBlobBytes(); + sum_garbage_blob_bytes += meta->GetGarbageBlobBytes(); + } + + if (sum_garbage_blob_bytes < + blob_garbage_collection_force_threshold * sum_total_blob_bytes) { + return; + } + + for (uint64_t sst_file_number : linked_ssts) { + const FileLocation location = GetFileLocation(sst_file_number); + assert(location.IsValid()); + + const int level = location.GetLevel(); + assert(level >= 0); + + const size_t pos = location.GetPosition(); + + FileMetaData* const sst_meta = files_[level][pos]; + assert(sst_meta); + + if (sst_meta->being_compacted) { + continue; + } + + files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta); + } +} + namespace { // used to sort files by size diff --git a/db/version_set.h b/db/version_set.h index 4abb96f49..d4ce284a5 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -184,6 +184,14 @@ class VersionStorageInfo { // REQUIRES: DB mutex held void ComputeBottommostFilesMarkedForCompaction(); + // This computes files_marked_for_forced_blob_gc_ and is called by + // ComputeCompactionScore() + // + // REQUIRES: DB mutex held + void ComputeFilesMarkedForForcedBlobGC( + double blob_garbage_collection_age_cutoff, + double blob_garbage_collection_force_threshold); + // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and @@ -404,6 +412,14 @@ class VersionStorageInfo { return bottommost_files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& FilesMarkedForForcedBlobGC() + const { + assert(finalized_); + return files_marked_for_forced_blob_gc_; + } + int base_level() const { return base_level_; } double level_multiplier() const { return level_multiplier_; } @@ -586,6 +602,8 @@ class VersionStorageInfo { autovector> bottommost_files_marked_for_compaction_; + autovector> files_marked_for_forced_blob_gc_; + // Threshold for needing to mark another bottommost file. Maintain it so we // can quickly check when releasing a snapshot whether more bottommost files // became eligible for compaction. It's defined as the min of the max nonzero diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 90088100c..7bfea79b6 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -9,6 +9,8 @@ #include "db/version_set.h" +#include + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "rocksdb/convenience.h" @@ -135,31 +137,55 @@ class VersionStorageInfoTestBase : public testing::Test { } void Add(int level, uint32_t file_number, const char* smallest, - const char* largest, uint64_t file_size = 0) { - assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData( - file_number, 0, file_size, GetInternalKey(smallest, 0), - GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, - /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName); - f->compensated_file_size = file_size; - vstorage_.AddFile(level, f); + const char* largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + constexpr SequenceNumber dummy_seq = 0; + + Add(level, file_number, GetInternalKey(smallest, dummy_seq), + GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number); } void Add(int level, uint32_t file_number, const InternalKey& smallest, - const InternalKey& largest, uint64_t file_size = 0) { + const InternalKey& largest, uint64_t file_size = 0, + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } + void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count, + uint64_t total_blob_bytes, + BlobFileMetaData::LinkedSsts linked_ssts, + uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) { + auto shared_meta = SharedBlobFileMetaData::Create( + blob_file_number, total_blob_count, total_blob_bytes, + /* checksum_method */ std::string(), + /* checksum_value */ std::string()); + auto meta = + BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts), + garbage_blob_count, garbage_blob_bytes); + + vstorage_.AddBlobFile(std::move(meta)); + } + + void Finalize() { + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + vstorage_.UpdateFilesByCompactionPri(ioptions_.compaction_pri); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.GenerateLevel0NonOverlapping(); + vstorage_.GenerateBottommostFiles(); + + vstorage_.SetFinalized(); + } + std::string GetOverlappingFiles(int level, const InternalKey& begin, const InternalKey& end) { std::vector inputs; @@ -445,6 +471,171 @@ TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); } +TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { + // No SST or blob files in VersionStorageInfo + Finalize(); + + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.75; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); +} + +TEST_F(VersionStorageInfoTest, ForcedBlobGC) { + // Add three L0 SSTs and four blob files. The first two SSTs keep alive the + // first two blob files, while the third SST keeps alive the third and fourth + // blob files. + + constexpr int level = 0; + + constexpr uint64_t first_sst = 1; + constexpr uint64_t second_sst = 2; + constexpr uint64_t third_sst = 3; + + constexpr uint64_t first_blob = 10; + constexpr uint64_t second_blob = 11; + constexpr uint64_t third_blob = 12; + constexpr uint64_t fourth_blob = 13; + + { + constexpr char smallest[] = "bar1"; + constexpr char largest[] = "foo1"; + constexpr uint64_t file_size = 1000; + + Add(level, first_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar2"; + constexpr char largest[] = "foo2"; + constexpr uint64_t file_size = 2000; + + Add(level, second_sst, smallest, largest, file_size, first_blob); + } + + { + constexpr char smallest[] = "bar3"; + constexpr char largest[] = "foo3"; + constexpr uint64_t file_size = 3000; + + Add(level, third_sst, smallest, largest, file_size, third_blob); + } + + { + constexpr uint64_t total_blob_count = 10; + constexpr uint64_t total_blob_bytes = 100000; + constexpr uint64_t garbage_blob_count = 2; + constexpr uint64_t garbage_blob_bytes = 15000; + + AddBlob(first_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{first_sst, second_sst}, + garbage_blob_count, garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 4; + constexpr uint64_t total_blob_bytes = 400000; + constexpr uint64_t garbage_blob_count = 3; + constexpr uint64_t garbage_blob_bytes = 235000; + + AddBlob(second_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 20; + constexpr uint64_t total_blob_bytes = 1000000; + constexpr uint64_t garbage_blob_count = 8; + constexpr uint64_t garbage_blob_bytes = 123456; + + AddBlob(third_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count, + garbage_blob_bytes); + } + + { + constexpr uint64_t total_blob_count = 128; + constexpr uint64_t total_blob_bytes = 789012345; + constexpr uint64_t garbage_blob_count = 67; + constexpr uint64_t garbage_blob_bytes = 88888888; + + AddBlob(fourth_blob, total_blob_count, total_blob_bytes, + BlobFileMetaData::LinkedSsts{}, garbage_blob_count, + garbage_blob_bytes); + } + + Finalize(); + + assert(vstorage_.num_levels() > 0); + const auto& level_files = vstorage_.LevelFiles(level); + + assert(level_files.size() == 3); + assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst); + assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst); + assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst); + + // No blob files eligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.1; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Part of the oldest batch of blob files (specifically, the second file) is + // ineligible for GC due to the age cutoff + + { + constexpr double age_cutoff = 0.25; + constexpr double force_threshold = 0.0; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff but its overall garbage ratio + // is below threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.6; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); + } + + // Oldest batch is eligible based on age cutoff and its overall garbage ratio + // meets threshold + + { + constexpr double age_cutoff = 0.5; + constexpr double force_threshold = 0.5; + vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + + auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); + ASSERT_EQ(ssts_to_be_compacted.size(), 2); + + std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(), + [](const std::pair& lhs, + const std::pair& rhs) { + assert(lhs.second); + assert(rhs.second); + return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber(); + }); + + const autovector> + expected_ssts_to_be_compacted{{level, level_files[0]}, + {level, level_files[1]}}; + + ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]); + ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]); + } +} + class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase { public: VersionStorageInfoTimestampTest() diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 29a968e6e..87840b0e4 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -252,6 +252,7 @@ DECLARE_uint64(blob_file_size); DECLARE_string(blob_compression_type); DECLARE_bool(enable_blob_garbage_collection); DECLARE_double(blob_garbage_collection_age_cutoff); +DECLARE_double(blob_garbage_collection_force_threshold); DECLARE_int32(approximate_size_one_in); DECLARE_bool(sync_fault_injection); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 15adef915..89ccf5bc4 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -399,6 +399,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff, "[Integrated BlobDB] The cutoff in terms of blob file age for " "garbage collection."); +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 527cd775c..5d4c414fd 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -262,6 +262,8 @@ bool StressTest::BuildOptionsTable() { options_tbl.emplace( "blob_garbage_collection_age_cutoff", std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + options_tbl.emplace("blob_garbage_collection_force_threshold", + std::vector{"0.5", "0.75", "1.0"}); } options_table_ = std::move(options_tbl); @@ -2310,6 +2312,8 @@ void StressTest::Open() { FLAGS_enable_blob_garbage_collection; options_.blob_garbage_collection_age_cutoff = FLAGS_blob_garbage_collection_age_cutoff; + options_.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); @@ -2418,8 +2422,11 @@ void StressTest::Open() { } if (options_.enable_blob_garbage_collection) { - fprintf(stdout, "Integrated BlobDB: blob GC enabled, cutoff %f\n", - options_.blob_garbage_collection_age_cutoff); + fprintf( + stdout, + "Integrated BlobDB: blob GC enabled, cutoff %f, force threshold %f\n", + options_.blob_garbage_collection_age_cutoff, + options_.blob_garbage_collection_force_threshold); } fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 2e6bb7fbc..d810bedf3 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -846,6 +846,19 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API double blob_garbage_collection_age_cutoff = 0.25; + // If the ratio of garbage in the oldest blob files exceeds this threshold, + // targeted compactions are scheduled in order to force garbage collecting + // the blob files in question, assuming they are all eligible based on the + // value of blob_garbage_collection_age_cutoff above. This option is + // currently only supported with leveled compactions. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. + // + // Default: 1.0 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_force_threshold = 1.0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c8ee0c939..730bef9f2 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1117,6 +1117,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_force_threshold( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_force_threshold( + rocksdb_options_t* opt); + /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index c79de186d..6ffb3fe4c 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -144,6 +144,8 @@ enum class CompactionReason : int { kPeriodicCompaction, // Compaction in order to move files to temperature kChangeTemperature, + // Compaction scheduled to force garbage collection of blob files + kForcedBlobGC, // total number of compaction reasons, new reasons must be added above this. kNumOfReasons, }; diff --git a/options/cf_options.cc b/options/cf_options.cc index c7f6538a9..5767e759c 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -425,6 +425,11 @@ static std::unordered_map {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), OptionType::kDouble, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_force_threshold", + {offsetof(struct MutableCFOptions, + blob_garbage_collection_force_threshold), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"sample_for_compression", {offsetof(struct MutableCFOptions, sample_for_compression), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -1042,6 +1047,8 @@ void MutableCFOptions::Dump(Logger* log) const { enable_blob_garbage_collection ? "true" : "false"); ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", blob_garbage_collection_age_cutoff); + ROCKS_LOG_INFO(log, " blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); } MutableCFOptions::MutableCFOptions(const Options& options) diff --git a/options/cf_options.h b/options/cf_options.h index d4e77f04f..d08096da1 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -141,6 +141,8 @@ struct MutableCFOptions { enable_blob_garbage_collection(options.enable_blob_garbage_collection), blob_garbage_collection_age_cutoff( options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), check_flush_compaction_key_order( @@ -187,6 +189,7 @@ struct MutableCFOptions { blob_compression_type(kNoCompression), enable_blob_garbage_collection(false), blob_garbage_collection_age_cutoff(0.0), + blob_garbage_collection_force_threshold(0.0), max_sequential_skip_in_iterations(0), check_flush_compaction_key_order(true), paranoid_file_checks(false), @@ -251,6 +254,7 @@ struct MutableCFOptions { CompressionType blob_compression_type; bool enable_blob_garbage_collection; double blob_garbage_collection_age_cutoff; + double blob_garbage_collection_force_threshold; // Misc options uint64_t max_sequential_skip_in_iterations; diff --git a/options/options.cc b/options/options.cc index 2b3053fe3..9a8eee209 100644 --- a/options/options.cc +++ b/options/options.cc @@ -97,7 +97,9 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) blob_compression_type(options.blob_compression_type), enable_blob_garbage_collection(options.enable_blob_garbage_collection), blob_garbage_collection_age_cutoff( - options.blob_garbage_collection_age_cutoff) { + options.blob_garbage_collection_age_cutoff), + blob_garbage_collection_force_threshold( + options.blob_garbage_collection_force_threshold) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -387,20 +389,22 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); - ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", + ROCKS_LOG_HEADER(log, " Options.enable_blob_files: %s", enable_blob_files ? "true" : "false"); - ROCKS_LOG_HEADER(log, - " Options.min_blob_size: %" PRIu64, - min_blob_size); - ROCKS_LOG_HEADER(log, - " Options.blob_file_size: %" PRIu64, - blob_file_size); - ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", + ROCKS_LOG_HEADER( + log, " Options.min_blob_size: %" PRIu64, + min_blob_size); + ROCKS_LOG_HEADER( + log, " Options.blob_file_size: %" PRIu64, + blob_file_size); + ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); - ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", enable_blob_garbage_collection ? "true" : "false"); - ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", blob_garbage_collection_age_cutoff); + ROCKS_LOG_HEADER(log, "Options.blob_garbage_collection_force_threshold: %f", + blob_garbage_collection_force_threshold); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index 668a6caa1..02d0171a5 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -250,6 +250,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.enable_blob_garbage_collection; cf_opts->blob_garbage_collection_age_cutoff = moptions.blob_garbage_collection_age_cutoff; + cf_opts->blob_garbage_collection_force_threshold = + moptions.blob_garbage_collection_force_threshold; // Misc options cf_opts->max_sequential_skip_in_iterations = diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index f79021374..613fd8400 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -515,6 +515,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "blob_compression_type=kBZip2Compression;" "enable_blob_garbage_collection=true;" "blob_garbage_collection_age_cutoff=0.5;" + "blob_garbage_collection_force_threshold=0.75;" "compaction_options_fifo={max_table_files_size=3;allow_" "compaction=false;age_for_warm=1;};", new_options)); diff --git a/options/options_test.cc b/options/options_test.cc index 2e3130284..52b5ac7e1 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -108,6 +108,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"blob_compression_type", "kZSTD"}, {"enable_blob_garbage_collection", "true"}, {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, }; std::unordered_map db_options_map = { @@ -239,6 +240,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, @@ -2264,6 +2266,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"blob_compression_type", "kZSTD"}, {"enable_blob_garbage_collection", "true"}, {"blob_garbage_collection_age_cutoff", "0.5"}, + {"blob_garbage_collection_force_threshold", "0.75"}, }; std::unordered_map db_options_map = { @@ -2387,6 +2390,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_force_threshold, 0.75); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap( diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 0950326d6..9c72972ee 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -395,6 +395,8 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; + cf_opt->blob_garbage_collection_force_threshold = + rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index e8ed9bff1..67060e240 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -979,6 +979,12 @@ DEFINE_double(blob_garbage_collection_age_cutoff, "[Integrated BlobDB] The cutoff in terms of blob file age for " "garbage collection."); +DEFINE_double(blob_garbage_collection_force_threshold, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_force_threshold, + "[Integrated BlobDB] The threshold for the ratio of garbage in " + "the oldest blob files for forcing garbage collection."); + #ifndef ROCKSDB_LITE // Secondary DB instance Options @@ -4331,6 +4337,8 @@ class Benchmark { FLAGS_enable_blob_garbage_collection; options.blob_garbage_collection_age_cutoff = FLAGS_blob_garbage_collection_age_cutoff; + options.blob_garbage_collection_force_threshold = + FLAGS_blob_garbage_collection_force_threshold; #ifndef ROCKSDB_LITE if (FLAGS_readonly && FLAGS_transaction_db) { diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index 9c3515876..bad4ae5c0 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -276,7 +276,8 @@ const std::string options_file_content = R"OPTIONS_FILE( blob_file_size=10485760 blob_compression_type=kNoCompression enable_blob_garbage_collection=true - blob_garbage_collection_age_cutoff=0.75 + blob_garbage_collection_age_cutoff=0.5 + blob_garbage_collection_force_threshold=0.75 [TableOptions/BlockBasedTable "default"] format_version=0 diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index a616740cd..5838304f0 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -292,6 +292,7 @@ blob_params = { "blob_compression_type": lambda: random.choice(["none", "snappy", "lz4", "zstd"]), "enable_blob_garbage_collection": lambda: random.choice([0] + [1] * 3), "blob_garbage_collection_age_cutoff": lambda: random.choice([0.0, 0.25, 0.5, 0.75, 1.0]), + "blob_garbage_collection_force_threshold": lambda: random.choice([0.5, 0.75, 1.0]), } ts_params = {