From 7d3eb006ffff0657cb85a87f9a2b23c65e9d6542 Mon Sep 17 00:00:00 2001 From: anand1976 <33647610+anand1976@users.noreply.github.com> Date: Mon, 15 Oct 2018 23:20:15 -0700 Subject: [PATCH] Properly determine a truncated CompactRange stop key (#4496) Summary: When a CompactRange() call for a level is truncated before the end key is reached, because it exceeds max_compaction_bytes, we need to properly set the compaction_end parameter to indicate the stop key. The next CompactRange will use that as the begin key. We set it to the smallest key of the next file in the level after expanding inputs to get a clean cut. Previously, we were setting it before expanding inputs. So we could end up recompacting some files. In a pathological case, where a single key has many entries spanning all the files in the level (possibly due to merge operands without a partial merge operator, thus resulting in compaction output identical to the input), this would result in an endless loop over the same set of files. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4496 Differential Revision: D10395026 Pulled By: anand1976 fbshipit-source-id: f0c2f89fee29b4b3be53b6467b53abba8e9146a9 --- HISTORY.md | 1 + db/compaction_picker.cc | 16 ++++++++++----- db/compaction_picker.h | 3 ++- db/db_compaction_test.cc | 44 ++++++++++++++++++++++++++++++++++++++++ db/version_set.cc | 26 ++++++++++++++++++++---- db/version_set.h | 13 +++++++----- 6 files changed, 88 insertions(+), 15 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 20aedf45b..2a2436323 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Bug Fixes * Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds. +* Properly set the stop key for a truncated manual CompactRange ### New Features * Introduced CacheAllocator, which lets the user specify custom allocator for memory in block cache. diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 4931eddfe..36b8029a3 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -219,7 +219,8 @@ void CompactionPicker::GetRange(const std::vector& inputs, bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs) { + CompactionInputFiles* inputs, + InternalKey** next_smallest) { // This isn't good compaction assert(!inputs->empty()); @@ -242,7 +243,8 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, GetRange(*inputs, &smallest, &largest); inputs->clear(); vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, - hint_index, &hint_index); + hint_index, &hint_index, true, + next_smallest); } while (inputs->size() > old_size); // we started off with inputs non-empty and the previous loop only grew @@ -649,7 +651,6 @@ Compaction* CompactionPicker::CompactRange( uint64_t s = inputs[i]->compensated_file_size; total += s; if (total >= limit) { - **compaction_end = inputs[i + 1]->smallest; covering_the_whole_range = false; inputs.files.resize(i + 1); break; @@ -658,7 +659,10 @@ Compaction* CompactionPicker::CompactRange( } assert(output_path_id < static_cast(ioptions_.cf_paths.size())); - if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs) == false) { + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { // manual compaction is now multi-threaded, so it can // happen that ExpandWhileOverlapping fails // we handle it higher in RunManualCompaction @@ -666,8 +670,10 @@ Compaction* CompactionPicker::CompactRange( return nullptr; } - if (covering_the_whole_range) { + if (covering_the_whole_range || !next_smallest) { *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; } CompactionInputFiles output_level_inputs; diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 5f770039a..e9688d754 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -151,7 +151,8 @@ class CompactionPicker { // Will return false if it is impossible to apply this compaction. bool ExpandInputsToCleanCut(const std::string& cf_name, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs); + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); // Returns true if any one of the parent files are being compacted bool IsRangeInCompaction(VersionStorageInfo* vstorage, diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 4f5b9f28a..5136b0392 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -3961,6 +3961,50 @@ INSTANTIATE_TEST_CASE_P( CompactionPri::kOldestSmallestSeqFirst, CompactionPri::kMinOverlappingRatio)); +class NoopMergeOperator : public MergeOperator { + public: + NoopMergeOperator() {} + + virtual bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { + std::string val("bar"); + merge_out->new_value = val; + return true; + } + + virtual const char* Name() const override { return "Noop"; } +}; + +TEST_F(DBCompactionTest, PartialManualCompaction) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 10; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 10240; + DestroyAndReopen(opts); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + Merge("foo", RandomString(&rnd, 1024)); + } + Flush(); + } + + MoveFilesToLevel(2); + + std::string prop; + EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop)); + uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2; + ASSERT_OK(dbfull()->SetOptions( + {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}})); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + dbfull()->CompactRange(cro, nullptr, nullptr); +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 2b3106c00..434d791e7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2028,7 +2028,7 @@ bool VersionStorageInfo::OverlapInLevel(int level, void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool expand_range) const { + bool expand_range, InternalKey** next_smallest) const { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; @@ -2040,11 +2040,17 @@ void VersionStorageInfo::GetOverlappingInputs( } const Comparator* user_cmp = user_comparator_; if (level > 0) { - GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, - hint_index, file_index); + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, false, next_smallest); return; } + if (next_smallest) { + // next_smallest key only makes sense for non-level 0, where files are + // non-overlapping + *next_smallest = nullptr; + } + Slice user_begin, user_end; if (begin != nullptr) { user_begin = begin->user_key(); @@ -2198,7 +2204,7 @@ int sstableKeyCompare(const Comparator* user_cmp, void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool within_interval) const { + bool within_interval, InternalKey** next_smallest) const { assert(level > 0); int min = 0; int mid = 0; @@ -2234,6 +2240,9 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( // If there were no overlapping files, return immediately. if (!foundOverlap) { + if (next_smallest) { + next_smallest = nullptr; + } return; } // returns the index where an overlap is found @@ -2254,6 +2263,15 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (++end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } } // Store in *start_index and *end_index the range of all files in diff --git a/db/version_set.h b/db/version_set.h index 4e3b2bec2..20521fa06 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -188,9 +188,11 @@ class VersionStorageInfo { std::vector* inputs, int hint_index = -1, // index of overlap file int* file_index = nullptr, // return index of overlap file - bool expand_range = true) // if set, returns files which overlap the - const; // range and overlap each other. If false, + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -200,14 +202,15 @@ class VersionStorageInfo { const; void GetOverlappingInputsRangeBinarySearch( - int level, // level > 0 + int level, // level > 0 const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys std::vector* inputs, int hint_index, // index of overlap file int* file_index, // return index of overlap file - bool within_interval = false) // if set, force the inputs within interval - const; + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void ExtendFileRangeOverlappingInterval( int level,