From dd0efa4ee989bfefd7a9feca96f88d507262b7f0 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 6 Dec 2017 19:06:27 -0800 Subject: [PATCH] Preserve overlapping file endpoint invariant Summary: Fix for #2833. - In `DeleteFilesInRange`, use `GetCleanInputsWithinInterval` instead of `GetOverlappingInputs` to make sure we get a clean cut set of files to delete. - In `GetCleanInputsWithinInterval`, support nullptr as `begin_key` or `end_key`. - In `GetOverlappingInputsRangeBinarySearch`, move the assertion for non-empty range away from `ExtendFileRangeWithinInterval`, which should be allowed to return an empty range (via `end_index < begin_index`). Closes https://github.com/facebook/rocksdb/pull/2843 Differential Revision: D5772387 Pulled By: ajkr fbshipit-source-id: e554e8461823c6be82b21a9262a2da02b3957881 --- db/db_compaction_test.cc | 54 +++++++++++++++++++++++++++++++++++ db/db_impl.cc | 24 ++++++---------- db/version_set.cc | 40 +++++++++++++++----------- include/rocksdb/convenience.h | 3 +- 4 files changed, 88 insertions(+), 33 deletions(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index ca77d5b93..93fbc0d37 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -1439,6 +1439,60 @@ TEST_F(DBCompactionTest, DeleteFileRange) { ASSERT_GT(old_num_files, new_num_files); } +TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { + // regression test for #2833: groups of files whose user-keys overlap at the + // endpoints could be split by `DeleteFilesInRange`. This caused old data to + // reappear, either because a new version of the key was removed, or a range + // deletion was partially dropped. It could also cause non-overlapping + // invariant to be violated if the files dropped by DeleteFilesInRange were + // a subset of files that a range deletion spans. + const int kNumL0Files = 2; + const int kValSize = 8 << 10; // 8KB + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + // The snapshot prevents key 1 from having its old version dropped. The low + // `target_file_size_base` ensures two keys will be in each output file. + const Snapshot* snapshot = nullptr; + Random rnd(301); + // The value indicates which flush the key belonged to, which is enough + // for us to determine the keys' relative ages. After L0 flushes finish, + // files look like: + // + // File 0: 0 -> vals[0], 1 -> vals[0] + // File 1: 1 -> vals[1], 2 -> vals[1] + // + // Then L0->L1 compaction happens, which outputs keys as follows: + // + // File 0: 0 -> vals[0], 1 -> vals[1] + // File 1: 1 -> vals[0], 2 -> vals[1] + // + // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that + // would cause `1 -> vals[0]` (an older key) to reappear. + std::string vals[kNumL0Files]; + for (int i = 0; i < kNumL0Files; ++i) { + vals[i] = RandomString(&rnd, kValSize); + Put(Key(i), vals[i]); + Put(Key(i + 1), vals[i]); + Flush(); + if (i == 0) { + snapshot = db_->GetSnapshot(); + } + } + dbfull()->TEST_WaitForCompact(); + + // Verify `DeleteFilesInRange` can't drop only file 0 which would cause + // "1 -> vals[0]" to reappear. + Slice begin = Key(0); + Slice end = Key(1); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_EQ(vals[1], Get(Key(1))); + + db_->ReleaseSnapshot(snapshot); +} + TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; diff --git a/db/db_impl.cc b/db/db_impl.cc index d1bfe41e8..c9f4702c3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -2062,25 +2062,19 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, end_key = &end_storage; } - vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, - nullptr, false); + vstorage->GetCleanInputsWithinInterval(i, begin_key, end_key, + &level_files, -1 /* hint_index */, + nullptr /* file_index */); FileMetaData* level_file; for (uint32_t j = 0; j < level_files.size(); j++) { level_file = level_files[j]; - if (((begin == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->smallest.user_key(), *begin) >= 0)) && - ((end == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->largest.user_key(), *end) <= 0))) { - if (level_file->being_compacted) { - continue; - } - edit.SetColumnFamily(cfd->GetID()); - edit.DeleteFile(i, level_file->fd.GetNumber()); - deleted_files.push_back(level_file); - level_file->being_compacted = true; + if (level_file->being_compacted) { + continue; } + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(i, level_file->fd.GetNumber()); + deleted_files.push_back(level_file); + level_file->being_compacted = true; } } if (edit.GetDeletedFiles().empty()) { diff --git a/db/version_set.cc b/db/version_set.cc index 782ebc263..fec8e8479 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1793,27 +1793,33 @@ void VersionStorageInfo::GetOverlappingInputs( void VersionStorageInfo::GetCleanInputsWithinInterval( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index) const { - if (level >= num_non_empty_levels_) { - // this level is empty, no inputs within range - return; - } - inputs->clear(); - Slice user_begin, user_end; - if (begin != nullptr) { - user_begin = begin->user_key(); - } - if (end != nullptr) { - user_end = end->user_key(); - } if (file_index) { *file_index = -1; } - if (begin != nullptr && end != nullptr && level > 0) { - GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs, - hint_index, file_index, - true /* within_interval */); + if (level >= num_non_empty_levels_ || level == 0 || + level_files_brief_[level].num_files == 0) { + // this level is empty, no inputs within range + // also don't support clean input interval within L0 + return; } + + Slice user_begin, user_end; + const auto& level_files = level_files_brief_[level]; + if (begin == nullptr) { + user_begin = ExtractUserKey(level_files.files[0].smallest_key); + } else { + user_begin = begin->user_key(); + } + if (end == nullptr) { + user_end = ExtractUserKey( + level_files.files[level_files.num_files - 1].largest_key); + } else { + user_end = end->user_key(); + } + GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs, + hint_index, file_index, + true /* within_interval */); } // Store in "*inputs" all files in "level" that overlap [begin,end] @@ -1876,8 +1882,8 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( } else { ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid, &start_index, &end_index); + assert(end_index >= start_index); } - assert(end_index >= start_index); // insert overlapping files into vector for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index 4a60afb11..b09ac4816 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -325,7 +325,8 @@ void CancelAllBackgroundWork(DB* db, bool wait = false); // Delete files which are entirely in the given range // Could leave some keys in the range which are in files which are not -// entirely in the range. +// entirely in the range. Also leaves L0 files regardless of whether they're +// in the range. // Snapshots before the delete might not see the data in the given range. Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end);