From 90fc40690a9184111295d4c8e1cc1ba4c25e2f87 Mon Sep 17 00:00:00 2001
From: Peter Mattis
Date: Fri, 13 Jul 2018 17:34:54 -0700
Subject: [PATCH] Relax VersionStorageInfo::GetOverlappingInputs check (#4050)

Summary:
Do not consider the range tombstone sentinel key as causing 2 adjacent
sstables in a level to overlap. When a range tombstone's end key is the
largest key in an sstable, the sstable's end key is set to a "sentinel"
value that is the smallest key in the next sstable with a sequence number
of kMaxSequenceNumber. This "sentinel" is guaranteed to not overlap in
internal-key space with the next sstable. Unfortunately,
GetOverlappingInputs uses user-keys to determine overlap and was thus
considering 2 adjacent sstables in a level to overlap if they were
separated by this sentinel key. This in turn would cause compactions to be
larger than necessary.

Note that this conflicts with https://github.com/facebook/rocksdb/pull/2769
and causes `DBRangeDelTest.CompactionTreatsSplitInputLevelDeletionAtomically`
to fail.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/4050

Differential Revision: D8844423

Pulled By: ajkr

fbshipit-source-id: df3f9f1db8f4cff2bff77376b98b83c2ae1d155b
---
 db/builder.cc                   |   2 +-
 db/compaction_job.cc            |   2 +-
 db/db_range_del_test.cc         | 104 +++++++++++++++++--
 db/dbformat.h                   |  16 ++-
 db/dbformat_test.cc             |   7 ++
 db/forward_iterator.cc          |   8 +-
 db/range_del_aggregator.cc      |  27 ++++-
 db/range_del_aggregator.h       |  10 +-
 db/range_del_aggregator_test.cc |  43 ++++++--
 db/repair.cc                    |   2 +-
 db/table_cache.cc               |  53 ++--------
 db/table_cache.h                |  11 +-
 db/version_set.cc               | 175 +++++++++++++++++++++-----------
 db/version_set.h                |  24 ++---
 db/version_set_test.cc          |  64 ++++++++++++
 15 files changed, 396 insertions(+), 152 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index 5488daee0..d52361046 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -204,7 +204,7 @@ Status BuildTable(
       // we will regard this verification as user reads since the goal is
       // to cache it here for further user reads
       std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
-          ReadOptions(), env_options, internal_comparator, meta->fd,
+          ReadOptions(), env_options, internal_comparator, *meta,
           nullptr /* range_del_agg */,
           mutable_cf_options.prefix_extractor.get(), nullptr,
           (internal_stats == nullptr) ? nullptr
diff --git a/db/compaction_job.cc b/db/compaction_job.cc
index 15ba15c51..94c6762a2 100644
--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -635,7 +635,7 @@ Status CompactionJob::Run() {
       // to cache it here for further user reads
       InternalIterator* iter = cfd->table_cache()->NewIterator(
           ReadOptions(), env_options_, cfd->internal_comparator(),
-          files_meta[file_idx]->fd, nullptr /* range_del_agg */,
+          *files_meta[file_idx], nullptr /* range_del_agg */,
           prefix_extractor, nullptr,
           cfd->internal_stats()->GetFileReadHist(
               compact_->compaction->output_level()),
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index cf009746b..5c5e2d89c 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -916,11 +916,14 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) {
 }
 
 TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
-  // make sure compaction treats files containing a split range deletion in the
-  // input level as an atomic unit. I.e., compacting any input-level file(s)
-  // containing a portion of the range deletion causes all other input-level
-  // files containing portions of that same range deletion to be included in the
-  // compaction.
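An aside to make the sentinel ordering in the summary concrete: internal keys
sort by user key ascending, then by sequence number descending, so a boundary
key tagged with kMaxSequenceNumber sorts before every real entry with the same
user key. A minimal standalone sketch of that ordering (simplified types;
InternalKeyLite, InternalKeyLess, and kMaxSeq are illustrative stand-ins, not
RocksDB APIs):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Simplified stand-in for a RocksDB internal key; the real encoding packs
    // the sequence number and value type into an 8-byte footer.
    struct InternalKeyLite {
      std::string user_key;
      uint64_t seq;
    };

    constexpr uint64_t kMaxSeq = (1ull << 56) - 1;  // mirrors kMaxSequenceNumber

    // Internal-key order: user key ascending, then sequence number descending.
    bool InternalKeyLess(const InternalKeyLite& a, const InternalKeyLite& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key;
      return a.seq > b.seq;
    }

    int main() {
      InternalKeyLite file1_largest{"b", kMaxSeq};  // range tombstone sentinel
      InternalKeyLite file2_smallest{"b", 5};       // real entry in next file
      // Prints 1: the sentinel sorts strictly before any real "b" entry, so
      // the two files are disjoint in internal-key space.
      std::cout << InternalKeyLess(file1_largest, file2_smallest) << "\n";
      // A user-key-only overlap check compares "b" with "b" and reports a
      // false overlap, which is the behavior this patch removes.
      return 0;
    }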
+  // This test originally verified that compaction treated files containing a
+  // split range deletion in the input level as an atomic unit. I.e.,
+  // compacting any input-level file(s) containing a portion of the range
+  // deletion caused all other input-level files containing portions of that
+  // same range deletion to be included in the compaction. Range deletion
+  // tombstones are now truncated to sstable boundaries, which removes the
+  // need for that behavior (behavior that could lead to excessively large
+  // compactions).
   const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
   Options options = CurrentOptions();
   options.compression = kNoCompression;
@@ -967,22 +970,111 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
     if (i == 0) {
       ASSERT_OK(db_->CompactFiles(
           CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(1));
     } else if (i == 1) {
       auto begin_str = Key(0), end_str = Key(1);
       Slice begin = begin_str, end = end_str;
       ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+      ASSERT_EQ(3, NumTableFilesAtLevel(1));
     } else if (i == 2) {
       ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                                 {{"max_bytes_for_level_base", "10000"}}));
       dbfull()->TEST_WaitForCompact();
+      ASSERT_EQ(1, NumTableFilesAtLevel(1));
     }
-    ASSERT_EQ(0, NumTableFilesAtLevel(1));
     ASSERT_GT(NumTableFilesAtLevel(2), 0);
 
     db_->ReleaseSnapshot(snapshot);
   }
 }
 
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+  // Test the handling of the range-tombstone end-key as the
+  // upper-bound for an sstable.
+
+  const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+  options.memtable_factory.reset(
+      new SpecialSkipListFactory(2 /* num_entries_flush */));
+  options.target_file_size_base = kValueBytes;
+  options.disable_auto_compactions = true;
+
+  DestroyAndReopen(options);
+
+  // Create an initial sstable at L2:
+  //   [key000000#1,1, key000000#1,1]
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  MoveFilesToLevel(2);
+  ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+  // A snapshot protects the range tombstone from being dropped due to
+  // becoming obsolete.
+  const Snapshot* snapshot = db_->GetSnapshot();
+  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                   Key(0), Key(2 * kNumFilesPerLevel));
+
+  // Create 2 additional sstables in L0. Note that the first sstable
+  // contains the range tombstone.
+  //   [key000000#3,1, key000004#72057594037927935,15]
+  //   [key000001#5,1, key000002#6,1]
+  Random rnd(301);
+  std::string value = RandomString(&rnd, kValueBytes);
+  for (int j = 0; j < kNumFilesPerLevel; ++j) {
+    // Give files overlapping key-ranges to prevent a trivial move when we
+    // compact from L0 to L1.
+    ASSERT_OK(Put(Key(j), value));
+    ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+    ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+  }
+  // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+  // are 2 sstables generated in L1 due to the target_file_size_base setting.
+  // L1:
+  //   [key000000#3,1, key000002#72057594037927935,15]
+  //   [key000002#6,1, key000004#72057594037927935,15]
+  // L2:
+  //   [key000000#1,1, key000000#1,1]
+  MoveFilesToLevel(1);
+  ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+  {
+    // Compact the second sstable in L1:
+    // L1:
+    //   [key000000#3,1, key000002#72057594037927935,15]
+    // L2:
+    //   [key000000#1,1, key000000#1,1]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(3);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, nullptr);
+    ASSERT_EQ(1, NumTableFilesAtLevel(1));
+    ASSERT_EQ(2, NumTableFilesAtLevel(2));
+  }
+
+  {
+    // Compact the first sstable in L1. This should be copacetic, but
+    // was previously resulting in overlapping sstables in L2 due to
+    // mishandling of the range tombstone end-key when used as the
+    // largest key for an sstable. The resulting LSM structure should
+    // be:
+    //
+    // L2:
+    //   [key000000#1,1, key000001#72057594037927935,15]
+    //   [key000001#5,1, key000002#72057594037927935,15]
+    //   [key000002#6,1, key000004#72057594037927935,15]
+    auto begin_str = Key(0);
+    const rocksdb::Slice begin = begin_str;
+    dbfull()->TEST_CompactRange(1, &begin, &begin);
+    ASSERT_EQ(0, NumTableFilesAtLevel(1));
+    ASSERT_EQ(3, NumTableFilesAtLevel(2));
+  }
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBRangeDelTest, UnorderedTombstones) {
   // Regression test for #2752. Range delete tombstones between
   // different snapshot stripes are not stored in order, so the first
diff --git a/db/dbformat.h b/db/dbformat.h
index d191fbd4a..81cfceb03 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -140,10 +140,14 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
   return Slice(internal_key.data(), internal_key.size() - 8);
 }
 
-inline ValueType ExtractValueType(const Slice& internal_key) {
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
-  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return DecodeFixed64(internal_key.data() + n - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  uint64_t num = ExtractInternalKeyFooter(internal_key);
   unsigned char c = num & 0xff;
   return static_cast<ValueType>(c);
 }
@@ -606,9 +610,15 @@ struct RangeTombstone {
     return InternalKey(start_key_, seq_, kTypeRangeDeletion);
   }
 
+  // The tombstone end-key is exclusive, so we generate an internal-key here
+  // which has a similar property. Using kMaxSequenceNumber guarantees that
+  // the returned internal-key will compare less than any other internal-key
+  // with the same user-key. This in turn guarantees that the serialized
+  // end-key for a tombstone such as [a-b] will compare less than the key "b".
+  //
   // be careful to use SerializeEndKey(), allocates new memory
   InternalKey SerializeEndKey() const {
-    return InternalKey(end_key_, seq_, kTypeRangeDeletion);
+    return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
   }
 };
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index d96b5757a..0b16c13f5 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -192,6 +192,13 @@ TEST_F(FormatTest, UpdateInternalKey) {
   ASSERT_EQ(new_val_type, decoded.type);
 }
 
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+  RangeTombstone t("a", "b", 2);
+  InternalKey k("b", 3, kTypeValue);
+  const InternalKeyComparator cmp(BytewiseComparator());
+  ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc
index 00cfffc49..42395cc46 100644
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@@ -75,7 +75,7 @@ class ForwardLevelIterator : public InternalIterator {
         cfd_->internal_comparator(), {} /* snapshots */);
     file_iter_ = cfd_->table_cache()->NewIterator(
         read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
-        files_[file_index_]->fd,
+        *files_[file_index_],
         read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
         prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false);
     file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
@@ -635,7 +635,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
       continue;
     }
     l0_iters_.push_back(cfd_->table_cache()->NewIterator(
-        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd,
+        read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
        read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
        sv_->mutable_cf_options.prefix_extractor.get()));
   }
@@ -706,7 +706,7 @@ void ForwardIterator::RenewIterators() {
     }
     l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
         read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
-        l0_files_new[inew]->fd,
+        *l0_files_new[inew],
         read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
         svnew->mutable_cf_options.prefix_extractor.get()));
   }
@@ -766,7 +766,7 @@ void ForwardIterator::ResetIncompleteIterators() {
     DeleteIterator(l0_iters_[i]);
     l0_iters_[i] = cfd_->table_cache()->NewIterator(
         read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
-        l0_files[i]->fd, nullptr /* range_del_agg */,
+        *l0_files[i], nullptr /* range_del_agg */,
         sv_->mutable_cf_options.prefix_extractor.get());
     l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
   }
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index 2fb5752ee..4492e3b97 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -426,7 +426,9 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
 }
 
 Status RangeDelAggregator::AddTombstones(
-    std::unique_ptr<InternalIterator> input) {
+    std::unique_ptr<InternalIterator> input,
+    const InternalKey* smallest,
+    const InternalKey* largest) {
   if (input == nullptr) {
     return Status::OK();
   }
@@ -450,6 +452,29 @@ Status RangeDelAggregator::AddTombstones(
       return Status::Corruption("Unable to parse range tombstone InternalKey");
     }
     RangeTombstone tombstone(parsed_key, input->value());
+    // Truncate the tombstone to the range [smallest, largest].
+    if (smallest != nullptr) {
+      if (icmp_.user_comparator()->Compare(
+              tombstone.start_key_, smallest->user_key()) < 0) {
+        tombstone.start_key_ = smallest->user_key();
+      }
+    }
+    if (largest != nullptr) {
+      // This is subtly correct despite the discrepancy between
+      // FileMetaData::largest being inclusive while RangeTombstone::end_key_
+      // is exclusive. A tombstone will only extend past the bounds of an
+      // sstable if its end-key is the largest key in the table. If that
+      // occurs, the largest key for the table is set based on the smallest
+      // key in the next table in the level. In that case, largest->user_key()
+      // is not actually a key in the current table and thus we can use it as
+      // the exclusive end-key for the tombstone.
+      if (icmp_.user_comparator()->Compare(
+              tombstone.end_key_, largest->user_key()) > 0) {
+        // The largest key should be a tombstone sentinel key.
+        assert(GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber);
+        tombstone.end_key_ = largest->user_key();
+      }
+    }
     GetRangeDelMap(tombstone.seq_).AddTombstone(std::move(tombstone));
     input->Next();
   }
diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h
index 5ba858143..7abd147f9 100644
--- a/db/range_del_aggregator.h
+++ b/db/range_del_aggregator.h
@@ -137,9 +137,15 @@ class RangeDelAggregator {
   bool IsRangeOverlapped(const Slice& start, const Slice& end);
 
   // Adds tombstones to the tombstone aggregation structure maintained by this
-  // object.
+  // object. Tombstones are truncated to the range [smallest, largest]. If
+  // smallest (or largest) is null, it is not used for truncation. When adding
+  // range tombstones present in an sstable, smallest and largest should be
+  // set to the smallest and largest keys from the sstable's file metadata.
+  // Note that tombstone end keys are exclusive while largest is inclusive.
   // @return non-OK status if any of the tombstone keys are corrupted.
-  Status AddTombstones(std::unique_ptr<InternalIterator> input);
+  Status AddTombstones(std::unique_ptr<InternalIterator> input,
+                       const InternalKey* smallest = nullptr,
+                       const InternalKey* largest = nullptr);
 
   // Resets iterators maintained across calls to ShouldDelete(). This may be
  // called when the tombstones change, or the owner may call explicitly, e.g.,
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 85db17b16..6b3423c2a 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -19,6 +19,7 @@ namespace {
 struct ExpectedPoint {
   Slice begin;
   SequenceNumber seq;
+  bool expectAlive;
 };
 
 enum Direction {
@@ -29,7 +30,9 @@ enum Direction {
 static auto icmp = InternalKeyComparator(BytewiseComparator());
 
 void AddTombstones(RangeDelAggregator* range_del_agg,
-                   const std::vector<RangeTombstone>& range_dels) {
+                   const std::vector<RangeTombstone>& range_dels,
+                   const InternalKey* smallest = nullptr,
+                   const InternalKey* largest = nullptr) {
   std::vector<std::string> keys, values;
   for (const auto& range_del : range_dels) {
     auto key_and_value = range_del.Serialize();
@@ -38,7 +41,7 @@ void AddTombstones(RangeDelAggregator* range_del_agg,
   }
   std::unique_ptr<InternalIterator> range_del_iter(
       new test::VectorIterator(keys, values));
-  range_del_agg->AddTombstones(std::move(range_del_iter));
+  range_del_agg->AddTombstones(std::move(range_del_iter), smallest, largest);
 }
 
 void VerifyTombstonesEq(const RangeTombstone& a, const RangeTombstone& b) {
@@ -62,7 +65,9 @@ void VerifyRangeDelIter(
 void VerifyRangeDels(
     const std::vector<RangeTombstone>& range_dels_in,
     const std::vector<ExpectedPoint>& expected_points,
-    const std::vector<RangeTombstone>& expected_collapsed_range_dels) {
+    const std::vector<RangeTombstone>& expected_collapsed_range_dels,
+    const InternalKey* smallest = nullptr,
+    const InternalKey* largest = nullptr) {
   // Test same result regardless of which order the range deletions are added
   // and regardless of collapsed mode.
   for (bool collapsed : {false, true}) {
@@ -73,7 +78,7 @@ void VerifyRangeDels(
       if (dir == kReverse) {
         std::reverse(range_dels.begin(), range_dels.end());
       }
-      AddTombstones(&range_del_agg, range_dels);
+      AddTombstones(&range_del_agg, range_dels, smallest, largest);
 
       auto mode = RangeDelPositioningMode::kFullScan;
       if (collapsed) {
@@ -88,22 +93,29 @@ void VerifyRangeDels(
         ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
         if (parsed_key.sequence > 0) {
           --parsed_key.sequence;
-          ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
+          if (expected_point.expectAlive) {
+            ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
+          } else {
+            ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
+          }
         }
       }
 
       if (collapsed) {
         range_dels = expected_collapsed_range_dels;
-      } else {
-        // Tombstones in an uncollapsed map are presented in start key order.
-        // Tombstones with the same start key are presented in insertion order.
+        VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
+      } else if (smallest == nullptr && largest == nullptr) {
+        // Tombstones in an uncollapsed map are presented in start key
+        // order. Tombstones with the same start key are presented in
+        // insertion order. We don't handle tombstone truncation here, so the
+        // verification is only performed if no truncation was requested.
         std::stable_sort(range_dels.begin(), range_dels.end(),
                          [&](const RangeTombstone& a, const RangeTombstone& b) {
                            return icmp.user_comparator()->Compare(
                                       a.start_key_, b.start_key_) < 0;
                          });
+        VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
       }
-      VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
     }
   }
 }
@@ -275,6 +287,19 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) {
                   {{"c", "d", 20}, {"e", "f", 20}, {"f", "g", 10}});
 }
 
+TEST_F(RangeDelAggregatorTest, TruncateTombstones) {
+  const InternalKey smallest("b", 1, kTypeRangeDeletion);
+  const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
+  VerifyRangeDels(
+      {{"a", "c", 10}, {"d", "f", 10}},
+      {{"a", 10, true},   // truncated
+       {"b", 10, false},  // not truncated
+       {"d", 10, false},  // not truncated
+       {"e", 10, true}},  // truncated
+      {{"b", "c", 10}, {"d", "e", 10}},
+      &smallest, &largest);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/db/repair.cc b/db/repair.cc
index b10873504..91c64734c 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -500,7 +500,7 @@ class Repairer {
     }
     if (status.ok()) {
       InternalIterator* iter = table_cache_->NewIterator(
-          ReadOptions(), env_options_, cfd->internal_comparator(), t->meta.fd,
+          ReadOptions(), env_options_, cfd->internal_comparator(), t->meta,
           nullptr /* range_del_agg */,
           cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
       bool empty = true;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index a6d0fb541..d8a95b969 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -180,7 +180,7 @@ Status TableCache::FindTable(const EnvOptions& env_options,
 
 InternalIterator* TableCache::NewIterator(
     const ReadOptions& options, const EnvOptions& env_options,
-    const InternalKeyComparator& icomparator, const FileDescriptor& fd,
+    const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
     RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
     TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
     bool for_compaction, Arena* arena, bool skip_filters, int level) {
@@ -211,6 +211,7 @@ InternalIterator* TableCache::NewIterator(
     create_new_table_reader = readahead > 0;
   }
 
+  auto& fd = file_meta.fd;
   if (create_new_table_reader) {
     unique_ptr<TableReader> table_reader_unique_ptr;
     s = GetTableReader(
@@ -265,7 +266,10 @@ InternalIterator* TableCache::NewIterator(
         s = range_del_iter->status();
       }
       if (s.ok()) {
-        s = range_del_agg->AddTombstones(std::move(range_del_iter));
+        s = range_del_agg->AddTombstones(
+            std::move(range_del_iter),
+            &file_meta.smallest,
+            &file_meta.largest);
       }
     }
   }
@@ -280,51 +284,14 @@ InternalIterator* TableCache::NewIterator(
   return result;
 }
 
-InternalIterator* TableCache::NewRangeTombstoneIterator(
-    const ReadOptions& options, const EnvOptions& env_options,
-    const InternalKeyComparator& icomparator, const FileDescriptor& fd,
-    HistogramImpl* file_read_hist, bool skip_filters, int level,
-    const SliceTransform* prefix_extractor) {
-  Status s;
-  Cache::Handle* handle = nullptr;
-  TableReader* table_reader = fd.table_reader;
-  if (table_reader == nullptr) {
-    s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor,
-                  options.read_tier == kBlockCacheTier /* no_io */,
-                  true /* record read_stats */, file_read_hist, skip_filters,
-                  level);
-    if (s.ok()) {
-      table_reader = GetTableReaderFromHandle(handle);
-    }
-  }
-  InternalIterator* result = nullptr;
-  if (s.ok()) {
-    result = table_reader->NewRangeTombstoneIterator(options);
-    if (result != nullptr) {
-      if (handle != nullptr) {
-        result->RegisterCleanup(&UnrefEntry, cache_, handle);
-      }
-    }
-  }
-  if (result == nullptr && handle != nullptr) {
-    // the range deletion block didn't exist, or there was a failure between
-    // getting handle and getting iterator.
-    ReleaseHandle(handle);
-  }
-  if (!s.ok()) {
-    assert(result == nullptr);
-    result = NewErrorInternalIterator(s);
-  }
-  return result;
-}
-
 Status TableCache::Get(const ReadOptions& options,
                        const InternalKeyComparator& internal_comparator,
-                       const FileDescriptor& fd, const Slice& k,
+                       const FileMetaData& file_meta, const Slice& k,
                        GetContext* get_context,
                        const SliceTransform* prefix_extractor,
                        HistogramImpl* file_read_hist, bool skip_filters,
                        int level) {
+  auto& fd = file_meta.fd;
   std::string* row_cache_entry = nullptr;
   bool done = false;
 #ifndef ROCKSDB_LITE
@@ -405,7 +372,9 @@ Status TableCache::Get(const ReadOptions& options,
     }
     if (s.ok()) {
       s = get_context->range_del_agg()->AddTombstones(
-          std::move(range_del_iter));
+          std::move(range_del_iter),
+          &file_meta.smallest,
+          &file_meta.largest);
     }
   }
   if (s.ok()) {
diff --git a/db/table_cache.h b/db/table_cache.h
index 614fd2e41..dcf09edff 100644
--- a/db/table_cache.h
+++ b/db/table_cache.h
@@ -53,19 +53,12 @@ class TableCache {
   InternalIterator* NewIterator(
       const ReadOptions& options, const EnvOptions& toptions,
       const InternalKeyComparator& internal_comparator,
-      const FileDescriptor& file_fd, RangeDelAggregator* range_del_agg,
+      const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
       const SliceTransform* prefix_extractor = nullptr,
       TableReader** table_reader_ptr = nullptr,
       HistogramImpl* file_read_hist = nullptr, bool for_compaction = false,
       Arena* arena = nullptr, bool skip_filters = false, int level = -1);
 
-  InternalIterator* NewRangeTombstoneIterator(
-      const ReadOptions& options, const EnvOptions& toptions,
-      const InternalKeyComparator& internal_comparator,
-      const FileDescriptor& file_fd, HistogramImpl* file_read_hist,
-      bool skip_filters, int level,
-      const SliceTransform* prefix_extractor = nullptr);
-
   // If a seek to internal key "k" in specified file finds an entry,
   // call (*handle_result)(arg, found_key, found_value) repeatedly until
   // it returns false.
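The reason these call sites now pass FileMetaData rather than FileDescriptor
is that AddTombstones can clamp each tombstone to the file's boundary keys. A
rough standalone sketch of the clamping rule under the assumptions documented
in range_del_aggregator.cc above (user keys reduced to std::string; Tombstone
and Truncate are illustrative names, not the RocksDB implementation):

    #include <algorithm>
    #include <iostream>
    #include <string>

    struct Tombstone {
      std::string start_key;  // inclusive
      std::string end_key;    // exclusive
    };

    // Clamp a tombstone to an sstable's [smallest, largest] user-key bounds.
    // Treating the inclusive largest as an exclusive end is safe only because
    // a tombstone extends past the file solely when largest is the sentinel
    // taken from the next file, i.e. a user key never present in this file.
    Tombstone Truncate(Tombstone t, const std::string& smallest,
                       const std::string& largest) {
      t.start_key = std::max(t.start_key, smallest);
      t.end_key = std::min(t.end_key, largest);
      return t;
    }

    int main() {
      Tombstone t{"a", "f"};  // deletes user keys in [a, f)
      Tombstone clamped = Truncate(t, "b", "e");
      std::cout << clamped.start_key << " " << clamped.end_key << "\n";  // b e
      return 0;
    }

This matches the expectations in the TruncateTombstones unit test above, where
{"a", "c", 10} and {"d", "f", 10} collapse to {"b", "c", 10} and
{"d", "e", 10}.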
@@ -76,7 +69,7 @@ class TableCache {
   // @param level The level this table is at, -1 for "not set / don't know"
   Status Get(const ReadOptions& options,
              const InternalKeyComparator& internal_comparator,
-             const FileDescriptor& file_fd, const Slice& k,
+             const FileMetaData& file_meta, const Slice& k,
              GetContext* get_context,
              const SliceTransform* prefix_extractor = nullptr,
             HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
diff --git a/db/version_set.cc b/db/version_set.cc
index 1b946f2e8..ced79de75 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -548,8 +548,9 @@ class LevelIterator final : public InternalIterator {
     }
 
     return table_cache_->NewIterator(
-        read_options_, env_options_, icomparator_, file_meta.fd, range_del_agg_,
-        prefix_extractor_, nullptr /* don't need reference to table */,
+        read_options_, env_options_, icomparator_, *file_meta.file_metadata,
+        range_del_agg_, prefix_extractor_,
+        nullptr /* don't need reference to table */,
         file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_,
         level_);
   }
@@ -1000,7 +1001,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
     for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
       const auto& file = storage_info_.LevelFilesBrief(0).files[i];
       merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
-          read_options, soptions, cfd_->internal_comparator(), file.fd,
+          read_options, soptions, cfd_->internal_comparator(), *file.file_metadata,
           range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr,
           cfd_->internal_stats()->GetFileReadHist(0), false, arena,
           false /* skip_filters */, 0 /* level */));
@@ -1053,7 +1054,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
       continue;
     }
     ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
-        read_options, env_options, cfd_->internal_comparator(), file->fd,
+        read_options, env_options, cfd_->internal_comparator(), *file->file_metadata,
        &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr,
         cfd_->internal_stats()->GetFileReadHist(0), false, &arena,
         false /* skip_filters */, 0 /* level */));
@@ -1198,8 +1199,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
     }
 
     *status = table_cache_->Get(
-        read_options, *internal_comparator(), f->fd, ikey, &get_context,
-        mutable_cf_options_.prefix_extractor.get(),
+        read_options, *internal_comparator(), *f->file_metadata, ikey,
+        &get_context, mutable_cf_options_.prefix_extractor.get(),
         cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
         IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
                         fp.IsHitFileLastInLevel()),
@@ -2053,8 +2054,8 @@ void VersionStorageInfo::GetOverlappingInputs(
     *file_index = -1;
   }
   const Comparator* user_cmp = user_comparator_;
-  if (begin != nullptr && end != nullptr && level > 0) {
-    GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
+  if (level > 0) {
+    GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
                                           hint_index, file_index);
     return;
   }
@@ -2109,24 +2110,73 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
     return;
   }
 
-  Slice user_begin, user_end;
   const auto& level_files = level_files_brief_[level];
   if (begin == nullptr) {
-    user_begin = ExtractUserKey(level_files.files[0].smallest_key);
-  } else {
-    user_begin = begin->user_key();
+    begin = &level_files.files[0].file_metadata->smallest;
   }
   if (end == nullptr) {
-    user_end = ExtractUserKey(
-        level_files.files[level_files.num_files - 1].largest_key);
-  } else {
-    user_end = end->user_key();
+    end = &level_files.files[level_files.num_files - 1].file_metadata->largest;
   }
-  GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
                                         hint_index, file_index,
                                         true /* within_interval */);
 }
 
+namespace {
+
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b
+// is null, which provides the property that a==null indicates a key that is
+// less than any key and b==null indicates a key that is greater than any
+// key. Note that the comparison is performed primarily on the user-key
+// portion of the key. If the user-keys compare equal, an additional test is
+// made to sort range tombstone sentinel keys before other keys with the same
+// user-key. The result is that 2 user-keys will compare equal if they differ
+// purely on their sequence number and value, but the range tombstone
+// sentinel for that user-key will compare not equal. This is necessary
+// because the range tombstone sentinel key is set as the largest key for an
+// sstable even though that key never appears in the database. We don't want
+// adjacent sstables to be considered overlapping if they are separated by
+// the range tombstone sentinel.
+int sstableKeyCompare(const Comparator* user_cmp,
+                      const InternalKey& a, const InternalKey& b) {
+  auto c = user_cmp->Compare(a.user_key(), b.user_key());
+  if (c != 0) {
+    return c;
+  }
+  auto a_footer = ExtractInternalKeyFooter(a.Encode());
+  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  if (a_footer == kRangeTombstoneSentinel) {
+    if (b_footer != kRangeTombstoneSentinel) {
+      return -1;
+    }
+  } else if (b_footer == kRangeTombstoneSentinel) {
+    return 1;
+  }
+  return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp,
+                      const InternalKey* a, const InternalKey& b) {
+  if (a == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp,
+                      const InternalKey& a, const InternalKey* b) {
+  if (b == nullptr) {
+    return -1;
+  }
+  return sstableKeyCompare(user_cmp, a, *b);
+}
+
+}  // namespace
+
 // Store in "*inputs" all files in "level" that overlap [begin,end]
 // Employ binary search to find at least one file that overlaps the
 // specified range. From that file, iterate backwards and
 // forward to find all overlapping files.
@@ -2135,7 +2185,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
 // within range [begin, end]. "clean" means there is a boundary
 // between the files in "*inputs" and the surrounding files
 void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
-    int level, const Slice& user_begin, const Slice& user_end,
+    int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
     bool within_interval) const {
   assert(level > 0);
@@ -2143,7 +2193,7 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
   int mid = 0;
   int max = static_cast<int>(files_[level].size()) - 1;
   bool foundOverlap = false;
-  const Comparator* user_cmp = user_comparator_;
+  auto user_cmp = user_comparator_;
 
   // if the caller already knows the index of a file that has overlap,
   // then we can skip the binary search.
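To make the effect of sstableKeyCompare concrete: boundaries that share a user
key compare equal only when both are real keys; the sentinel compares strictly
less. A condensed standalone sketch of the same decision (simplified key type;
Key and SstableKeyCompare are illustrative stand-ins, not the functions
above):

    #include <iostream>
    #include <string>

    struct Key {
      std::string user_key;
      bool is_sentinel;  // footer == (kMaxSequenceNumber, kTypeRangeDeletion)
    };

    // Mirrors sstableKeyCompare: user keys first; on a tie the sentinel sorts
    // strictly before any other key with the same user key.
    int SstableKeyCompare(const Key& a, const Key& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
      if (a.is_sentinel != b.is_sentinel) return a.is_sentinel ? -1 : 1;
      return 0;
    }

    int main() {
      Key f1_largest{"b", true};    // file 1 ends at the sentinel for "b"
      Key f2_smallest{"b", false};  // file 2 starts at a real "b" entry
      // Prints -1: file 1 ends strictly before file 2 begins, so the binary
      // search below no longer treats the two files as overlapping.
      std::cout << SstableKeyCompare(f1_largest, f2_smallest) << "\n";
      return 0;
    }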
@@ -2155,15 +2205,15 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
   while (!foundOverlap && min <= max) {
     mid = (min + max)/2;
     FdWithKeyRange* f = &(level_files_brief_[level].files[mid]);
-    const Slice file_start = ExtractUserKey(f->smallest_key);
-    const Slice file_limit = ExtractUserKey(f->largest_key);
-    if ((!within_interval && user_cmp->Compare(file_limit, user_begin) < 0) ||
-        (within_interval && user_cmp->Compare(file_start, user_begin) < 0)) {
+    auto& smallest = f->file_metadata->smallest;
+    auto& largest = f->file_metadata->largest;
+    if ((!within_interval && sstableKeyCompare(user_cmp, begin, largest) > 0) ||
+        (within_interval && sstableKeyCompare(user_cmp, begin, smallest) > 0)) {
       min = mid + 1;
     } else if ((!within_interval &&
-                user_cmp->Compare(user_end, file_start) < 0) ||
+                sstableKeyCompare(user_cmp, smallest, end) > 0) ||
                (within_interval &&
-                user_cmp->Compare(user_end, file_limit) < 0)) {
+                sstableKeyCompare(user_cmp, largest, end) > 0)) {
       max = mid - 1;
     } else {
       foundOverlap = true;
@@ -2182,10 +2232,10 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
 
   int start_index, end_index;
   if (within_interval) {
-    ExtendFileRangeWithinInterval(level, user_begin, user_end, mid, &start_index,
-                                  &end_index);
+    ExtendFileRangeWithinInterval(level, begin, end, mid,
+                                  &start_index, &end_index);
   } else {
-    ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
+    ExtendFileRangeOverlappingInterval(level, begin, end, mid,
                                        &start_index, &end_index);
     assert(end_index >= start_index);
   }
@@ -2202,21 +2252,28 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
 // and forward to find all overlapping files.
 // Use FileLevel in searching, make it faster
 void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
-    int level, const Slice& user_begin, const Slice& user_end,
+    int level, const InternalKey* begin, const InternalKey* end,
     unsigned int mid_index, int* start_index, int* end_index) const {
-  const Comparator* user_cmp = user_comparator_;
+  auto user_cmp = user_comparator_;
   const FdWithKeyRange* files = level_files_brief_[level].files;
 #ifndef NDEBUG
   {
     // assert that the file at mid_index overlaps with the range
     assert(mid_index < level_files_brief_[level].num_files);
     const FdWithKeyRange* f = &files[mid_index];
-    const Slice fstart = ExtractUserKey(f->smallest_key);
-    const Slice flimit = ExtractUserKey(f->largest_key);
-    if (user_cmp->Compare(fstart, user_begin) >= 0) {
-      assert(user_cmp->Compare(fstart, user_end) <= 0);
+    auto& smallest = f->file_metadata->smallest;
+    auto& largest = f->file_metadata->largest;
+    if (sstableKeyCompare(user_cmp, begin, smallest) <= 0) {
+      assert(sstableKeyCompare(user_cmp, smallest, end) <= 0);
     } else {
-      assert(user_cmp->Compare(flimit, user_begin) >= 0);
+      // fprintf(stderr, "ExtendFileRangeOverlappingInterval\n%s - %s\n%s - %s\n%d %d\n",
+      //         begin ? begin->DebugString().c_str() : "(null)",
+      //         end ? end->DebugString().c_str() : "(null)",
+      //         smallest->DebugString().c_str(),
+      //         largest->DebugString().c_str(),
+      //         sstableKeyCompare(user_cmp, smallest, begin),
+      //         sstableKeyCompare(user_cmp, largest, begin));
+      assert(sstableKeyCompare(user_cmp, begin, largest) <= 0);
     }
   }
 #endif
@@ -2228,8 +2285,8 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
   // check backwards from 'mid' to lower indices
   for (int i = mid_index; i >= 0 ; i--) {
     const FdWithKeyRange* f = &files[i];
-    const Slice file_limit = ExtractUserKey(f->largest_key);
-    if (user_cmp->Compare(file_limit, user_begin) >= 0) {
+    auto& largest = f->file_metadata->largest;
+    if (sstableKeyCompare(user_cmp, begin, largest) <= 0) {
       *start_index = i;
       assert((count++, true));
     } else {
@@ -2240,8 +2297,8 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
   for (unsigned int i = mid_index+1;
        i < level_files_brief_[level].num_files; i++) {
     const FdWithKeyRange* f = &files[i];
-    const Slice file_start = ExtractUserKey(f->smallest_key);
-    if (user_cmp->Compare(file_start, user_end) <= 0) {
+    auto& smallest = f->file_metadata->smallest;
+    if (sstableKeyCompare(user_cmp, smallest, end) <= 0) {
       assert((count++, true));
       *end_index = i;
     } else {
@@ -2259,39 +2316,36 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
 // the clean range required.
 // Use FileLevel in searching, make it faster
 void VersionStorageInfo::ExtendFileRangeWithinInterval(
-    int level, const Slice& user_begin, const Slice& user_end,
+    int level, const InternalKey* begin, const InternalKey* end,
     unsigned int mid_index, int* start_index, int* end_index) const {
   assert(level != 0);
-  const Comparator* user_cmp = user_comparator_;
+  auto* user_cmp = user_comparator_;
   const FdWithKeyRange* files = level_files_brief_[level].files;
 #ifndef NDEBUG
   {
     // assert that the file at mid_index is within the range
     assert(mid_index < level_files_brief_[level].num_files);
     const FdWithKeyRange* f = &files[mid_index];
-    const Slice fstart = ExtractUserKey(f->smallest_key);
-    const Slice flimit = ExtractUserKey(f->largest_key);
-    assert(user_cmp->Compare(fstart, user_begin) >= 0 &&
-           user_cmp->Compare(flimit, user_end) <= 0);
+    auto& smallest = f->file_metadata->smallest;
+    auto& largest = f->file_metadata->largest;
+    assert(sstableKeyCompare(user_cmp, begin, smallest) <= 0 &&
+           sstableKeyCompare(user_cmp, largest, end) <= 0);
   }
 #endif
-  ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid_index,
+  ExtendFileRangeOverlappingInterval(level, begin, end, mid_index,
                                      start_index, end_index);
   int left = *start_index;
   int right = *end_index;
   // shrink from left to right
   while (left <= right) {
-    const Slice& first_key_in_range = ExtractUserKey(files[left].smallest_key);
-    if (user_cmp->Compare(first_key_in_range, user_begin) < 0) {
+    auto& smallest = files[left].file_metadata->smallest;
+    if (sstableKeyCompare(user_cmp, begin, smallest) > 0) {
      left++;
       continue;
     }
     if (left > 0) {  // If not first file
-      const Slice& last_key_before =
-          ExtractUserKey(files[left - 1].largest_key);
-      if (user_cmp->Equal(first_key_in_range, last_key_before)) {
-        // The first user key in range overlaps with the previous file's last
-        // key
+      auto& largest = files[left - 1].file_metadata->largest;
+      if (sstableKeyCompare(user_cmp, smallest, largest) == 0) {
         left++;
         continue;
       }
@@ -2300,16 +2354,15 @@ void VersionStorageInfo::ExtendFileRangeWithinInterval(
   }
   // shrink from right to left
   while (left <= right) {
-    const Slice last_key_in_range =
-        ExtractUserKey(files[right].largest_key);
-    if (user_cmp->Compare(last_key_in_range, user_end) > 0) {
+    auto& largest = files[right].file_metadata->largest;
+    if (sstableKeyCompare(user_cmp, largest, end) > 0) {
       right--;
       continue;
     }
     if (right < static_cast<int>(level_files_brief_[level].num_files) - 1) {
       // If not the last file
-      const Slice first_key_after =
-          ExtractUserKey(files[right + 1].smallest_key);
-      if (user_cmp->Equal(last_key_in_range, first_key_after)) {
+      auto& smallest = files[right + 1].file_metadata->smallest;
+      if (sstableKeyCompare(user_cmp, smallest, largest) == 0) {
         // The last user key in range overlaps with the next file's first key
         right--;
         continue;
@@ -4030,8 +4083,8 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
     // approximate offset of "key" within the table.
     TableReader* table_reader_ptr;
     InternalIterator* iter = v->cfd_->table_cache()->NewIterator(
-        ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), f.fd,
-        nullptr /* range_del_agg */,
+        ReadOptions(), v->env_options_, v->cfd_->internal_comparator(),
+        *f.file_metadata, nullptr /* range_del_agg */,
         v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr);
     if (table_reader_ptr != nullptr) {
       result = table_reader_ptr->ApproximateOffsetOf(key);
@@ -4111,7 +4164,7 @@ InternalIterator* VersionSet::MakeInputIterator(
       for (size_t i = 0; i < flevel->num_files; i++) {
         list[num++] = cfd->table_cache()->NewIterator(
             read_options, env_options_compactions, cfd->internal_comparator(),
-            flevel->files[i].fd, range_del_agg,
+            *flevel->files[i].file_metadata, range_del_agg,
             c->mutable_cf_options()->prefix_extractor.get(),
             nullptr /* table_reader_ptr */,
             nullptr /* no per level latency histogram */,
diff --git a/db/version_set.h b/db/version_set.h
index 7abea9855..269592b6c 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -201,8 +201,8 @@ class VersionStorageInfo {
 
   void GetOverlappingInputsRangeBinarySearch(
       int level,                 // level > 0
-      const Slice& begin,        // nullptr means before all keys
-      const Slice& end,          // nullptr means after all keys
+      const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,    // nullptr means after all keys
       std::vector<FileMetaData*>* inputs,
       int hint_index,            // index of overlap file
       int* file_index,           // return index of overlap file
@@ -211,20 +211,20 @@ class VersionStorageInfo {
 
   void ExtendFileRangeOverlappingInterval(
       int level,
-      const Slice& begin,        // nullptr means before all keys
-      const Slice& end,          // nullptr means after all keys
-      unsigned int index,        // start extending from this index
-      int* startIndex,           // return the startIndex of input range
-      int* endIndex)             // return the endIndex of input range
+      const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,    // nullptr means after all keys
+      unsigned int index,        // start extending from this index
+      int* startIndex,           // return the startIndex of input range
+      int* endIndex)             // return the endIndex of input range
       const;
 
   void ExtendFileRangeWithinInterval(
       int level,
-      const Slice& begin,        // nullptr means before all keys
-      const Slice& end,          // nullptr means after all keys
-      unsigned int index,        // start extending from this index
-      int* startIndex,           // return the startIndex of input range
-      int* endIndex)             // return the endIndex of input range
+      const InternalKey* begin,  // nullptr means before all keys
+      const InternalKey* end,    // nullptr means after all keys
+      unsigned int index,        // start extending from this index
+      int* startIndex,           // return the startIndex of input range
+      int* endIndex)             // return the endIndex of input range
       const;
 
   // Returns true iff some file in the specified level overlaps
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index 5ce923287..7ad823cb4 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -11,6 +11,7 @@
 #include "db/log_writer.h"
 #include "table/mock_table.h"
 #include "util/logging.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -139,6 +140,35 @@ class VersionStorageInfoTest : public testing::Test {
     f->num_deletions = 0;
     vstorage_.AddFile(level, f);
   }
+
+  void Add(int level, uint32_t file_number, const InternalKey& smallest,
+           const InternalKey& largest, uint64_t file_size = 0) {
+    assert(level < vstorage_.num_levels());
+    FileMetaData* f = new FileMetaData;
+    f->fd = FileDescriptor(file_number, 0, file_size);
+    f->smallest = smallest;
+    f->largest = largest;
+    f->compensated_file_size = file_size;
+    f->refs = 0;
+    f->num_entries = 0;
+    f->num_deletions = 0;
+    vstorage_.AddFile(level, f);
+  }
+
+  std::string GetOverlappingFiles(int level, const InternalKey& begin,
+                                  const InternalKey& end) {
+    std::vector<FileMetaData*> inputs;
+    vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+    std::string result;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      if (i > 0) {
+        result += ",";
+      }
+      AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+    }
+    return result;
+  }
 };
 
 TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
@@ -261,6 +291,40 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
   ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
 }
 
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+  // Two files that overlap at the range deletion tombstone sentinel.
+  Add(1, 1U, {"a", 0, kTypeValue},
+      {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+  Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+  // Two files that overlap at the same user key.
+  Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+  Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+  // Two files that do not overlap.
+  Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+  Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+  vstorage_.UpdateNumNonEmptyLevels();
+  vstorage_.GenerateLevelFilesBrief();
+
+  ASSERT_EQ("1,2", GetOverlappingFiles(
+      1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+  ASSERT_EQ("1", GetOverlappingFiles(
+      1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+  ASSERT_EQ("2", GetOverlappingFiles(
+      1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue}));
+  ASSERT_EQ("3,4", GetOverlappingFiles(
+      1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+  ASSERT_EQ("3", GetOverlappingFiles(
+      1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+  ASSERT_EQ("3,4", GetOverlappingFiles(
+      1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue}));
+  ASSERT_EQ("3,4", GetOverlappingFiles(
+      1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+  ASSERT_EQ("5", GetOverlappingFiles(
+      1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+  ASSERT_EQ("6", GetOverlappingFiles(
+      1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+
 class FindLevelFileTest : public testing::Test {
  public:
  LevelFilesBrief file_level_;
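A compact way to summarize the rule exercised by the GetOverlappingInputs test
above: two files overlap unless one ends strictly before the other begins
under the boundary comparison. A standalone sketch (reusing the simplified
sentinel-aware comparator idea from the earlier notes; illustrative only):

    #include <iostream>
    #include <string>

    struct Key {
      std::string user_key;
      bool is_sentinel;  // true for a range tombstone sentinel boundary
    };

    int Cmp(const Key& a, const Key& b) {
      if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
      if (a.is_sentinel != b.is_sentinel) return a.is_sentinel ? -1 : 1;
      return 0;
    }

    struct File {
      Key smallest, largest;
    };

    // Files overlap unless one ends strictly before the other begins.
    bool Overlap(const File& a, const File& b) {
      return Cmp(a.largest, b.smallest) >= 0 && Cmp(b.largest, a.smallest) >= 0;
    }

    int main() {
      File f1{{"a", false}, {"b", true}};   // like files 1 and 2 in the test:
      File f2{{"b", false}, {"c", false}};  // separated by the sentinel
      File f3{{"d", false}, {"e", false}};  // like files 3 and 4: a real
      File f4{{"e", false}, {"f", false}};  // shared user key at the boundary
      std::cout << Overlap(f1, f2) << Overlap(f3, f4) << "\n";  // prints 01
      return 0;
    }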