From b104b8781452d1922cfba07c782a094a43977d18 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Thu, 5 Jan 2017 10:22:46 -0800
Subject: [PATCH] Maintain position in range deletions map

Summary:
When deletion-collapsing mode is enabled (i.e., for DBIter/CompactionIterator),
we maintain position in the tombstone maps across calls to ShouldDelete().
Since iterators often access keys sequentially (or reverse-sequentially),
scanning forward/backward from the last position can be faster than
binary-searching the map for every key.

- When Next() is invoked on an iterator, we use kForwardTraversal to scan
  forwards, if needed, until arriving at the range deletion containing the
  next key.
- Similarly for Prev(), we use kBackwardTraversal to scan backwards in the
  range deletion map.
- When the iterator seeks, we use kBinarySearch for repositioning.
- After tombstones are added or before the first ShouldDelete() invocation,
  the current position is set to invalid, which forces kBinarySearch to be
  used.
- Non-iterator users (i.e., Get()) use kFullScan, which has the same behavior
  as before: scan the whole map for every key passed to ShouldDelete().
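To make the collapsed-map positioning concrete, here is a minimal standalone
sketch of the idea; it is illustrative only, and none of these names exist in
the patch. In collapsed mode the map is keyed by tombstone start key, each
entry's seqnum covers keys up to the next entry, and a cached iterator lets
sequential lookups advance a step at a time instead of redoing upper_bound():

#include <cstdint>
#include <iterator>
#include <map>
#include <string>

class CollapsedTombstones {
 public:
  enum Mode { kForwardTraversal, kBinarySearch };

  // Entry covers [start, next entry's key) at seqnum `seq`; seq == 0 acts as
  // a sentinel meaning "no tombstone covers this range".
  void Add(const std::string& start, uint64_t seq) {
    map_[start] = seq;
    iter_ = map_.end();  // cached position is stale; force a re-seek
  }

  // Returns true if a key written at `key_seq` is covered by a tombstone.
  // kForwardTraversal assumes keys arrive in non-decreasing order.
  bool ShouldDelete(const std::string& key, uint64_t key_seq, Mode mode) {
    if (map_.empty()) {
      return false;
    }
    if (iter_ == map_.end()) {
      mode = kBinarySearch;  // cached position is invalid, e.g., after Add()
    }
    if (mode == kBinarySearch) {
      auto it = map_.upper_bound(key);
      if (it == map_.begin()) {
        return false;  // key precedes all deletion intervals
      }
      iter_ = std::prev(it);
    } else {  // kForwardTraversal: walk forward from the cached position
      if (iter_ == map_.begin() && key < iter_->first) {
        return false;  // key precedes all deletion intervals
      }
      while (std::next(iter_) != map_.end() && std::next(iter_)->first <= key) {
        ++iter_;
      }
    }
    // iter_ now points at the interval containing `key`.
    return key_seq < iter_->second;
  }

 private:
  std::map<std::string, uint64_t> map_;
  std::map<std::string, uint64_t>::const_iterator iter_ = map_.end();
};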
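Building on that sketch, a hypothetical driver for the access pattern the
modes are designed for: sequential lookups reuse the cached position, and the
first lookup after a mutation falls back to binary search because the cached
position was invalidated:

#include <cstdio>

int main() {
  CollapsedTombstones t;
  t.Add("b", 10);  // tombstone covers [b, d) at seqnum 10
  t.Add("d", 0);   // sentinel: coverage ends at d
  const char* keys[] = {"a", "b", "c", "d"};
  for (const char* k : keys) {  // forward scan, like Next() on an iterator
    bool deleted =
        t.ShouldDelete(k, 5, CollapsedTombstones::kForwardTraversal);
    std::printf("%s -> %s\n", k, deleted ? "deleted" : "kept");
  }
  // prints: a -> kept, b -> deleted, c -> deleted, d -> kept
  return 0;
}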
Closes https://github.com/facebook/rocksdb/pull/1701

Differential Revision: D4350318

Pulled By: ajkr

fbshipit-source-id: 5129b76
---
 db/compaction_iterator.cc       |   3 +-
 db/db_iter.cc                   |  44 +++++++---
 db/merge_helper.cc              |   5 +-
 db/range_del_aggregator.cc      | 146 ++++++++++++++++++++++++--------
 db/range_del_aggregator.h       |  50 ++++++++++-
 db/range_del_aggregator_test.cc |   8 +-
 6 files changed, 202 insertions(+), 54 deletions(-)

diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc
index 152a1de54..a57f558d7 100644
--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -466,7 +466,8 @@ void CompactionIterator::NextFromInput() {
     } else {
       // 1. new user key -OR-
       // 2. different snapshot stripe
-      bool should_delete = range_del_agg_->ShouldDelete(key_);
+      bool should_delete = range_del_agg_->ShouldDelete(
+          key_, RangeDelAggregator::RangePositioningMode::kForwardTraversal);
       if (should_delete) {
         ++iter_stats_.num_record_drop_hidden;
         ++iter_stats_.num_record_drop_range_del;
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 260eb2c5f..54d8351a8 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -412,7 +412,9 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
           saved_key_.SetKey(
               ikey.user_key,
               !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
-          if (range_del_agg_.ShouldDelete(ikey)) {
+          if (range_del_agg_.ShouldDelete(
+                  ikey, RangeDelAggregator::RangePositioningMode::
+                            kForwardTraversal)) {
             // Arrange to skip all upcoming entries for this key since
             // they are hidden by this deletion.
             skipping = true;
@@ -427,7 +429,9 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) {
           saved_key_.SetKey(
               ikey.user_key,
               !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
-          if (range_del_agg_.ShouldDelete(ikey)) {
+          if (range_del_agg_.ShouldDelete(
+                  ikey, RangeDelAggregator::RangePositioningMode::
+                            kForwardTraversal)) {
             // Arrange to skip all upcoming entries for this key since
             // they are hidden by this deletion.
             skipping = true;
@@ -530,7 +534,9 @@ void DBIter::MergeValuesNewToOld() {
       // hit the next user key, stop right here
       break;
     } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
-               range_del_agg_.ShouldDelete(ikey)) {
+               range_del_agg_.ShouldDelete(
+                   ikey, RangeDelAggregator::RangePositioningMode::
+                             kForwardTraversal)) {
       // hit a delete with the same user key, stop right here
       // iter_ is positioned after delete
       iter_->Next();
@@ -599,6 +605,7 @@ void DBIter::ReverseToForward() {
   direction_ = kForward;
   if (!iter_->Valid()) {
     iter_->SeekToFirst();
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
 }
 
@@ -614,6 +621,7 @@ void DBIter::ReverseToBackward() {
   // previous key.
   if (!iter_->Valid()) {
     iter_->SeekToLast();
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
   ParsedInternalKey ikey;
   FindParseableKey(&ikey, kReverse);
@@ -710,7 +718,9 @@ bool DBIter::FindValueForCurrentKey() {
     last_key_entry_type = ikey.type;
     switch (last_key_entry_type) {
       case kTypeValue:
-        if (range_del_agg_.ShouldDelete(ikey)) {
+        if (range_del_agg_.ShouldDelete(
+                ikey,
+                RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
           last_key_entry_type = kTypeRangeDeletion;
           PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
         } else {
@@ -727,7 +737,9 @@ bool DBIter::FindValueForCurrentKey() {
         PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
         break;
      case kTypeMerge:
-        if (range_del_agg_.ShouldDelete(ikey)) {
+        if (range_del_agg_.ShouldDelete(
+                ikey,
+                RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
          merge_context_.Clear();
           last_key_entry_type = kTypeRangeDeletion;
           last_not_merge_type = last_key_entry_type;
@@ -805,7 +817,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   FindParseableKey(&ikey, kForward);
 
   if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
-      range_del_agg_.ShouldDelete(ikey)) {
+      range_del_agg_.ShouldDelete(
+          ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
     valid_ = false;
     return false;
   }
@@ -820,9 +833,12 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
     // in operands
     current_entry_is_merged_ = true;
     merge_context_.Clear();
-    while (iter_->Valid() &&
-           user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) &&
-           ikey.type == kTypeMerge && !range_del_agg_.ShouldDelete(ikey)) {
+    while (
+        iter_->Valid() &&
+        user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) &&
+        ikey.type == kTypeMerge &&
+        !range_del_agg_.ShouldDelete(
+            ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
       merge_context_.PushOperand(iter_->value(),
                                  iter_->IsValuePinned() /* operand_pinned */);
       PERF_COUNTER_ADD(internal_merge_count, 1);
@@ -834,7 +850,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
     if (!iter_->Valid() ||
         !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) ||
         ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
-        range_del_agg_.ShouldDelete(ikey)) {
+        range_del_agg_.ShouldDelete(
+            ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) {
       s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetKey(),
                                       nullptr, merge_context_.GetOperands(),
                                       &saved_value_, logger_, statistics_, env_,
@@ -929,12 +946,13 @@ void DBIter::Seek(const Slice& target) {
   StopWatch sw(env_, statistics_, DB_SEEK);
   ReleaseTempPinnedData();
   saved_key_.Clear();
-  // now savved_key is used to store internal key.
+  // now saved_key is used to store internal key.
   saved_key_.SetInternalKey(target, sequence_);
 
   {
     PERF_TIMER_GUARD(seek_internal_seek_time);
     iter_->Seek(saved_key_.GetKey());
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
   RecordTick(statistics_, NUMBER_DB_SEEK);
   if (iter_->Valid()) {
@@ -974,6 +992,7 @@ void DBIter::SeekForPrev(const Slice& target) {
   {
     PERF_TIMER_GUARD(seek_internal_seek_time);
     iter_->SeekForPrev(saved_key_.GetKey());
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
   RecordTick(statistics_, NUMBER_DB_SEEK);
 
@@ -1015,6 +1034,7 @@ void DBIter::SeekToFirst() {
   {
     PERF_TIMER_GUARD(seek_internal_seek_time);
     iter_->SeekToFirst();
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
   RecordTick(statistics_, NUMBER_DB_SEEK);
 
@@ -1048,12 +1068,14 @@ void DBIter::SeekToLast() {
   {
     PERF_TIMER_GUARD(seek_internal_seek_time);
     iter_->SeekToLast();
+    range_del_agg_.InvalidateTombstoneMapPositions();
   }
   // When the iterate_upper_bound is set to a value,
   // it will seek to the last key before the
   // ReadOptions.iterate_upper_bound
   if (iter_->Valid() && iterate_upper_bound_ != nullptr) {
     SeekForPrev(*iterate_upper_bound_);
+    range_del_agg_.InvalidateTombstoneMapPositions();
     if (!Valid()) {
       return;
     } else if (user_comparator_->Equal(*iterate_upper_bound_, key())) {
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
index 5a1b047d6..04e7bcb80 100644
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@@ -192,7 +192,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
               ? CompactionFilter::Decision::kKeep
               : FilterMerge(orig_ikey.user_key, value_slice);
       if (range_del_agg != nullptr &&
-          range_del_agg->ShouldDelete(iter->key()) &&
+
+          range_del_agg->ShouldDelete(
+              iter->key(),
+              RangeDelAggregator::RangePositioningMode::kForwardTraversal) &&
           filter != CompactionFilter::Decision::kRemoveAndSkipUntil) {
         filter = CompactionFilter::Decision::kRemove;
       }
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index 6e6574837..27a680fea 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -32,16 +32,19 @@ void RangeDelAggregator::InitRep(const std::vector<SequenceNumber>& snapshots) {
   for (auto snapshot : snapshots) {
     rep_->stripe_map_.emplace(
         snapshot,
-        TombstoneMap(stl_wrappers::LessOfComparator(icmp_.user_comparator())));
+        PositionalTombstoneMap(TombstoneMap(
+            stl_wrappers::LessOfComparator(icmp_.user_comparator()))));
   }
   // Data newer than any snapshot falls in this catch-all stripe
   rep_->stripe_map_.emplace(
       kMaxSequenceNumber,
-      TombstoneMap(stl_wrappers::LessOfComparator(icmp_.user_comparator())));
+      PositionalTombstoneMap(TombstoneMap(
+          stl_wrappers::LessOfComparator(icmp_.user_comparator()))));
   rep_->pinned_iters_mgr_.StartPinning();
 }
 
-bool RangeDelAggregator::ShouldDelete(const Slice& internal_key) {
+bool RangeDelAggregator::ShouldDelete(
+    const Slice& internal_key, RangeDelAggregator::RangePositioningMode mode) {
   if (rep_ == nullptr) {
     return false;
   }
@@ -49,36 +52,96 @@ bool RangeDelAggregator::ShouldDelete(const Slice& internal_key) {
   if (!ParseInternalKey(internal_key, &parsed)) {
     assert(false);
   }
-  return ShouldDelete(parsed);
+  return ShouldDelete(parsed, mode);
 }
 
-bool RangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed) {
+bool RangeDelAggregator::ShouldDelete(
+    const ParsedInternalKey& parsed,
+    RangeDelAggregator::RangePositioningMode mode) {
   assert(IsValueType(parsed.type));
   if (rep_ == nullptr) {
     return false;
   }
-  const auto& tombstone_map = GetTombstoneMap(parsed.sequence);
-  if (collapse_deletions_) {
-    auto iter = tombstone_map.upper_bound(parsed.user_key);
-    if (iter == tombstone_map.begin()) {
+  auto& positional_tombstone_map = GetPositionalTombstoneMap(parsed.sequence);
+  const auto& tombstone_map = positional_tombstone_map.raw_map;
+  if (tombstone_map.empty()) {
+    return false;
+  }
+  auto& tombstone_map_iter = positional_tombstone_map.iter;
+  if (tombstone_map_iter == tombstone_map.end() &&
+      (mode == kForwardTraversal || mode == kBackwardTraversal)) {
+    // invalid (e.g., if AddTombstones() changed the deletions), so need to
+    // reseek
+    mode = kBinarySearch;
+  }
+  switch (mode) {
+    case kFullScan:
+      assert(!collapse_deletions_);
+      // The maintained state (PositionalTombstoneMap::iter) isn't useful when
+      // we linear scan from the beginning each time, but we maintain it
+      // anyways for consistency.
+      tombstone_map_iter = tombstone_map.begin();
+      while (tombstone_map_iter != tombstone_map.end()) {
+        const auto& tombstone = tombstone_map_iter->second;
+        if (icmp_.user_comparator()->Compare(parsed.user_key,
+                                             tombstone.start_key_) < 0) {
+          break;
+        }
+        if (parsed.sequence < tombstone.seq_ &&
+            icmp_.user_comparator()->Compare(parsed.user_key,
+                                             tombstone.end_key_) < 0) {
+          return true;
+        }
+        ++tombstone_map_iter;
+      }
       return false;
-    }
-    --iter;
-    return parsed.sequence < iter->second.seq_;
-  }
-  for (const auto& start_key_and_tombstone : tombstone_map) {
-    const auto& tombstone = start_key_and_tombstone.second;
-    if (icmp_.user_comparator()->Compare(parsed.user_key,
-                                         tombstone.start_key_) < 0) {
+    case kForwardTraversal:
+      assert(collapse_deletions_ && tombstone_map_iter != tombstone_map.end());
+      if (tombstone_map_iter == tombstone_map.begin() &&
+          icmp_.user_comparator()->Compare(parsed.user_key,
+                                           tombstone_map_iter->first) < 0) {
+        // before start of deletion intervals
+        return false;
+      }
+      while (std::next(tombstone_map_iter) != tombstone_map.end() &&
+             icmp_.user_comparator()->Compare(
+                 std::next(tombstone_map_iter)->first, parsed.user_key) <= 0) {
+        ++tombstone_map_iter;
+      }
+      break;
+    case kBackwardTraversal:
+      assert(collapse_deletions_ && tombstone_map_iter != tombstone_map.end());
+      while (tombstone_map_iter != tombstone_map.begin() &&
+             icmp_.user_comparator()->Compare(parsed.user_key,
+                                              tombstone_map_iter->first) < 0) {
+        --tombstone_map_iter;
+      }
+      if (tombstone_map_iter == tombstone_map.begin() &&
+          icmp_.user_comparator()->Compare(parsed.user_key,
+                                           tombstone_map_iter->first) < 0) {
+        // before start of deletion intervals
+        return false;
+      }
+      break;
+    case kBinarySearch:
+      assert(collapse_deletions_);
+      tombstone_map_iter =
+          tombstone_map.upper_bound(parsed.user_key);
+      if (tombstone_map_iter == tombstone_map.begin()) {
+        // before start of deletion intervals
+        return false;
+      }
+      --tombstone_map_iter;
       break;
-    }
-    if (parsed.sequence < tombstone.seq_ &&
-        icmp_.user_comparator()->Compare(parsed.user_key, tombstone.end_key_) <
-            0) {
-      return true;
-    }
   }
-  return false;
+  assert(mode != kFullScan);
+  assert(tombstone_map_iter != tombstone_map.end() &&
+         icmp_.user_comparator()->Compare(tombstone_map_iter->first,
+                                          parsed.user_key) <= 0);
+  assert(std::next(tombstone_map_iter) == tombstone_map.end() ||
+         icmp_.user_comparator()->Compare(
+             parsed.user_key, std::next(tombstone_map_iter)->first) < 0);
+  return parsed.sequence < tombstone_map_iter->second.seq_;
 }
 
 bool RangeDelAggregator::ShouldAddTombstones(
@@ -97,7 +160,7 @@ bool RangeDelAggregator::ShouldAddTombstones(
     ++stripe_map_iter;
   }
   while (stripe_map_iter != rep_->stripe_map_.end()) {
-    if (!stripe_map_iter->second.empty()) {
+    if (!stripe_map_iter->second.raw_map.empty()) {
       return true;
     }
     ++stripe_map_iter;
@@ -116,6 +179,8 @@ Status RangeDelAggregator::AddTombstones(
     if (first_iter) {
       if (rep_ == nullptr) {
         InitRep({upper_bound_});
+      } else {
+        InvalidateTombstoneMapPositions();
       }
       first_iter = false;
     }
@@ -133,8 +198,19 @@ Status RangeDelAggregator::AddTombstones(
   return Status::OK();
 }
 
+void RangeDelAggregator::InvalidateTombstoneMapPositions() {
+  if (rep_ == nullptr) {
+    return;
+  }
+  for (auto stripe_map_iter = rep_->stripe_map_.begin();
+       stripe_map_iter != rep_->stripe_map_.end(); ++stripe_map_iter) {
+    stripe_map_iter->second.iter = stripe_map_iter->second.raw_map.end();
+  }
+}
+
 Status RangeDelAggregator::AddTombstone(RangeTombstone tombstone) {
-  auto& tombstone_map = GetTombstoneMap(tombstone.seq_);
+  auto& positional_tombstone_map = GetPositionalTombstoneMap(tombstone.seq_);
+  auto& tombstone_map = positional_tombstone_map.raw_map;
   if (collapse_deletions_) {
     // In collapsed mode, we only fill the seq_ field in the TombstoneMap's
     // values. The end_key is unneeded because we assume the tombstone extends
@@ -286,8 +362,8 @@ Status RangeDelAggregator::AddTombstone(RangeTombstone tombstone) {
   return Status::OK();
 }
 
-RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
-    SequenceNumber seq) {
+RangeDelAggregator::PositionalTombstoneMap&
+RangeDelAggregator::GetPositionalTombstoneMap(SequenceNumber seq) {
   assert(rep_ != nullptr);
   // The stripe includes seqnum for the snapshot above and excludes seqnum for
   // the snapshot below.
@@ -321,12 +397,12 @@ void RangeDelAggregator::AddToBuilder(
   if (bottommost_level) {
     // TODO(andrewkr): these are counted for each compaction output file, so
     // lots of double-counting.
-    if (!stripe_map_iter->second.empty()) {
+    if (!stripe_map_iter->second.raw_map.empty()) {
       range_del_out_stats->num_range_del_drop_obsolete +=
-          static_cast<int64_t>(stripe_map_iter->second.size()) -
+          static_cast<int64_t>(stripe_map_iter->second.raw_map.size()) -
           (collapse_deletions_ ? 1 : 0);
       range_del_out_stats->num_record_drop_obsolete +=
-          static_cast<int64_t>(stripe_map_iter->second.size()) -
+          static_cast<int64_t>(stripe_map_iter->second.raw_map.size()) -
           (collapse_deletions_ ? 1 : 0);
     }
     // For the bottommost level, keys covered by tombstones in the first
     // snapshot stripe
@@ -338,13 +414,13 @@ void RangeDelAggregator::AddToBuilder(
   // insert them into a std::map on the read path.
   bool first_added = false;
   while (stripe_map_iter != rep_->stripe_map_.end()) {
-    for (auto tombstone_map_iter = stripe_map_iter->second.begin();
-         tombstone_map_iter != stripe_map_iter->second.end();
+    for (auto tombstone_map_iter = stripe_map_iter->second.raw_map.begin();
+         tombstone_map_iter != stripe_map_iter->second.raw_map.end();
          ++tombstone_map_iter) {
       RangeTombstone tombstone;
       if (collapse_deletions_) {
         auto next_tombstone_map_iter = std::next(tombstone_map_iter);
-        if (next_tombstone_map_iter == stripe_map_iter->second.end()) {
+        if (next_tombstone_map_iter == stripe_map_iter->second.raw_map.end()) {
           // it's the sentinel tombstone
           break;
         }
@@ -432,7 +508,7 @@ bool RangeDelAggregator::IsEmpty() {
   }
   for (auto stripe_map_iter = rep_->stripe_map_.begin();
        stripe_map_iter != rep_->stripe_map_.end(); ++stripe_map_iter) {
-    if (!stripe_map_iter->second.empty()) {
+    if (!stripe_map_iter->second.raw_map.empty()) {
       return false;
     }
   }
diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h
index be5b3fa61..6d10ceee5 100644
--- a/db/range_del_aggregator.h
+++ b/db/range_del_aggregator.h
@@ -52,10 +52,31 @@ class RangeDelAggregator {
                      SequenceNumber upper_bound,
                      bool collapse_deletions = false);
 
+  // We maintain position in the tombstone map across calls to ShouldDelete().
+  // The caller may wish to specify a mode to optimize positioning the iterator
+  // during the next call to ShouldDelete(). The non-kFullScan modes are only
+  // available when deletion collapsing is enabled.
+  //
+  // For example, if we invoke Next() on an iterator, kForwardTraversal should
+  // be specified to advance one-by-one through deletions until one is found
+  // with its interval containing the key. This will typically be faster than
+  // doing a full binary search (kBinarySearch).
+  enum RangePositioningMode {
+    kFullScan,  // used iff collapse_deletions_ == false
+    kForwardTraversal,
+    kBackwardTraversal,
+    kBinarySearch,
+  };
+
   // Returns whether the key should be deleted, which is the case when it is
   // covered by a range tombstone residing in the same snapshot stripe.
-  bool ShouldDelete(const ParsedInternalKey& parsed);
-  bool ShouldDelete(const Slice& internal_key);
+  // @param mode If collapse_deletions_ is true, this dictates how we will find
+  //    the deletion whose interval contains this key. Otherwise, its
+  //    value must be kFullScan, indicating a linear scan from the beginning.
+  bool ShouldDelete(const ParsedInternalKey& parsed,
+                    RangePositioningMode mode = kFullScan);
+  bool ShouldDelete(const Slice& internal_key,
+                    RangePositioningMode mode = kFullScan);
   bool ShouldAddTombstones(bool bottommost_level = false);
 
   // Adds tombstones to the tombstone aggregation structure maintained by this
@@ -63,6 +84,13 @@ class RangeDelAggregator {
   // @return non-OK status if any of the tombstone keys are corrupted.
   Status AddTombstones(std::unique_ptr<InternalIterator> input);
 
+  // Resets iterators maintained across calls to ShouldDelete(). This may be
+  // called when the tombstones change, or the owner may call explicitly, e.g.,
+  // if it's an iterator that just seeked to an arbitrary position. The effect
+  // of invalidation is that the following call to ShouldDelete() will binary
+  // search for its tombstone.
+  void InvalidateTombstoneMapPositions();
+
   // Writes tombstones covering a range to a table builder.
   // @param extend_before_min_key If true, the range of tombstones to be added
   //    to the TableBuilder starts from the beginning of the key-range;
@@ -93,9 +121,23 @@ class RangeDelAggregator {
   // Maps tombstone user start key -> tombstone object
   typedef std::multimap<Slice, RangeTombstone, stl_wrappers::LessOfComparator>
       TombstoneMap;
+  // Also maintains position in TombstoneMap last seen by ShouldDelete(). The
+  // end iterator indicates invalidation (e.g., if AddTombstones() changes the
+  // underlying map). End iterator cannot be invalidated.
+  struct PositionalTombstoneMap {
+    explicit PositionalTombstoneMap(TombstoneMap _raw_map)
+        : raw_map(std::move(_raw_map)), iter(raw_map.end()) {}
+    PositionalTombstoneMap(const PositionalTombstoneMap&) = delete;
+    PositionalTombstoneMap(PositionalTombstoneMap&& other)
+        : raw_map(std::move(other.raw_map)), iter(raw_map.end()) {}
+
+    TombstoneMap raw_map;
+    TombstoneMap::const_iterator iter;
+  };
+
   // Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e.,
   // their seqnums are greater than the next smaller snapshot's seqnum.
-  typedef std::map<SequenceNumber, TombstoneMap> StripeMap;
+  typedef std::map<SequenceNumber, PositionalTombstoneMap> StripeMap;
 
   struct Rep {
     StripeMap stripe_map_;
@@ -106,7 +148,7 @@ class RangeDelAggregator {
   // once the first range deletion is encountered.
   void InitRep(const std::vector<SequenceNumber>& snapshots);
 
-  TombstoneMap& GetTombstoneMap(SequenceNumber seq);
+  PositionalTombstoneMap& GetPositionalTombstoneMap(SequenceNumber seq);
   Status AddTombstone(RangeTombstone tombstone);
 
   SequenceNumber upper_bound_;
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 975418946..83ecaaede 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -51,10 +51,14 @@ void VerifyRangeDels(const std::vector<RangeTombstone>& range_dels,
     parsed_key.user_key = expected_point.begin;
     parsed_key.sequence = expected_point.seq;
     parsed_key.type = kTypeValue;
-    ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key));
+    ASSERT_FALSE(range_del_agg.ShouldDelete(
+        parsed_key,
+        RangeDelAggregator::RangePositioningMode::kForwardTraversal));
     if (parsed_key.sequence > 0) {
       --parsed_key.sequence;
-      ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key));
+      ASSERT_TRUE(range_del_agg.ShouldDelete(
+          parsed_key,
+          RangeDelAggregator::RangePositioningMode::kForwardTraversal));
     }
   }
 }
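The invalidation scheme depends on one property of node-based maps: insert and
erase never invalidate std::map's end() iterator, so end() can double as the
"position unknown" sentinel. A standalone sketch of that pattern (illustrative
only, not code from this patch):

#include <cstdio>
#include <iterator>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> m{{"a", 1}, {"c", 3}};
  auto pos = m.cbegin();  // cached position, like PositionalTombstoneMap::iter
  m.emplace("b", 2);      // mutation: `pos` still dereferences fine, but the
                          // entry covering a given key may have changed
  pos = m.cend();         // reset, like InvalidateTombstoneMapPositions()

  // The next lookup notices the sentinel and re-seeks instead of trusting it.
  const std::string key = "b";
  if (pos == m.cend()) {
    auto it = m.upper_bound(key);
    pos = (it == m.cbegin()) ? m.cend() : std::prev(it);
  }
  if (pos != m.cend()) {
    std::printf("%s falls in the interval starting at %s\n", key.c_str(),
                pos->first.c_str());  // prints: b falls in ... starting at b
  }
  return 0;
}

The real class applies the same reset in InvalidateTombstoneMapPositions() and
falls back to kBinarySearch on the next ShouldDelete() call.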