rocksdb/db/range_del_aggregator_v2.cc
Abhishek Madan 85394a96ca Speed up range scans with range tombstones (#4677)
Summary:
Previously, every range tombstone iterator was seeked on every
ShouldDelete call, which quickly degraded performance for long range
scans. This PR improves performance by tracking iterator positions and
only advancing iterators when necessary.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4677

Differential Revision: D13205373

Pulled By: abhimadan

fbshipit-source-id: 80c199dace1e19362a4c61c686bf01913eae87cb
2018-11-26 16:33:41 -08:00

313 lines
11 KiB
C++

// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/range_del_aggregator_v2.h"
#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/range_del_aggregator.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/heap.h"
#include "util/kv_map.h"
#include "util/vector_iterator.h"
namespace rocksdb {
TruncatedRangeDelIterator::TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
const InternalKeyComparator* icmp, const InternalKey* smallest,
const InternalKey* largest)
: iter_(std::move(iter)), icmp_(icmp) {
if (smallest != nullptr) {
pinned_bounds_.emplace_back();
auto& parsed_smallest = pinned_bounds_.back();
if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) {
assert(false);
}
smallest_ = &parsed_smallest;
}
if (largest != nullptr) {
pinned_bounds_.emplace_back();
auto& parsed_largest = pinned_bounds_.back();
if (!ParseInternalKey(largest->Encode(), &parsed_largest)) {
assert(false);
}
if (parsed_largest.type == kTypeRangeDeletion &&
parsed_largest.sequence == kMaxSequenceNumber) {
// The file boundary has been artificially extended by a range tombstone.
// We do not need to adjust largest to properly truncate range
// tombstones that extend past the boundary.
} else if (parsed_largest.sequence == 0) {
// The largest key in the sstable has a sequence number of 0. Since we
// guarantee that no internal keys with the same user key and sequence
// number can exist in a DB, we know that the largest key in this sstable
// cannot exist as the smallest key in the next sstable. This further
// implies that no range tombstone in this sstable covers largest;
// otherwise, the file boundary would have been artificially extended.
//
// Therefore, we will never truncate a range tombstone at largest, so we
// can leave it unchanged.
} else {
// The same user key may straddle two sstable boundaries. To ensure that
// the truncated end key can cover the largest key in this sstable, reduce
// its sequence number by 1.
parsed_largest.sequence -= 1;
}
largest_ = &parsed_largest;
}
}
bool TruncatedRangeDelIterator::Valid() const {
return iter_->Valid() &&
(smallest_ == nullptr ||
icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
(largest_ == nullptr ||
icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
}
void TruncatedRangeDelIterator::Next() { iter_->TopNext(); }
void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); }
// NOTE: target is a user key
void TruncatedRangeDelIterator::Seek(const Slice& target) {
if (largest_ != nullptr &&
icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
kTypeRangeDeletion)) <= 0) {
iter_->Invalidate();
return;
}
iter_->Seek(target);
}
// NOTE: target is a user key
void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
if (smallest_ != nullptr &&
icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
*smallest_) < 0) {
iter_->Invalidate();
return;
}
iter_->SeekForPrev(target);
}
void TruncatedRangeDelIterator::SeekToFirst() { iter_->SeekToTopFirst(); }
void TruncatedRangeDelIterator::SeekToLast() { iter_->SeekToTopLast(); }
ForwardRangeDelIterator::ForwardRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters)
: icmp_(icmp),
iters_(iters),
unused_idx_(0),
active_seqnums_(SeqMaxComparator()),
active_iters_(EndKeyMinComparator(icmp)),
inactive_iters_(StartKeyMinComparator(icmp)) {}
bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
assert(iters_ != nullptr);
// Pick up previously unseen iterators.
for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end();
++it, ++unused_idx_) {
auto& iter = *it;
iter->Seek(parsed.user_key);
PushIter(iter.get(), parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move active iterators that end before parsed.
while (!active_iters_.empty() &&
icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
TruncatedRangeDelIterator* iter = PopActiveIter();
do {
iter->Next();
} while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move inactive iterators that start before parsed.
while (!inactive_iters_.empty() &&
icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
TruncatedRangeDelIterator* iter = PopInactiveIter();
while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
iter->Next();
}
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
return active_seqnums_.empty()
? false
: (*active_seqnums_.begin())->seq() > parsed.sequence;
}
void ForwardRangeDelIterator::Invalidate() {
unused_idx_ = 0;
active_iters_.clear();
active_seqnums_.clear();
inactive_iters_.clear();
}
ReverseRangeDelIterator::ReverseRangeDelIterator(
const InternalKeyComparator* icmp,
const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>* iters)
: icmp_(icmp),
iters_(iters),
unused_idx_(0),
active_seqnums_(SeqMaxComparator()),
active_iters_(StartKeyMaxComparator(icmp)),
inactive_iters_(EndKeyMaxComparator(icmp)) {}
bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
assert(iters_ != nullptr);
// Pick up previously unseen iterators.
for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end();
++it, ++unused_idx_) {
auto& iter = *it;
iter->SeekForPrev(parsed.user_key);
PushIter(iter.get(), parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move active iterators that start after parsed.
while (!active_iters_.empty() &&
icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
TruncatedRangeDelIterator* iter = PopActiveIter();
do {
iter->Prev();
} while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
// Move inactive iterators that end after parsed.
while (!inactive_iters_.empty() &&
icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
TruncatedRangeDelIterator* iter = PopInactiveIter();
while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
iter->Prev();
}
PushIter(iter, parsed);
assert(active_iters_.size() == active_seqnums_.size());
}
return active_seqnums_.empty()
? false
: (*active_seqnums_.begin())->seq() > parsed.sequence;
}
void ReverseRangeDelIterator::Invalidate() {
unused_idx_ = 0;
active_iters_.clear();
active_seqnums_.clear();
inactive_iters_.clear();
}
RangeDelAggregatorV2::RangeDelAggregatorV2(const InternalKeyComparator* icmp,
SequenceNumber upper_bound)
: icmp_(icmp),
upper_bound_(upper_bound),
forward_iter_(icmp, &iters_),
reverse_iter_(icmp, &iters_) {}
void RangeDelAggregatorV2::AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
const InternalKey* smallest, const InternalKey* largest) {
if (input_iter == nullptr || input_iter->empty()) {
return;
}
if (wrapped_range_del_agg != nullptr) {
wrapped_range_del_agg->AddTombstones(std::move(input_iter), smallest,
largest);
// TODO: this eats the status of the wrapped call; may want to propagate it
return;
}
iters_.emplace_back(new TruncatedRangeDelIterator(std::move(input_iter),
icmp_, smallest, largest));
}
void RangeDelAggregatorV2::AddUnfragmentedTombstones(
std::unique_ptr<InternalIterator> input_iter) {
assert(wrapped_range_del_agg == nullptr);
if (input_iter == nullptr) {
return;
}
pinned_fragments_.emplace_back(new FragmentedRangeTombstoneList(
std::move(input_iter), *icmp_, false /* one_time_use */));
auto fragmented_iter = new FragmentedRangeTombstoneIterator(
pinned_fragments_.back().get(), upper_bound_, *icmp_);
AddTombstones(
std::unique_ptr<FragmentedRangeTombstoneIterator>(fragmented_iter));
}
bool RangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed,
RangeDelPositioningMode mode) {
if (wrapped_range_del_agg != nullptr) {
return wrapped_range_del_agg->ShouldDelete(parsed, mode);
}
switch (mode) {
case RangeDelPositioningMode::kForwardTraversal:
reverse_iter_.Invalidate();
return forward_iter_.ShouldDelete(parsed);
case RangeDelPositioningMode::kBackwardTraversal:
forward_iter_.Invalidate();
return reverse_iter_.ShouldDelete(parsed);
default:
assert(false);
return false;
}
}
bool RangeDelAggregatorV2::IsRangeOverlapped(const Slice& start,
const Slice& end) {
assert(wrapped_range_del_agg == nullptr);
InvalidateRangeDelMapPositions();
// Set the internal start/end keys so that:
// - if start_ikey has the same user key and sequence number as the current
// end key, start_ikey will be considered greater; and
// - if end_ikey has the same user key and sequence number as the current
// start key, end_ikey will be considered greater.
ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
static_cast<ValueType>(0));
ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
for (auto& iter : iters_) {
bool checked_candidate_tombstones = false;
for (iter->SeekForPrev(start);
iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
iter->Next()) {
checked_candidate_tombstones = true;
if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
return true;
}
}
if (!checked_candidate_tombstones) {
// Do an additional check for when the end of the range is the begin key
// of a tombstone, which we missed earlier since SeekForPrev'ing to the
// start was invalid.
iter->SeekForPrev(end);
if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
return true;
}
}
}
return false;
}
} // namespace rocksdb