6fbe96baf8
Summary: This diff introduces RangeDelAggregator, which takes ownership of iterators provided to it via AddTombstones(). The tombstones are organized in a two-level map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data copy by holding Slices returned by the iterator, which remain valid thanks to pinning. For compaction, we create a hierarchical range tombstone iterator with structure matching the iterator over compaction input data. An aggregator based on that iterator is used by CompactionIterator to determine which keys are covered by range tombstones. In case of merge operand, the same aggregator is used by MergeHelper. Upon finishing each file in the compaction, relevant range tombstones are added to the output file's range tombstone metablock and file boundaries are updated accordingly. To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete() considers tombstones in the key's snapshot stripe. When this function is used outside of compaction, it also checks newer stripes, which can contain covering tombstones. Currently the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges within a stripe such that binary search can be used. RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range to a new table's range tombstone meta-block. Since range tombstones may fall in the gap between files, we may need to extend some files' key-ranges. The strategy is (1) first file extends as far left as possible and other files do not extend left, (2) all files extend right until either the start of the next file or the end of the last range tombstone in the gap, whichever comes first. One other notable change is adding release/move semantics to ScopedArenaIterator such that it can be used to transfer ownership of an arena-allocated iterator, similar to how unique_ptr is used for malloc'd data. 
Depends on D61473 Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927 Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark Reviewed By: lightmark Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D62205
196 lines
7.8 KiB
C++
196 lines
7.8 KiB
C++
// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
#include "db/range_del_aggregator.h"
|
|
|
|
#include <algorithm>
|
|
|
|
namespace rocksdb {
|
|
|
|
// Builds one (initially empty) tombstone map per snapshot stripe, plus a
// catch-all stripe keyed by kMaxSequenceNumber, and starts iterator pinning.
//
// icmp:      internal key comparator; its user comparator orders tombstones
//            by start key within every stripe.
// snapshots: snapshot sequence numbers; each becomes a key of stripe_map_.
RangeDelAggregator::RangeDelAggregator(
    const InternalKeyComparator& icmp,
    const std::vector<SequenceNumber>& snapshots)
    : icmp_(icmp) {
  pinned_iters_mgr_.StartPinning();
  // Every stripe, including the catch-all one, must be ordered by the same
  // user comparator: the scans over a stripe stop early based on start-key
  // order, which is only valid if that order is the user comparator's.
  const stl_wrappers::LessOfComparator user_key_less(icmp_.user_comparator());
  for (auto snapshot : snapshots) {
    stripe_map_.emplace(snapshot, TombstoneMap(user_key_less));
  }
  // Data newer than any snapshot falls in this catch-all stripe
  stripe_map_.emplace(kMaxSequenceNumber, TombstoneMap(user_key_less));
}
|
|
|
|
bool RangeDelAggregator::ShouldDelete(const Slice& internal_key,
|
|
bool for_compaction /* = false */) {
|
|
ParsedInternalKey parsed;
|
|
if (!ParseInternalKey(internal_key, &parsed)) {
|
|
assert(false);
|
|
}
|
|
assert(IsValueType(parsed.type));
|
|
|
|
// Starting point is the snapshot stripe in which the key lives, then need to
|
|
// search all earlier stripes too, unless it's for compaction.
|
|
for (auto stripe_map_iter = GetStripeMapIter(parsed.sequence);
|
|
stripe_map_iter != stripe_map_.end(); ++stripe_map_iter) {
|
|
const auto& tombstone_map = stripe_map_iter->second;
|
|
for (const auto& start_key_and_tombstone : tombstone_map) {
|
|
const auto& tombstone = start_key_and_tombstone.second;
|
|
if (icmp_.user_comparator()->Compare(parsed.user_key,
|
|
tombstone.start_key_) < 0) {
|
|
break;
|
|
}
|
|
if (parsed.sequence < tombstone.seq_ &&
|
|
icmp_.user_comparator()->Compare(parsed.user_key,
|
|
tombstone.end_key_) <= 0) {
|
|
return true;
|
|
}
|
|
}
|
|
if (for_compaction) {
|
|
break;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RangeDelAggregator::ShouldAddTombstones(
|
|
bool bottommost_level /* = false */) {
|
|
auto stripe_map_iter = stripe_map_.begin();
|
|
assert(stripe_map_iter != stripe_map_.end());
|
|
if (bottommost_level) {
|
|
// For the bottommost level, keys covered by tombstones in the first
|
|
// (oldest) stripe have been compacted away, so the tombstones are obsolete.
|
|
++stripe_map_iter;
|
|
}
|
|
while (stripe_map_iter != stripe_map_.end()) {
|
|
if (!stripe_map_iter->second.empty()) {
|
|
return true;
|
|
}
|
|
++stripe_map_iter;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Releases the iterator from its ScopedArenaIterator wrapper and hands it to
// the raw-pointer overload, flagged as arena-allocated.
void RangeDelAggregator::AddTombstones(ScopedArenaIterator input) {
  const bool arena_allocated = true;
  InternalIterator* raw_iter = input.release();
  AddTombstones(raw_iter, arena_allocated);
}
|
|
|
|
void RangeDelAggregator::AddTombstones(
|
|
std::unique_ptr<InternalIterator> input) {
|
|
AddTombstones(input.release(), false /* arena */);
|
|
}
|
|
|
|
// Registers `input` with the pinned-iterators manager, then walks it from the
// first entry and files each range tombstone under the snapshot stripe that
// its sequence number belongs to, keyed by a copy of its start key.
//
// input: iterator over range tombstone entries (key/value pairs).
// arena: forwarded to PinIterator() together with `input`.
void RangeDelAggregator::AddTombstones(InternalIterator* input, bool arena) {
  pinned_iters_mgr_.PinIterator(input, arena);
  for (input->SeekToFirst(); input->Valid(); input->Next()) {
    RangeTombstone tombstone(input->key(), input->value());
    auto stripe = GetStripeMapIter(tombstone.seq_);
    auto start_key = tombstone.start_key_.ToString();
    // NOTE(review): if TombstoneMap is a unique-key map, a second tombstone
    // with the same start key in the same stripe is not inserted by emplace()
    // -- confirm against the TombstoneMap typedef.
    stripe->second.emplace(std::move(start_key), std::move(tombstone));
  }
}
|
|
|
|
// Locates the snapshot stripe that sequence number `seq` falls into.
//
// A stripe includes the seqnum of the snapshot above it and excludes the
// seqnum of the snapshot below it.
RangeDelAggregator::StripeMap::iterator RangeDelAggregator::GetStripeMapIter(
    SequenceNumber seq) {
  // upper_bound() is a strict comparison, so look up seq - 1 to make a stripe
  // whose key equals seq match; seq == 0 has no predecessor and maps to the
  // first stripe.
  StripeMap::iterator stripe =
      (seq > 0) ? stripe_map_.upper_bound(seq - 1) : stripe_map_.begin();
  // The catch-all stripe guarantees a match in either case.
  assert(stripe != stripe_map_.end());
  return stripe;
}
|
|
|
|
// TODO(andrewkr): We should implement an iterator over range tombstones in our
|
|
// map. It'd enable compaction to open tables on-demand, i.e., only once range
|
|
// tombstones are known to be available, without the code duplication we have
|
|
// in ShouldAddTombstones(). It'll also allow us to move the table-modifying
|
|
// code into more coherent places: CompactionJob and BuildTable().
|
|
// Writes the range tombstones relevant to one output table into `builder` and
// widens the table's key range and seqno range in `meta` to account for them.
//
// builder:               receives one entry per emitted tombstone.
// extend_before_min_key: when true, meta->smallest may be lowered to the
//                        smallest emitted tombstone start key; when false,
//                        tombstones ending before meta->smallest are skipped.
// next_table_min_key:    internal key of the smallest key in the following
//                        table, or nullptr if there is none. Tombstones that
//                        start after its user key are not emitted here, and
//                        meta->largest is capped at its user key.
// meta:                  smallest/largest keys and smallest/largest seqnos
//                        are updated in place.
// bottommost_level:      when true, the first (oldest) stripe is skipped.
void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
                                      bool extend_before_min_key,
                                      const Slice* next_table_min_key,
                                      FileMetaData* meta,
                                      bool bottommost_level /* = false */) {
  auto stripe_map_iter = stripe_map_.begin();
  assert(stripe_map_iter != stripe_map_.end());
  if (bottommost_level) {
    // For the bottommost level, keys covered by tombstones in the first
    // (oldest) stripe have been compacted away, so the tombstones are obsolete.
    ++stripe_map_iter;
  }

  // next_table_min_key is an internal key (it is compared with icmp_ and used
  // to build an InternalKey below), so extract its user key once up front.
  // Only the user-key portion may be given to the user comparator.
  ParsedInternalKey next_min_parsed;
  if (next_table_min_key != nullptr) {
    const bool parsed_ok =
        ParseInternalKey(*next_table_min_key, &next_min_parsed);
    assert(parsed_ok);
    (void)parsed_ok;
  }
  const auto* ucmp = icmp_.user_comparator();

  // Note the order in which tombstones are stored is insignificant since we
  // insert them into a std::map on the read path.
  for (; stripe_map_iter != stripe_map_.end(); ++stripe_map_iter) {
    for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
      const auto& tombstone = start_key_and_tombstone.second;
      if (next_table_min_key != nullptr &&
          ucmp->Compare(next_min_parsed.user_key, tombstone.start_key_) < 0) {
        // Tombstones starting after next_table_min_key only need to be included
        // in the next table.
        break;
      }
      if (!extend_before_min_key && meta->smallest.size() != 0 &&
          ucmp->Compare(tombstone.end_key_, meta->smallest.user_key()) < 0) {
        // Tombstones ending before this table's smallest key can conditionally
        // be excluded, e.g., when this table is a non-first compaction output,
        // we know such tombstones are included in the previous table. In that
        // case extend_before_min_key would be false.
        continue;
      }

      auto ikey_and_end_key = tombstone.Serialize();
      builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);

      // Stripes are ordered by sequence number, not by key, so a tombstone in
      // a later stripe can start before every tombstone emitted so far. Check
      // each emitted tombstone so meta->smallest covers all of them.
      if (extend_before_min_key &&
          (meta->smallest.size() == 0 ||
           icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) {
        meta->smallest = ikey_and_end_key.first;
      }

      auto end_ikey = tombstone.SerializeEndKey();
      if (meta->largest.size() == 0 ||
          icmp_.Compare(meta->largest, end_ikey) < 0) {
        if (next_table_min_key != nullptr &&
            icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) {
          // Pretend the largest key has the same user key as the min key in the
          // following table in order for files to appear key-space partitioned.
          // Choose highest seqnum so this file's largest comes before the next
          // file's smallest. The fake seqnum is OK because the read path's
          // file-picking code only considers the user key portion.
          //
          // Note Seek() also creates InternalKey with (user_key,
          // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
          // kTypeRangeDeletion (0xF), so the range tombstone comes before the
          // Seek() key in InternalKey's ordering. So Seek() will look in the
          // next file for the user key.
          meta->largest = InternalKey(next_min_parsed.user_key,
                                      kMaxSequenceNumber, kTypeRangeDeletion);
        } else {
          meta->largest = std::move(end_ikey);
        }
      }
      meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
      meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);
    }
  }
}
|
|
|
|
} // namespace rocksdb
|