Consider subcompaction boundaries when updating file boundaries for range deletion
Summary: Adjusted AddToBuilder() to take lower_bound and upper_bound, which serve two purposes: (1) only range deletions overlapping the interval [lower_bound, upper_bound) are added to the output file, and (2) the output file's boundaries are not extended before lower_bound or after upper_bound. The computation of lower_bound/upper_bound considers both the subcompaction boundaries and the previous/next files within the subcompaction.

Test cases are here (level subcompactions: https://gist.github.com/ajkr/63c7eae3e9667c5ebdc0a7efb74ac332, universal subcompactions: https://gist.github.com/ajkr/5a62af77c4ebe4052a1955c496d51fdb) but can't be included in this diff since they depend on committing the API first. They fail before this change and pass after.

Closes https://github.com/facebook/rocksdb/pull/1501

Reviewed By: yhchiang

Differential Revision: D4171685

Pulled By: ajkr

fbshipit-source-id: ee99db8
parent 800e51553e
commit ec2f64794b
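As background for the diff below, here is a minimal standalone sketch of the two rules from the summary. It is illustrative only, not RocksDB code: the struct, the helper names (OverlapsBounds, ClampedBoundaries), and the use of bytewise-ordered std::string user keys are assumptions made for the example.

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>

// Illustrative model of a range tombstone over user keys.
struct RangeTombstone {
  std::string start_key;  // inclusive
  std::string end_key;    // exclusive
};

// Rule (1): a tombstone is added to the output file only if it overlaps
// [lower_bound, upper_bound). A null bound means the interval is unbounded
// on that side.
bool OverlapsBounds(const RangeTombstone& t, const std::string* lower_bound,
                    const std::string* upper_bound) {
  if (upper_bound != nullptr && *upper_bound <= t.start_key) {
    return false;  // starts at or after upper_bound: belongs to the next file
  }
  if (lower_bound != nullptr && t.end_key <= *lower_bound) {
    return false;  // ends at or before lower_bound: belongs to the previous file
  }
  return true;
}

// Rule (2): the file boundaries contributed by a tombstone are clamped so the
// output file never extends before lower_bound or after upper_bound.
std::pair<std::string, std::string> ClampedBoundaries(
    const RangeTombstone& t, const std::string* lower_bound,
    const std::string* upper_bound) {
  std::string smallest = t.start_key;
  if (lower_bound != nullptr) smallest = std::max(smallest, *lower_bound);
  std::string largest = t.end_key;
  if (upper_bound != nullptr) largest = std::min(largest, *upper_bound);
  return {smallest, largest};
}

int main() {
  RangeTombstone t{"c", "m"};
  std::string lo = "f", hi = "j";
  assert(OverlapsBounds(t, &lo, &hi));                   // [c, m) overlaps [f, j)
  auto bounds = ClampedBoundaries(t, &lo, &hi);
  assert(bounds.first == "f" && bounds.second == "j");   // clamped to the bounds
  return 0;
}

In this model, passing nullptr for both bounds (as BuildTable() does below for flushes) keeps every tombstone and leaves its boundaries unclamped, matching purposes (1) and (2) in the summary.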
@@ -146,8 +146,8 @@ Status BuildTable(
       }
     }
     // nullptr for table_{min,max} so all range tombstones will be flushed
-    range_del_agg->AddToBuilder(builder, true /* extend_before_min_key */,
-                                nullptr /* next_table_min_key*/, meta);
+    range_del_agg->AddToBuilder(builder, nullptr /* lower_bound */,
+                                nullptr /* upper_bound */, meta);

     // Finish and check for builder errors
     bool empty = builder->NumEntries() == 0;
@@ -972,14 +972,33 @@ Status CompactionJob::FinishCompactionOutputFile(
   Status s = input_status;
   auto meta = &sub_compact->current_output()->meta;
   if (s.ok()) {
-    // For the first output table, include range tombstones before the min key
-    // boundary. For subsequent output tables, this is unnecessary because we
-    // extend each file's max key boundary up until the next file's min key when
-    // range tombstones fall in the gap.
-    range_del_agg->AddToBuilder(
-        sub_compact->builder.get(),
-        sub_compact->outputs.size() == 1 /* extend_before_min_key */,
-        next_table_min_key, meta, bottommost_level_);
+    Slice lower_bound_guard, upper_bound_guard;
+    const Slice *lower_bound, *upper_bound;
+    if (sub_compact->outputs.size() == 1) {
+      // For the first output table, include range tombstones before the min key
+      // but after the subcompaction boundary.
+      lower_bound = sub_compact->start;
+    } else if (meta->smallest.size() > 0) {
+      // For subsequent output tables, only include range tombstones from min
+      // key onwards since the previous file was extended to contain range
+      // tombstones falling before min key.
+      lower_bound_guard = meta->smallest.user_key();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+    if (next_table_min_key != nullptr) {
+      // This isn't the last file in the subcompaction, so extend until the next
+      // file starts.
+      upper_bound_guard = ExtractUserKey(*next_table_min_key);
+      upper_bound = &upper_bound_guard;
+    } else {
+      // This is the last file in the subcompaction, so extend until the
+      // subcompaction ends.
+      upper_bound = sub_compact->end;
+    }
+    range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound,
+                                upper_bound, meta, bottommost_level_);
   }
   const uint64_t current_entries = sub_compact->builder->NumEntries();
   meta->marked_for_compaction = sub_compact->builder->NeedCompact();
@@ -113,11 +113,10 @@ RangeDelAggregator::TombstoneMap& RangeDelAggregator::GetTombstoneMap(
 // tombstones are known to be available, without the code duplication we have
 // in ShouldAddTombstones(). It'll also allow us to move the table-modifying
 // code into more coherent places: CompactionJob and BuildTable().
-void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
-                                      bool extend_before_min_key,
-                                      const Slice* next_table_min_key,
-                                      FileMetaData* meta,
-                                      bool bottommost_level /* = false */) {
+void RangeDelAggregator::AddToBuilder(
+    TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
+    FileMetaData* meta,
+    bool bottommost_level /* = false */) {
   auto stripe_map_iter = stripe_map_.begin();
   assert(stripe_map_iter != stripe_map_.end());
   if (bottommost_level) {
@@ -132,20 +131,20 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
   while (stripe_map_iter != stripe_map_.end()) {
     for (const auto& start_key_and_tombstone : stripe_map_iter->second) {
       const auto& tombstone = start_key_and_tombstone.second;
-      if (next_table_min_key != nullptr &&
-          icmp_.user_comparator()->Compare(*next_table_min_key,
-                                           tombstone.start_key_) < 0) {
-        // Tombstones starting after next_table_min_key only need to be included
-        // in the next table.
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           tombstone.start_key_) <= 0) {
+        // Tombstones starting at upper_bound or later only need to be included
+        // in the next table. Break because subsequent tombstones will start
+        // even later.
         break;
       }
-      if (!extend_before_min_key && meta->smallest.size() != 0 &&
+      if (lower_bound != nullptr &&
           icmp_.user_comparator()->Compare(tombstone.end_key_,
-                                           meta->smallest.user_key()) < 0) {
-        // Tombstones ending before this table's smallest key can conditionally
-        // be excluded, e.g., when this table is a non-first compaction output,
-        // we know such tombstones are included in the previous table. In that
-        // case extend_before_min_key would be false.
+                                           *lower_bound) <= 0) {
+        // Tombstones ending before or at lower_bound only need to be included
+        // in the prev table. Continue because subsequent tombstones may still
+        // overlap [lower_bound, upper_bound).
         continue;
       }
@@ -153,35 +152,49 @@ void RangeDelAggregator::AddToBuilder(TableBuilder* builder,
       builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
       if (!first_added) {
         first_added = true;
-        if (extend_before_min_key &&
-            (meta->smallest.size() == 0 ||
-             icmp_.Compare(ikey_and_end_key.first, meta->smallest) < 0)) {
-          meta->smallest = ikey_and_end_key.first;
+        InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
+        if (lower_bound != nullptr &&
+            icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
+                                             *lower_bound) <= 0) {
+          // Pretend the smallest key has the same user key as lower_bound
+          // (the max key in the previous table or subcompaction) in order for
+          // files to appear key-space partitioned.
+          //
+          // Choose lowest seqnum so this file's smallest internal key comes
+          // after the previous file's/subcompaction's largest. The fake seqnum
+          // is OK because the read path's file-picking code only considers user
+          // key.
+          smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+        }
+        if (meta->smallest.size() == 0 ||
+            icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
+          meta->smallest = std::move(smallest_candidate);
+        }
       }
-      auto end_ikey = tombstone.SerializeEndKey();
+      InternalKey largest_candidate = tombstone.SerializeEndKey();
+      if (upper_bound != nullptr &&
+          icmp_.user_comparator()->Compare(*upper_bound,
+                                           largest_candidate.user_key()) <= 0) {
+        // Pretend the largest key has the same user key as upper_bound (the
+        // min key in the following table or subcompaction) in order for files
+        // to appear key-space partitioned.
+        //
+        // Choose highest seqnum so this file's largest internal key comes
+        // before the next file's/subcompaction's smallest. The fake seqnum is
+        // OK because the read path's file-picking code only considers the user
+        // key portion.
+        //
+        // Note Seek() also creates InternalKey with (user_key,
+        // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+        // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+        // Seek() key in InternalKey's ordering. So Seek() will look in the
+        // next file for the user key.
+        largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber,
+                                        kTypeRangeDeletion);
+      }
       if (meta->largest.size() == 0 ||
-          icmp_.Compare(meta->largest, end_ikey) < 0) {
-        if (next_table_min_key != nullptr &&
-            icmp_.Compare(*next_table_min_key, end_ikey.Encode()) < 0) {
-          // Pretend the largest key has the same user key as the min key in the
-          // following table in order for files to appear key-space partitioned.
-          // Choose highest seqnum so this file's largest comes before the next
-          // file's smallest. The fake seqnum is OK because the read path's
-          // file-picking code only considers the user key portion.
-          //
-          // Note Seek() also creates InternalKey with (user_key,
-          // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
-          // kTypeRangeDeletion (0xF), so the range tombstone comes before the
-          // Seek() key in InternalKey's ordering. So Seek() will look in the
-          // next file for the user key.
-          ParsedInternalKey parsed;
-          ParseInternalKey(*next_table_min_key, &parsed);
-          meta->largest = InternalKey(parsed.user_key, kMaxSequenceNumber,
-                                      kTypeRangeDeletion);
-        } else {
-          meta->largest = std::move(end_ikey);
-        }
+          icmp_.Compare(meta->largest, largest_candidate) < 0) {
+        meta->largest = std::move(largest_candidate);
       }
       meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_);
       meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_);
@@ -56,19 +56,24 @@ class RangeDelAggregator {
-  // @param extend_before_min_key If true, the range of tombstones to be added
-  //    to the TableBuilder starts from the beginning of the key-range;
-  //    otherwise, it starts from meta->smallest.
-  // @param next_table_min_key If nullptr, the range of tombstones to be added
-  //    to the TableBuilder ends at the end of the key-range; otherwise, it
-  //    ends at next_table_min_key.
+  // @param lower_bound/upper_bound Any range deletion with [start_key, end_key)
+  //    that overlaps the target range [*lower_bound, *upper_bound) is added to
+  //    the builder. If lower_bound is nullptr, the target range extends
+  //    infinitely to the left. If upper_bound is nullptr, the target range
+  //    extends infinitely to the right. If both are nullptr, the target range
+  //    extends infinitely in both directions, i.e., all range deletions are
+  //    added to the builder.
   // @param meta The file's metadata. We modify the begin and end keys according
   //    to the range tombstones added to this file such that the read path does
   //    not miss range tombstones that cover gaps before/after/between files in
-  //    a level.
+  //    a level. lower_bound/upper_bound above constrain how far file boundaries
+  //    can be extended.
   // @param bottommost_level If true, we will filter out any tombstones
   //    belonging to the oldest snapshot stripe, because all keys potentially
   //    covered by this tombstone are guaranteed to have been deleted by
   //    compaction.
-  void AddToBuilder(TableBuilder* builder, bool extend_before_min_key,
-                    const Slice* next_table_min_key, FileMetaData* meta,
+  void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
+                    const Slice* upper_bound, FileMetaData* meta,
                     bool bottommost_level = false);
   Arena* GetArena() { return &arena_; }
   bool IsEmpty();
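The "fake seqnum" comments in the range_del_aggregator hunk above can be sanity-checked with a small standalone model of the internal key ordering (user key ascending, then sequence number descending, then value type descending). The constants and the Before() comparator below are simplified assumptions for illustration, not RocksDB's real InternalKeyComparator.

#include <cassert>
#include <cstdint>
#include <string>
#include <tuple>

// Simplified stand-ins for the value type tags mentioned in the diff comments.
constexpr uint8_t kTypeRangeDeletion = 0xF;
constexpr uint8_t kSeekKeyType = 0x7;  // type used by Seek() keys per the comment
constexpr uint64_t kMaxSequenceNumber = (1ull << 56) - 1;

struct ModelInternalKey {
  std::string user_key;
  uint64_t seq;
  uint8_t type;
};

// Internal key ordering: user key ascending, then (seq, type) descending.
bool Before(const ModelInternalKey& a, const ModelInternalKey& b) {
  if (a.user_key != b.user_key) {
    return a.user_key < b.user_key;
  }
  return std::tie(a.seq, a.type) > std::tie(b.seq, b.type);
}

int main() {
  // Fake largest boundary: (upper_bound, kMaxSequenceNumber, kTypeRangeDeletion).
  // A Seek(upper_bound) key shares the user key and seqnum but has a smaller
  // type, so the file's largest sorts before it and the read path moves on to
  // the next file for that user key.
  ModelInternalKey file_largest{"j", kMaxSequenceNumber, kTypeRangeDeletion};
  ModelInternalKey seek_key{"j", kMaxSequenceNumber, kSeekKeyType};
  assert(Before(file_largest, seek_key));

  // Fake smallest boundary: (lower_bound, 0, kTypeRangeDeletion), which sorts
  // after any real key with the same user key in the previous file or
  // subcompaction, keeping the files key-space partitioned.
  ModelInternalKey file_smallest{"f", 0, kTypeRangeDeletion};
  ModelInternalKey prev_file_largest{"f", 5, kTypeRangeDeletion};
  assert(Before(prev_file_largest, file_smallest));
  return 0;
}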