Relax VersionStorageInfo::GetOverlappingInputs check (#4050)

Summary:
Do not consider the range tombstone sentinel key as causing 2 adjacent
sstables in a level to overlap. When a range tombstone's end key is the
largest key in an sstable, the sstable's end key is so to a "sentinel"
value that is the smallest key in the next sstable with a sequence
number of kMaxSequenceNumber. This "sentinel" is guaranteed to not
overlap in internal-key space with the next sstable. Unfortunately,
GetOverlappingFiles uses user-keys to determine overlap and was thus
considering 2 adjacent sstables in a level to overlap if they were
separated by this sentinel key. This in turn would cause compactions to
be larger than necessary.

Note that this conflicts with
https://github.com/facebook/rocksdb/pull/2769 and cases
`DBRangeDelTest.CompactionTreatsSplitInputLevelDeletionAtomically` to
fail.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4050

Differential Revision: D8844423

Pulled By: ajkr

fbshipit-source-id: df3f9f1db8f4cff2bff77376b98b83c2ae1d155b
This commit is contained in:
Peter Mattis 2018-07-13 17:34:54 -07:00 committed by Facebook Github Bot
parent 21171615c1
commit 90fc40690a
15 changed files with 396 additions and 152 deletions

View File

@ -204,7 +204,7 @@ Status BuildTable(
// we will regrad this verification as user reads since the goal is
// to cache it here for further user reads
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
ReadOptions(), env_options, internal_comparator, meta->fd,
ReadOptions(), env_options, internal_comparator, *meta,
nullptr /* range_del_agg */,
mutable_cf_options.prefix_extractor.get(), nullptr,
(internal_stats == nullptr) ? nullptr

View File

@ -635,7 +635,7 @@ Status CompactionJob::Run() {
// to cache it here for further user reads
InternalIterator* iter = cfd->table_cache()->NewIterator(
ReadOptions(), env_options_, cfd->internal_comparator(),
files_meta[file_idx]->fd, nullptr /* range_del_agg */,
*files_meta[file_idx], nullptr /* range_del_agg */,
prefix_extractor, nullptr,
cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()),

View File

@ -916,11 +916,14 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) {
}
TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
// make sure compaction treats files containing a split range deletion in the
// input level as an atomic unit. I.e., compacting any input-level file(s)
// containing a portion of the range deletion causes all other input-level
// files containing portions of that same range deletion to be included in the
// compaction.
// This test originally verified that compaction treated files containing a
// split range deletion in the input level as an atomic unit. I.e.,
// compacting any input-level file(s) containing a portion of the range
// deletion causes all other input-level files containing portions of that
// same range deletion to be included in the compaction. Range deletion
// tombstones are now truncated to sstable boundaries which removed the need
// for that behavior (which could lead to excessively large
// compactions).
const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
Options options = CurrentOptions();
options.compression = kNoCompression;
@ -967,22 +970,111 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
if (i == 0) {
ASSERT_OK(db_->CompactFiles(
CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
ASSERT_EQ(0, NumTableFilesAtLevel(1));
} else if (i == 1) {
auto begin_str = Key(0), end_str = Key(1);
Slice begin = begin_str, end = end_str;
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
ASSERT_EQ(3, NumTableFilesAtLevel(1));
} else if (i == 2) {
ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
{{"max_bytes_for_level_base", "10000"}}));
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(1, NumTableFilesAtLevel(1));
}
ASSERT_EQ(0, NumTableFilesAtLevel(1));
ASSERT_GT(NumTableFilesAtLevel(2), 0);
db_->ReleaseSnapshot(snapshot);
}
}
TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
// Test the handling of the range-tombstone end-key as the
// upper-bound for an sstable.
const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
Options options = CurrentOptions();
options.compression = kNoCompression;
options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
options.memtable_factory.reset(
new SpecialSkipListFactory(2 /* num_entries_flush */));
options.target_file_size_base = kValueBytes;
options.disable_auto_compactions = true;
DestroyAndReopen(options);
// Create an initial sstable at L2:
// [key000000#1,1, key000000#1,1]
ASSERT_OK(Put(Key(0), ""));
ASSERT_OK(db_->Flush(FlushOptions()));
MoveFilesToLevel(2);
ASSERT_EQ(1, NumTableFilesAtLevel(2));
// A snapshot protects the range tombstone from dropping due to
// becoming obsolete.
const Snapshot* snapshot = db_->GetSnapshot();
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
Key(0), Key(2 * kNumFilesPerLevel));
// Create 2 additional sstables in L0. Note that the first sstable
// contains the range tombstone.
// [key000000#3,1, key000004#72057594037927935,15]
// [key000001#5,1, key000002#6,1]
Random rnd(301);
std::string value = RandomString(&rnd, kValueBytes);
for (int j = 0; j < kNumFilesPerLevel; ++j) {
// Give files overlapping key-ranges to prevent a trivial move when we
// compact from L0 to L1.
ASSERT_OK(Put(Key(j), value));
ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
ASSERT_OK(db_->Flush(FlushOptions()));
ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
}
// Compact the 2 L0 sstables to L1, resulting in the following LSM. There
// are 2 sstables generated in L1 due to the target_file_size_base setting.
// L1:
// [key000000#3,1, key000002#72057594037927935,15]
// [key000002#6,1, key000004#72057594037927935,15]
// L2:
// [key000000#1,1, key000000#1,1]
MoveFilesToLevel(1);
ASSERT_EQ(2, NumTableFilesAtLevel(1));
{
// Compact the second sstable in L1:
// L1:
// [key000000#3,1, key000002#72057594037927935,15]
// L2:
// [key000000#1,1, key000000#1,1]
// [key000002#6,1, key000004#72057594037927935,15]
auto begin_str = Key(3);
const rocksdb::Slice begin = begin_str;
dbfull()->TEST_CompactRange(1, &begin, nullptr);
ASSERT_EQ(1, NumTableFilesAtLevel(1));
ASSERT_EQ(2, NumTableFilesAtLevel(2));
}
{
// Compact the first sstable in L1. This should be copacetic, but
// was previously resulting in overlapping sstables in L2 due to
// mishandling of the range tombstone end-key when used as the
// largest key for an sstable. The resulting LSM structure should
// be:
//
// L2:
// [key000000#1,1, key000001#72057594037927935,15]
// [key000001#5,1, key000002#72057594037927935,15]
// [key000002#6,1, key000004#72057594037927935,15]
auto begin_str = Key(0);
const rocksdb::Slice begin = begin_str;
dbfull()->TEST_CompactRange(1, &begin, &begin);
ASSERT_EQ(0, NumTableFilesAtLevel(1));
ASSERT_EQ(3, NumTableFilesAtLevel(2));
}
db_->ReleaseSnapshot(snapshot);
}
TEST_F(DBRangeDelTest, UnorderedTombstones) {
// Regression test for #2752. Range delete tombstones between
// different snapshot stripes are not stored in order, so the first

View File

@ -140,10 +140,14 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
return Slice(internal_key.data(), internal_key.size() - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
assert(internal_key.size() >= 8);
const size_t n = internal_key.size();
uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
return DecodeFixed64(internal_key.data() + n - 8);
}
inline ValueType ExtractValueType(const Slice& internal_key) {
uint64_t num = ExtractInternalKeyFooter(internal_key);
unsigned char c = num & 0xff;
return static_cast<ValueType>(c);
}
@ -606,9 +610,15 @@ struct RangeTombstone {
return InternalKey(start_key_, seq_, kTypeRangeDeletion);
}
// The tombstone end-key is exclusive, so we generate an internal-key here
// which has a similar property. Using kMaxSequenceNumber guarantees that
// the returned internal-key will compare less than any other internal-key
// with the same user-key. This in turn guarantees that the serialized
// end-key for a tombstone such as [a-b] will compare less than the key "b".
//
// be careful to use SerializeEndKey(), allocates new memory
InternalKey SerializeEndKey() const {
return InternalKey(end_key_, seq_, kTypeRangeDeletion);
return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
}
};

View File

@ -192,6 +192,13 @@ TEST_F(FormatTest, UpdateInternalKey) {
ASSERT_EQ(new_val_type, decoded.type);
}
TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
RangeTombstone t("a", "b", 2);
InternalKey k("b", 3, kTypeValue);
const InternalKeyComparator cmp(BytewiseComparator());
ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
}
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -75,7 +75,7 @@ class ForwardLevelIterator : public InternalIterator {
cfd_->internal_comparator(), {} /* snapshots */);
file_iter_ = cfd_->table_cache()->NewIterator(
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
files_[file_index_]->fd,
*files_[file_index_],
read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false);
file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
@ -635,7 +635,7 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
continue;
}
l0_iters_.push_back(cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd,
read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
sv_->mutable_cf_options.prefix_extractor.get()));
}
@ -706,7 +706,7 @@ void ForwardIterator::RenewIterators() {
}
l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
l0_files_new[inew]->fd,
*l0_files_new[inew],
read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
svnew->mutable_cf_options.prefix_extractor.get()));
}
@ -766,7 +766,7 @@ void ForwardIterator::ResetIncompleteIterators() {
DeleteIterator(l0_iters_[i]);
l0_iters_[i] = cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
l0_files[i]->fd, nullptr /* range_del_agg */,
*l0_files[i], nullptr /* range_del_agg */,
sv_->mutable_cf_options.prefix_extractor.get());
l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
}

View File

@ -426,7 +426,9 @@ bool RangeDelAggregator::IsRangeOverlapped(const Slice& start,
}
Status RangeDelAggregator::AddTombstones(
std::unique_ptr<InternalIterator> input) {
std::unique_ptr<InternalIterator> input,
const InternalKey* smallest,
const InternalKey* largest) {
if (input == nullptr) {
return Status::OK();
}
@ -450,6 +452,29 @@ Status RangeDelAggregator::AddTombstones(
return Status::Corruption("Unable to parse range tombstone InternalKey");
}
RangeTombstone tombstone(parsed_key, input->value());
// Truncate the tombstone to the range [smallest, largest].
if (smallest != nullptr) {
if (icmp_.user_comparator()->Compare(
tombstone.start_key_, smallest->user_key()) < 0) {
tombstone.start_key_ = smallest->user_key();
}
}
if (largest != nullptr) {
// This is subtly correct despite the discrepancy between
// FileMetaData::largest being inclusive while RangeTombstone::end_key_
// is exclusive. A tombstone will only extend past the bounds of an
// sstable if its end-key is the largest key in the table. If that
// occurs, the largest key for the table is set based on the smallest
// key in the next table in the level. In that case, largest->user_key()
// is not actually a key in the current table and thus we can use it as
// the exclusive end-key for the tombstone.
if (icmp_.user_comparator()->Compare(
tombstone.end_key_, largest->user_key()) > 0) {
// The largest key should be a tombstone sentinel key.
assert(GetInternalKeySeqno(largest->Encode()) == kMaxSequenceNumber);
tombstone.end_key_ = largest->user_key();
}
}
GetRangeDelMap(tombstone.seq_).AddTombstone(std::move(tombstone));
input->Next();
}

View File

@ -137,9 +137,15 @@ class RangeDelAggregator {
bool IsRangeOverlapped(const Slice& start, const Slice& end);
// Adds tombstones to the tombstone aggregation structure maintained by this
// object.
// object. Tombstones are truncated to smallest and largest. If smallest (or
// largest) is null, it is not used for truncation. When adding range
// tombstones present in an sstable, smallest and largest should be set to
// the smallest and largest keys from the sstable file metadata. Note that
// tombstones end keys are exclusive while largest is inclusive.
// @return non-OK status if any of the tombstone keys are corrupted.
Status AddTombstones(std::unique_ptr<InternalIterator> input);
Status AddTombstones(std::unique_ptr<InternalIterator> input,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr);
// Resets iterators maintained across calls to ShouldDelete(). This may be
// called when the tombstones change, or the owner may call explicitly, e.g.,

View File

@ -19,6 +19,7 @@ namespace {
struct ExpectedPoint {
Slice begin;
SequenceNumber seq;
bool expectAlive;
};
enum Direction {
@ -29,7 +30,9 @@ enum Direction {
static auto icmp = InternalKeyComparator(BytewiseComparator());
void AddTombstones(RangeDelAggregator* range_del_agg,
const std::vector<RangeTombstone>& range_dels) {
const std::vector<RangeTombstone>& range_dels,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr) {
std::vector<std::string> keys, values;
for (const auto& range_del : range_dels) {
auto key_and_value = range_del.Serialize();
@ -38,7 +41,7 @@ void AddTombstones(RangeDelAggregator* range_del_agg,
}
std::unique_ptr<test::VectorIterator> range_del_iter(
new test::VectorIterator(keys, values));
range_del_agg->AddTombstones(std::move(range_del_iter));
range_del_agg->AddTombstones(std::move(range_del_iter), smallest, largest);
}
void VerifyTombstonesEq(const RangeTombstone& a, const RangeTombstone& b) {
@ -62,7 +65,9 @@ void VerifyRangeDelIter(
void VerifyRangeDels(
const std::vector<RangeTombstone>& range_dels_in,
const std::vector<ExpectedPoint>& expected_points,
const std::vector<RangeTombstone>& expected_collapsed_range_dels) {
const std::vector<RangeTombstone>& expected_collapsed_range_dels,
const InternalKey* smallest = nullptr,
const InternalKey* largest = nullptr) {
// Test same result regardless of which order the range deletions are added
// and regardless of collapsed mode.
for (bool collapsed : {false, true}) {
@ -73,7 +78,7 @@ void VerifyRangeDels(
if (dir == kReverse) {
std::reverse(range_dels.begin(), range_dels.end());
}
AddTombstones(&range_del_agg, range_dels);
AddTombstones(&range_del_agg, range_dels, smallest, largest);
auto mode = RangeDelPositioningMode::kFullScan;
if (collapsed) {
@ -88,22 +93,29 @@ void VerifyRangeDels(
ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
if (parsed_key.sequence > 0) {
--parsed_key.sequence;
ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
if (expected_point.expectAlive) {
ASSERT_FALSE(range_del_agg.ShouldDelete(parsed_key, mode));
} else {
ASSERT_TRUE(range_del_agg.ShouldDelete(parsed_key, mode));
}
}
}
if (collapsed) {
range_dels = expected_collapsed_range_dels;
} else {
// Tombstones in an uncollapsed map are presented in start key order.
// Tombstones with the same start key are presented in insertion order.
VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
} else if (smallest == nullptr && largest == nullptr) {
// Tombstones in an uncollapsed map are presented in start key
// order. Tombstones with the same start key are presented in
// insertion order. We don't handle tombstone truncation here, so the
// verification is only performed if no truncation was requested.
std::stable_sort(range_dels.begin(), range_dels.end(),
[&](const RangeTombstone& a, const RangeTombstone& b) {
return icmp.user_comparator()->Compare(
a.start_key_, b.start_key_) < 0;
});
VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
}
VerifyRangeDelIter(range_del_agg.NewIterator().get(), range_dels);
}
}
@ -275,6 +287,19 @@ TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) {
{{"c", "d", 20}, {"e", "f", 20}, {"f", "g", 10}});
}
TEST_F(RangeDelAggregatorTest, TruncateTombstones) {
const InternalKey smallest("b", 1, kTypeRangeDeletion);
const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion);
VerifyRangeDels(
{{"a", "c", 10}, {"d", "f", 10}},
{{"a", 10, true}, // truncated
{"b", 10, false}, // not truncated
{"d", 10, false}, // not truncated
{"e", 10, true}}, // truncated
{{"b", "c", 10}, {"d", "e", 10}},
&smallest, &largest);
}
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -500,7 +500,7 @@ class Repairer {
}
if (status.ok()) {
InternalIterator* iter = table_cache_->NewIterator(
ReadOptions(), env_options_, cfd->internal_comparator(), t->meta.fd,
ReadOptions(), env_options_, cfd->internal_comparator(), t->meta,
nullptr /* range_del_agg */,
cfd->GetLatestMutableCFOptions()->prefix_extractor.get());
bool empty = true;

View File

@ -180,7 +180,7 @@ Status TableCache::FindTable(const EnvOptions& env_options,
InternalIterator* TableCache::NewIterator(
const ReadOptions& options, const EnvOptions& env_options,
const InternalKeyComparator& icomparator, const FileDescriptor& fd,
const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor,
TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
bool for_compaction, Arena* arena, bool skip_filters, int level) {
@ -211,6 +211,7 @@ InternalIterator* TableCache::NewIterator(
create_new_table_reader = readahead > 0;
}
auto& fd = file_meta.fd;
if (create_new_table_reader) {
unique_ptr<TableReader> table_reader_unique_ptr;
s = GetTableReader(
@ -265,7 +266,10 @@ InternalIterator* TableCache::NewIterator(
s = range_del_iter->status();
}
if (s.ok()) {
s = range_del_agg->AddTombstones(std::move(range_del_iter));
s = range_del_agg->AddTombstones(
std::move(range_del_iter),
&file_meta.smallest,
&file_meta.largest);
}
}
}
@ -280,51 +284,14 @@ InternalIterator* TableCache::NewIterator(
return result;
}
InternalIterator* TableCache::NewRangeTombstoneIterator(
const ReadOptions& options, const EnvOptions& env_options,
const InternalKeyComparator& icomparator, const FileDescriptor& fd,
HistogramImpl* file_read_hist, bool skip_filters, int level,
const SliceTransform* prefix_extractor) {
Status s;
Cache::Handle* handle = nullptr;
TableReader* table_reader = fd.table_reader;
if (table_reader == nullptr) {
s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor,
options.read_tier == kBlockCacheTier /* no_io */,
true /* record read_stats */, file_read_hist, skip_filters,
level);
if (s.ok()) {
table_reader = GetTableReaderFromHandle(handle);
}
}
InternalIterator* result = nullptr;
if (s.ok()) {
result = table_reader->NewRangeTombstoneIterator(options);
if (result != nullptr) {
if (handle != nullptr) {
result->RegisterCleanup(&UnrefEntry, cache_, handle);
}
}
}
if (result == nullptr && handle != nullptr) {
// the range deletion block didn't exist, or there was a failure between
// getting handle and getting iterator.
ReleaseHandle(handle);
}
if (!s.ok()) {
assert(result == nullptr);
result = NewErrorInternalIterator(s);
}
return result;
}
Status TableCache::Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, const Slice& k,
const FileMetaData& file_meta, const Slice& k,
GetContext* get_context,
const SliceTransform* prefix_extractor,
HistogramImpl* file_read_hist, bool skip_filters,
int level) {
auto& fd = file_meta.fd;
std::string* row_cache_entry = nullptr;
bool done = false;
#ifndef ROCKSDB_LITE
@ -405,7 +372,9 @@ Status TableCache::Get(const ReadOptions& options,
}
if (s.ok()) {
s = get_context->range_del_agg()->AddTombstones(
std::move(range_del_iter));
std::move(range_del_iter),
&file_meta.smallest,
&file_meta.largest);
}
}
if (s.ok()) {

View File

@ -53,19 +53,12 @@ class TableCache {
InternalIterator* NewIterator(
const ReadOptions& options, const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& file_fd, RangeDelAggregator* range_del_agg,
const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
const SliceTransform* prefix_extractor = nullptr,
TableReader** table_reader_ptr = nullptr,
HistogramImpl* file_read_hist = nullptr, bool for_compaction = false,
Arena* arena = nullptr, bool skip_filters = false, int level = -1);
InternalIterator* NewRangeTombstoneIterator(
const ReadOptions& options, const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& file_fd, HistogramImpl* file_read_hist,
bool skip_filters, int level,
const SliceTransform* prefix_extractor = nullptr);
// If a seek to internal key "k" in specified file finds an entry,
// call (*handle_result)(arg, found_key, found_value) repeatedly until
// it returns false.
@ -76,7 +69,7 @@ class TableCache {
// @param level The level this table is at, -1 for "not set / don't know"
Status Get(const ReadOptions& options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& file_fd, const Slice& k,
const FileMetaData& file_meta, const Slice& k,
GetContext* get_context,
const SliceTransform* prefix_extractor = nullptr,
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,

View File

@ -548,8 +548,9 @@ class LevelIterator final : public InternalIterator {
}
return table_cache_->NewIterator(
read_options_, env_options_, icomparator_, file_meta.fd, range_del_agg_,
prefix_extractor_, nullptr /* don't need reference to table */,
read_options_, env_options_, icomparator_, *file_meta.file_metadata,
range_del_agg_, prefix_extractor_,
nullptr /* don't need reference to table */,
file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_,
level_);
}
@ -1000,7 +1001,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
const auto& file = storage_info_.LevelFilesBrief(0).files[i];
merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
read_options, soptions, cfd_->internal_comparator(), file.fd,
read_options, soptions, cfd_->internal_comparator(), *file.file_metadata,
range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr,
cfd_->internal_stats()->GetFileReadHist(0), false, arena,
false /* skip_filters */, 0 /* level */));
@ -1053,7 +1054,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
continue;
}
ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
read_options, env_options, cfd_->internal_comparator(), file->fd,
read_options, env_options, cfd_->internal_comparator(), *file->file_metadata,
&range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr,
cfd_->internal_stats()->GetFileReadHist(0), false, &arena,
false /* skip_filters */, 0 /* level */));
@ -1198,8 +1199,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
}
*status = table_cache_->Get(
read_options, *internal_comparator(), f->fd, ikey, &get_context,
mutable_cf_options_.prefix_extractor.get(),
read_options, *internal_comparator(), *f->file_metadata, ikey,
&get_context, mutable_cf_options_.prefix_extractor.get(),
cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
fp.IsHitFileLastInLevel()),
@ -2053,8 +2054,8 @@ void VersionStorageInfo::GetOverlappingInputs(
*file_index = -1;
}
const Comparator* user_cmp = user_comparator_;
if (begin != nullptr && end != nullptr && level > 0) {
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
if (level > 0) {
GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
hint_index, file_index);
return;
}
@ -2109,24 +2110,73 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
return;
}
Slice user_begin, user_end;
const auto& level_files = level_files_brief_[level];
if (begin == nullptr) {
user_begin = ExtractUserKey(level_files.files[0].smallest_key);
} else {
user_begin = begin->user_key();
begin = &level_files.files[0].file_metadata->smallest;
}
if (end == nullptr) {
user_end = ExtractUserKey(
level_files.files[level_files.num_files - 1].largest_key);
} else {
user_end = end->user_key();
end = &level_files.files[level_files.num_files - 1].file_metadata->largest;
}
GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
hint_index, file_index,
true /* within_interval */);
}
namespace {
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
// that the comparison is performed primarily on the user-key portion of the
// key. If the user-keys compare equal, an additional test is made to sort
// range tombstone sentinel keys before other keys with the same user-key. The
// result is that 2 user-keys will compare equal if they differ purely on
// their sequence number and value, but the range tombstone sentinel for that
// user-key will compare not equal. This is necessary because the range
// tombstone sentinel key is set as the largest key for an sstable even though
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey& a, const InternalKey& b) {
auto c = user_cmp->Compare(a.user_key(), b.user_key());
if (c != 0) {
return c;
}
auto a_footer = ExtractInternalKeyFooter(a.Encode());
auto b_footer = ExtractInternalKeyFooter(b.Encode());
if (a_footer == kRangeTombstoneSentinel) {
if (b_footer != kRangeTombstoneSentinel) {
return -1;
}
} else if (b_footer == kRangeTombstoneSentinel) {
return 1;
}
return 0;
}
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey* a, const InternalKey& b) {
if (a == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, *a, b);
}
int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey& a, const InternalKey* b) {
if (b == nullptr) {
return -1;
}
return sstableKeyCompare(user_cmp, a, *b);
}
} // namespace
// Store in "*inputs" all files in "level" that overlap [begin,end]
// Employ binary search to find at least one file that overlaps the
// specified range. From that file, iterate backwards and
@ -2135,7 +2185,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
// within range [begin, end]. "clean" means there is a boudnary
// between the files in "*inputs" and the surrounding files
void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
int level, const Slice& user_begin, const Slice& user_end,
int level, const InternalKey* begin, const InternalKey* end,
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
bool within_interval) const {
assert(level > 0);
@ -2143,7 +2193,7 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
int mid = 0;
int max = static_cast<int>(files_[level].size()) - 1;
bool foundOverlap = false;
const Comparator* user_cmp = user_comparator_;
auto user_cmp = user_comparator_;
// if the caller already knows the index of a file that has overlap,
// then we can skip the binary search.
@ -2155,15 +2205,15 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
while (!foundOverlap && min <= max) {
mid = (min + max)/2;
FdWithKeyRange* f = &(level_files_brief_[level].files[mid]);
const Slice file_start = ExtractUserKey(f->smallest_key);
const Slice file_limit = ExtractUserKey(f->largest_key);
if ((!within_interval && user_cmp->Compare(file_limit, user_begin) < 0) ||
(within_interval && user_cmp->Compare(file_start, user_begin) < 0)) {
auto& smallest = f->file_metadata->smallest;
auto& largest = f->file_metadata->largest;
if ((!within_interval && sstableKeyCompare(user_cmp, begin, largest) > 0) ||
(within_interval && sstableKeyCompare(user_cmp, begin, smallest) > 0)) {
min = mid + 1;
} else if ((!within_interval &&
user_cmp->Compare(user_end, file_start) < 0) ||
sstableKeyCompare(user_cmp, smallest, end) > 0) ||
(within_interval &&
user_cmp->Compare(user_end, file_limit) < 0)) {
sstableKeyCompare(user_cmp, largest, end) > 0)) {
max = mid - 1;
} else {
foundOverlap = true;
@ -2182,10 +2232,10 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
int start_index, end_index;
if (within_interval) {
ExtendFileRangeWithinInterval(level, user_begin, user_end, mid, &start_index,
&end_index);
ExtendFileRangeWithinInterval(level, begin, end, mid,
&start_index, &end_index);
} else {
ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid,
ExtendFileRangeOverlappingInterval(level, begin, end, mid,
&start_index, &end_index);
assert(end_index >= start_index);
}
@ -2202,21 +2252,28 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
// and forward to find all overlapping files.
// Use FileLevel in searching, make it faster
void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
int level, const Slice& user_begin, const Slice& user_end,
int level, const InternalKey* begin, const InternalKey* end,
unsigned int mid_index, int* start_index, int* end_index) const {
const Comparator* user_cmp = user_comparator_;
auto user_cmp = user_comparator_;
const FdWithKeyRange* files = level_files_brief_[level].files;
#ifndef NDEBUG
{
// assert that the file at mid_index overlaps with the range
assert(mid_index < level_files_brief_[level].num_files);
const FdWithKeyRange* f = &files[mid_index];
const Slice fstart = ExtractUserKey(f->smallest_key);
const Slice flimit = ExtractUserKey(f->largest_key);
if (user_cmp->Compare(fstart, user_begin) >= 0) {
assert(user_cmp->Compare(fstart, user_end) <= 0);
auto& smallest = f->file_metadata->smallest;
auto& largest = f->file_metadata->largest;
if (sstableKeyCompare(user_cmp, begin, smallest) <= 0) {
assert(sstableKeyCompare(user_cmp, smallest, end) <= 0);
} else {
assert(user_cmp->Compare(flimit, user_begin) >= 0);
// fprintf(stderr, "ExtendFileRangeOverlappingInterval\n%s - %s\n%s - %s\n%d %d\n",
// begin ? begin->DebugString().c_str() : "(null)",
// end ? end->DebugString().c_str() : "(null)",
// smallest->DebugString().c_str(),
// largest->DebugString().c_str(),
// sstableKeyCompare(user_cmp, smallest, begin),
// sstableKeyCompare(user_cmp, largest, begin));
assert(sstableKeyCompare(user_cmp, begin, largest) <= 0);
}
}
#endif
@ -2228,8 +2285,8 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
// check backwards from 'mid' to lower indices
for (int i = mid_index; i >= 0 ; i--) {
const FdWithKeyRange* f = &files[i];
const Slice file_limit = ExtractUserKey(f->largest_key);
if (user_cmp->Compare(file_limit, user_begin) >= 0) {
auto& largest = f->file_metadata->largest;
if (sstableKeyCompare(user_cmp, begin, largest) <= 0) {
*start_index = i;
assert((count++, true));
} else {
@ -2240,8 +2297,8 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
for (unsigned int i = mid_index+1;
i < level_files_brief_[level].num_files; i++) {
const FdWithKeyRange* f = &files[i];
const Slice file_start = ExtractUserKey(f->smallest_key);
if (user_cmp->Compare(file_start, user_end) <= 0) {
auto& smallest = f->file_metadata->smallest;
if (sstableKeyCompare(user_cmp, smallest, end) <= 0) {
assert((count++, true));
*end_index = i;
} else {
@ -2259,39 +2316,36 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval(
// the clean range required.
// Use FileLevel in searching, make it faster
void VersionStorageInfo::ExtendFileRangeWithinInterval(
int level, const Slice& user_begin, const Slice& user_end,
int level, const InternalKey* begin, const InternalKey* end,
unsigned int mid_index, int* start_index, int* end_index) const {
assert(level != 0);
const Comparator* user_cmp = user_comparator_;
auto* user_cmp = user_comparator_;
const FdWithKeyRange* files = level_files_brief_[level].files;
#ifndef NDEBUG
{
// assert that the file at mid_index is within the range
assert(mid_index < level_files_brief_[level].num_files);
const FdWithKeyRange* f = &files[mid_index];
const Slice fstart = ExtractUserKey(f->smallest_key);
const Slice flimit = ExtractUserKey(f->largest_key);
assert(user_cmp->Compare(fstart, user_begin) >= 0 &&
user_cmp->Compare(flimit, user_end) <= 0);
auto& smallest = f->file_metadata->smallest;
auto& largest = f->file_metadata->largest;
assert(sstableKeyCompare(user_cmp, begin, smallest) <= 0 &&
sstableKeyCompare(user_cmp, largest, end) <= 0);
}
#endif
ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid_index,
ExtendFileRangeOverlappingInterval(level, begin, end, mid_index,
start_index, end_index);
int left = *start_index;
int right = *end_index;
// shrink from left to right
while (left <= right) {
const Slice& first_key_in_range = ExtractUserKey(files[left].smallest_key);
if (user_cmp->Compare(first_key_in_range, user_begin) < 0) {
auto& smallest = files[left].file_metadata->smallest;
if (sstableKeyCompare(user_cmp, begin, smallest) > 0) {
left++;
continue;
}
if (left > 0) { // If not first file
const Slice& last_key_before =
ExtractUserKey(files[left - 1].largest_key);
if (user_cmp->Equal(first_key_in_range, last_key_before)) {
// The first user key in range overlaps with the previous file's last
// key
auto& largest = files[left - 1].file_metadata->largest;
if (sstableKeyCompare(user_cmp, smallest, largest) == 0) {
left++;
continue;
}
@ -2300,16 +2354,15 @@ void VersionStorageInfo::ExtendFileRangeWithinInterval(
}
// shrink from right to left
while (left <= right) {
const Slice last_key_in_range = ExtractUserKey(files[right].largest_key);
if (user_cmp->Compare(last_key_in_range, user_end) > 0) {
auto& largest = files[right].file_metadata->largest;
if (sstableKeyCompare(user_cmp, largest, end) > 0) {
right--;
continue;
}
if (right < static_cast<int>(level_files_brief_[level].num_files) -
1) { // If not the last file
const Slice first_key_after =
ExtractUserKey(files[right + 1].smallest_key);
if (user_cmp->Equal(last_key_in_range, first_key_after)) {
auto& smallest = files[right + 1].file_metadata->smallest;
if (sstableKeyCompare(user_cmp, smallest, largest) == 0) {
// The last user key in range overlaps with the next file's first key
right--;
continue;
@ -4030,8 +4083,8 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
// approximate offset of "key" within the table.
TableReader* table_reader_ptr;
InternalIterator* iter = v->cfd_->table_cache()->NewIterator(
ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), f.fd,
nullptr /* range_del_agg */,
ReadOptions(), v->env_options_, v->cfd_->internal_comparator(),
*f.file_metadata, nullptr /* range_del_agg */,
v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr);
if (table_reader_ptr != nullptr) {
result = table_reader_ptr->ApproximateOffsetOf(key);
@ -4111,7 +4164,7 @@ InternalIterator* VersionSet::MakeInputIterator(
for (size_t i = 0; i < flevel->num_files; i++) {
list[num++] = cfd->table_cache()->NewIterator(
read_options, env_options_compactions, cfd->internal_comparator(),
flevel->files[i].fd, range_del_agg,
*flevel->files[i].file_metadata, range_del_agg,
c->mutable_cf_options()->prefix_extractor.get(),
nullptr /* table_reader_ptr */,
nullptr /* no per level latency histogram */,

View File

@ -201,8 +201,8 @@ class VersionStorageInfo {
void GetOverlappingInputsRangeBinarySearch(
int level, // level > 0
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
std::vector<FileMetaData*>* inputs,
int hint_index, // index of overlap file
int* file_index, // return index of overlap file
@ -211,20 +211,20 @@ class VersionStorageInfo {
void ExtendFileRangeOverlappingInterval(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
unsigned int index, // start extending from this index
int* startIndex, // return the startIndex of input range
int* endIndex) // return the endIndex of input range
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
unsigned int index, // start extending from this index
int* startIndex, // return the startIndex of input range
int* endIndex) // return the endIndex of input range
const;
void ExtendFileRangeWithinInterval(
int level,
const Slice& begin, // nullptr means before all keys
const Slice& end, // nullptr means after all keys
unsigned int index, // start extending from this index
int* startIndex, // return the startIndex of input range
int* endIndex) // return the endIndex of input range
const InternalKey* begin, // nullptr means before all keys
const InternalKey* end, // nullptr means after all keys
unsigned int index, // start extending from this index
int* startIndex, // return the startIndex of input range
int* endIndex) // return the endIndex of input range
const;
// Returns true iff some file in the specified level overlaps

View File

@ -11,6 +11,7 @@
#include "db/log_writer.h"
#include "table/mock_table.h"
#include "util/logging.h"
#include "util/string_util.h"
#include "util/testharness.h"
#include "util/testutil.h"
@ -139,6 +140,35 @@ class VersionStorageInfoTest : public testing::Test {
f->num_deletions = 0;
vstorage_.AddFile(level, f);
}
void Add(int level, uint32_t file_number, const InternalKey& smallest,
const InternalKey& largest, uint64_t file_size = 0) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, 0, file_size);
f->smallest = smallest;
f->largest = largest;
f->compensated_file_size = file_size;
f->refs = 0;
f->num_entries = 0;
f->num_deletions = 0;
vstorage_.AddFile(level, f);
}
std::string GetOverlappingFiles(int level, const InternalKey& begin,
const InternalKey& end) {
std::vector<FileMetaData*> inputs;
vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
std::string result;
for (size_t i = 0; i < inputs.size(); ++i) {
if (i > 0) {
result += ",";
}
AppendNumberTo(&result, inputs[i]->fd.GetNumber());
}
return result;
}
};
TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
@ -261,6 +291,40 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
}
TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
// Two files that overlap at the range deletion tombstone sentinel.
Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
// Two files that overlap at the same user key.
Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
// Two files that do not overlap.
Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
vstorage_.UpdateNumNonEmptyLevels();
vstorage_.GenerateLevelFilesBrief();
ASSERT_EQ("1,2", GetOverlappingFiles(
1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
ASSERT_EQ("1", GetOverlappingFiles(
1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
ASSERT_EQ("2", GetOverlappingFiles(
1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue}));
ASSERT_EQ("3,4", GetOverlappingFiles(
1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
ASSERT_EQ("3", GetOverlappingFiles(
1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
ASSERT_EQ("3,4", GetOverlappingFiles(
1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue}));
ASSERT_EQ("3,4", GetOverlappingFiles(
1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
ASSERT_EQ("5", GetOverlappingFiles(
1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
ASSERT_EQ("6", GetOverlappingFiles(
1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
}
class FindLevelFileTest : public testing::Test {
public:
LevelFilesBrief file_level_;