An iterator may automatically invoke reseeks.
Summary: An iterator invokes reseek if the number of sequential skips over the same userkey exceeds a configured number. This makes iter->Next() faster (because of fewer key compares) if a large number of adjacent internal keys in a table (sst or memtable) have the same userkey. Test Plan: Unit test DBTest.IterReseek. Reviewers: emayanke, haobo, xjin Reviewed By: xjin CC: leveldb, xjin Differential Revision: https://reviews.facebook.net/D11865
This commit is contained in:
parent
de98c1d9aa
commit
197034e4c3
@ -65,6 +65,7 @@ class DBIter: public Iterator {
|
||||
current_entry_is_merged_(false),
|
||||
statistics_(options.statistics) {
|
||||
RecordTick(statistics_, NO_ITERATORS, 1);
|
||||
max_skip_ = options.max_sequential_skip_in_iterations;
|
||||
}
|
||||
virtual ~DBIter() {
|
||||
RecordTick(statistics_, NO_ITERATORS, -1);
|
||||
@ -129,6 +130,7 @@ class DBIter: public Iterator {
|
||||
bool valid_;
|
||||
bool current_entry_is_merged_;
|
||||
std::shared_ptr<Statistics> statistics_;
|
||||
uint64_t max_skip_;
|
||||
|
||||
// No copying allowed
|
||||
DBIter(const DBIter&);
|
||||
@ -188,12 +190,13 @@ void DBIter::FindNextUserEntry(bool skipping) {
|
||||
assert(iter_->Valid());
|
||||
assert(direction_ == kForward);
|
||||
current_entry_is_merged_ = false;
|
||||
uint64_t num_skipped = 0;
|
||||
do {
|
||||
ParsedInternalKey ikey;
|
||||
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
|
||||
if (skipping &&
|
||||
user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
|
||||
// skip this entry
|
||||
num_skipped++; // skip this entry
|
||||
} else {
|
||||
skipping = false;
|
||||
switch (ikey.type) {
|
||||
@ -202,6 +205,7 @@ void DBIter::FindNextUserEntry(bool skipping) {
|
||||
// they are hidden by this deletion.
|
||||
SaveKey(ikey.user_key, &saved_key_);
|
||||
skipping = true;
|
||||
num_skipped = 0;
|
||||
break;
|
||||
case kTypeValue:
|
||||
valid_ = true;
|
||||
@ -220,7 +224,20 @@ void DBIter::FindNextUserEntry(bool skipping) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the next user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the last occurrence of
|
||||
// our current key by looking for sequence number 0.
|
||||
if (skipping && num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key,
|
||||
ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Next();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
valid_ = false;
|
||||
}
|
||||
@ -342,6 +359,7 @@ void DBIter::Prev() {
|
||||
|
||||
void DBIter::FindPrevUserEntry() {
|
||||
assert(direction_ == kReverse);
|
||||
uint64_t num_skipped = 0;
|
||||
|
||||
ValueType value_type = kTypeDeletion;
|
||||
if (iter_->Valid()) {
|
||||
@ -367,7 +385,22 @@ void DBIter::FindPrevUserEntry() {
|
||||
saved_value_.assign(raw_value.data(), raw_value.size());
|
||||
}
|
||||
}
|
||||
num_skipped++;
|
||||
// If we have sequentially iterated via numerous keys and still not
|
||||
// found the prev user-key, then it is better to seek so that we can
|
||||
// avoid too many key comparisons. We seek to the first occurrence of
|
||||
// our current key by looking for max sequence number.
|
||||
if (num_skipped > max_skip_) {
|
||||
num_skipped = 0;
|
||||
std::string last_key;
|
||||
AppendInternalKey(&last_key,
|
||||
ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
|
||||
kValueTypeForSeek));
|
||||
iter_->Seek(last_key);
|
||||
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
} else {
|
||||
iter_->Prev();
|
||||
}
|
||||
} while (iter_->Valid());
|
||||
}
|
||||
|
||||
|
@ -69,6 +69,7 @@ class AtomicCounter {
|
||||
count_ = 0;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// Special Env used to delay background operations
|
||||
@ -1133,6 +1134,95 @@ TEST(DBTest, IterMulti) {
|
||||
} while (ChangeCompactOptions());
|
||||
}
|
||||
|
||||
// Check that we can skip over a run of user keys
|
||||
// by using reseek rather than sequential scan
|
||||
TEST(DBTest, IterReseek) {
|
||||
Options options = CurrentOptions();
|
||||
options.max_sequential_skip_in_iterations = 3;
|
||||
options.create_if_missing = true;
|
||||
options.statistics = leveldb::CreateDBStatistics();
|
||||
DestroyAndReopen(&options);
|
||||
|
||||
// insert two keys with same userkey and verify that
|
||||
// reseek is not invoked. For each of these test cases,
|
||||
// verify that we can find the next key "b".
|
||||
ASSERT_OK(Put("a", "one"));
|
||||
ASSERT_OK(Put("a", "two"));
|
||||
ASSERT_OK(Put("b", "bone"));
|
||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||
iter->SeekToFirst();
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
||||
ASSERT_EQ(IterStatus(iter), "a->two");
|
||||
iter->Next();
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
||||
ASSERT_EQ(IterStatus(iter), "b->bone");
|
||||
delete iter;
|
||||
|
||||
// insert a total of three keys with same userkey and verify
|
||||
// that reseek is still not invoked.
|
||||
ASSERT_OK(Put("a", "three"));
|
||||
iter = db_->NewIterator(ReadOptions());
|
||||
iter->SeekToFirst();
|
||||
ASSERT_EQ(IterStatus(iter), "a->three");
|
||||
iter->Next();
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
||||
ASSERT_EQ(IterStatus(iter), "b->bone");
|
||||
delete iter;
|
||||
|
||||
// insert a total of four keys with same userkey and verify
|
||||
// that reseek is invoked.
|
||||
ASSERT_OK(Put("a", "four"));
|
||||
iter = db_->NewIterator(ReadOptions());
|
||||
iter->SeekToFirst();
|
||||
ASSERT_EQ(IterStatus(iter), "a->four");
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
|
||||
iter->Next();
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), 1);
|
||||
ASSERT_EQ(IterStatus(iter), "b->bone");
|
||||
delete iter;
|
||||
|
||||
// Testing reverse iterator
|
||||
// At this point, we have four versions of "a" and one version of "b".
|
||||
// The reseek statistics is already at 1.
|
||||
int num_reseeks = (int)options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION);
|
||||
|
||||
// Insert another version of b and assert that reseek is not invoked
|
||||
ASSERT_OK(Put("b", "btwo"));
|
||||
iter = db_->NewIterator(ReadOptions());
|
||||
iter->SeekToLast();
|
||||
ASSERT_EQ(IterStatus(iter), "b->btwo");
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks);
|
||||
iter->Prev();
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1);
|
||||
ASSERT_EQ(IterStatus(iter), "a->four");
|
||||
delete iter;
|
||||
|
||||
// insert two more versions of b. This makes a total of 4 versions
|
||||
// of b and 4 versions of a.
|
||||
ASSERT_OK(Put("b", "bthree"));
|
||||
ASSERT_OK(Put("b", "bfour"));
|
||||
iter = db_->NewIterator(ReadOptions());
|
||||
iter->SeekToLast();
|
||||
ASSERT_EQ(IterStatus(iter), "b->bfour");
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2);
|
||||
iter->Prev();
|
||||
|
||||
// the previous Prev call should have invoked reseek
|
||||
ASSERT_EQ(options.statistics.get()->getTickerCount(
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3);
|
||||
ASSERT_EQ(IterStatus(iter), "a->four");
|
||||
delete iter;
|
||||
}
|
||||
|
||||
TEST(DBTest, IterSmallAndLargeMix) {
|
||||
do {
|
||||
ASSERT_OK(Put("a", "va"));
|
||||
|
@ -532,6 +532,13 @@ struct Options {
|
||||
// Default: false
|
||||
bool filter_deletes;
|
||||
|
||||
// An iterator->Next() sequentially skips over keys with the same
|
||||
// user-key unless this option is set. This number specifies the number
|
||||
// of keys (with the same userkey) that will be sequentially
|
||||
// skipped before a reseek is issued.
|
||||
// Default: 8
|
||||
uint64_t max_sequential_skip_in_iterations;
|
||||
|
||||
// This is a factory that provides MemTableRep objects.
|
||||
// Default: a factory that provides a skip-list-based implementation of
|
||||
// MemTableRep.
|
||||
|
@ -58,6 +58,8 @@ enum Tickers {
|
||||
NUMBER_MULTIGET_KEYS_READ = 19,
|
||||
NUMBER_MULTIGET_BYTES_READ = 20,
|
||||
|
||||
// Number of delete records that were not required to be
|
||||
// written to storage because key does not exist
|
||||
NUMBER_FILTERED_DELETES = 21,
|
||||
NUMBER_MERGE_FAILURES = 22,
|
||||
SEQUENCE_NUMBER = 23,
|
||||
@ -68,9 +70,15 @@ enum Tickers {
|
||||
BLOOM_FILTER_PREFIX_CHECKED = 24,
|
||||
BLOOM_FILTER_PREFIX_USEFUL = 25,
|
||||
|
||||
TICKER_ENUM_MAX = 26
|
||||
// Number of times we had to reseek inside an iteration to skip
|
||||
// over large number of keys with same userkey.
|
||||
NUMBER_OF_RESEEKS_IN_ITERATION = 26,
|
||||
|
||||
TICKER_ENUM_MAX = 27
|
||||
};
|
||||
|
||||
// The order of items listed in Tickers should be the same as
|
||||
// the order listed in TickersNameMap
|
||||
const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
||||
{ BLOCK_CACHE_MISS, "rocksdb.block.cache.miss" },
|
||||
{ BLOCK_CACHE_HIT, "rocksdb.block.cache.hit" },
|
||||
@ -97,7 +105,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
||||
{ NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures" },
|
||||
{ SEQUENCE_NUMBER, "rocksdb.sequence.number" },
|
||||
{ BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked" },
|
||||
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" }
|
||||
{ BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful" },
|
||||
{ NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration" }
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -81,11 +81,11 @@ Options::Options()
|
||||
bytes_per_sync(0),
|
||||
compaction_style(kCompactionStyleLevel),
|
||||
filter_deletes(false),
|
||||
max_sequential_skip_in_iterations(8),
|
||||
memtable_factory(std::shared_ptr<SkipListFactory>(new SkipListFactory)),
|
||||
compaction_filter_factory(
|
||||
std::shared_ptr<CompactionFilterFactory>(
|
||||
new DefaultCompactionFilterFactory())) {
|
||||
|
||||
assert(memtable_factory.get() != nullptr);
|
||||
}
|
||||
|
||||
@ -174,6 +174,8 @@ Options::Dump(Logger* log) const
|
||||
Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d",
|
||||
i, max_bytes_for_level_multiplier_additional[i]);
|
||||
}
|
||||
Log(log," Options.max_sequential_skip_in_iterations: %ld",
|
||||
max_sequential_skip_in_iterations);
|
||||
Log(log," Options.expanded_compaction_factor: %d",
|
||||
expanded_compaction_factor);
|
||||
Log(log," Options.source_compaction_factor: %d",
|
||||
|
Loading…
Reference in New Issue
Block a user