Fix kHashSearch bug with SeekForPrev (#6297)

Summary:
When prefix is enabled the expected behavior when the prefix of the target does not exist is for Seek is to seek to any key larger than target and SeekToPrev to any key less than the target.
Currently. the prefix index (kHashSearch) returns OK status but sets Invalid() to indicate two cases: a prefix of the searched key does not exist, ii) the key is beyond the range of the keys in SST file. The SeekForPrev implementation in BlockBasedTable thus does not have enough information to know when it should set the index key to first (to return a key smaller than target). The patch fixes that by returning NotFound status for cases that the prefix does not exist. SeekForPrev in BlockBasedTable accordingly SeekToFirst instead of SeekToLast on the index iterator.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6297

Test Plan: SeekForPrev of non-exsiting prefix is added to block_test.cc, and a test case is added in db_test2, which fails without the fix.

Differential Revision: D19404695

fbshipit-source-id: cafbbf95f8f60ff9ede9ccc99d25bfa1cf6fcdc3
This commit is contained in:
sdong 2020-01-15 14:03:18 -08:00 committed by Facebook Github Bot
parent 1dd7873e08
commit d2b4d42d4b
7 changed files with 155 additions and 10 deletions

View File

@ -18,6 +18,7 @@
* Fix a race condition for cfd->log_number_ between manifest switch and memtable switch (PR 6249) when number of column families is greater than 1. * Fix a race condition for cfd->log_number_ between manifest switch and memtable switch (PR 6249) when number of column families is greater than 1.
* Fix a bug on fractional cascading index when multiple files at the same level contain the same smallest user key, and those user keys are for merge operands. In this case, Get() the exact key may miss some merge operands. * Fix a bug on fractional cascading index when multiple files at the same level contain the same smallest user key, and those user keys are for merge operands. In this case, Get() the exact key may miss some merge operands.
* Delcare kHashSearch index type feature-incompatible with index_block_restart_interval larger than 1. * Delcare kHashSearch index type feature-incompatible with index_block_restart_interval larger than 1.
* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
### New Features ### New Features
* It is now possible to enable periodic compactions for the base DB when using BlobDB. * It is now possible to enable periodic compactions for the base DB when using BlobDB.

View File

@ -4300,6 +4300,64 @@ TEST_F(DBTest2, SameSmallestInSameLevel) {
ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
} }
TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
// create a DB with block prefix index
BlockBasedTableOptions table_options;
Options options = CurrentOptions();
table_options.block_size = 300;
table_options.index_type = BlockBasedTableOptions::kHashSearch;
table_options.index_shortening =
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
Reopen(options);
Random rnd(301);
std::string large_value = RandomString(&rnd, 500);
ASSERT_OK(Put("a1", large_value));
ASSERT_OK(Put("x1", large_value));
ASSERT_OK(Put("y1", large_value));
Flush();
{
std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
iterator->SeekForPrev("x3");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("x1", iterator->key().ToString());
iterator->SeekForPrev("a3");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("a1", iterator->key().ToString());
iterator->SeekForPrev("y3");
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ("y1", iterator->key().ToString());
// Query more than one non-existing prefix to cover the case both
// of empty hash bucket and hash bucket conflict.
iterator->SeekForPrev("b1");
// Result should be not valid or "a1".
if (iterator->Valid()) {
ASSERT_EQ("a1", iterator->key().ToString());
}
iterator->SeekForPrev("c1");
// Result should be not valid or "a1".
if (iterator->Valid()) {
ASSERT_EQ("a1", iterator->key().ToString());
}
iterator->SeekForPrev("d1");
// Result should be not valid or "a1".
if (iterator->Valid()) {
ASSERT_EQ("a1", iterator->key().ToString());
}
}
}
} // namespace rocksdb } // namespace rocksdb
#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

View File

@ -393,7 +393,15 @@ void IndexBlockIter::Seek(const Slice& target) {
uint32_t index = 0; uint32_t index = 0;
bool ok = false; bool ok = false;
if (prefix_index_) { if (prefix_index_) {
ok = PrefixSeek(target, &index); bool prefix_may_exist = true;
ok = PrefixSeek(target, &index, &prefix_may_exist);
if (!prefix_may_exist) {
// This is to let the caller to distinguish between non-existing prefix,
// and when key is larger than the last key, which both set Valid() to
// false.
current_ = restarts_;
status_ = Status::NotFound();
}
} else if (value_delta_encoded_) { } else if (value_delta_encoded_) {
ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index, ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
comparator_); comparator_);
@ -718,8 +726,12 @@ int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
// with a key >= target // with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target, bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
uint32_t* block_ids, uint32_t left, uint32_t* block_ids, uint32_t left,
uint32_t right, uint32_t* index) { uint32_t right, uint32_t* index,
bool* prefix_may_exist) {
assert(left <= right); assert(left <= right);
assert(index);
assert(prefix_may_exist);
*prefix_may_exist = true;
uint32_t left_bound = left; uint32_t left_bound = left;
while (left <= right) { while (left <= right) {
@ -753,6 +765,7 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
(left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
CompareBlockKey(block_ids[left] - 1, target) > 0) { CompareBlockKey(block_ids[left] - 1, target) > 0) {
current_ = restarts_; current_ = restarts_;
*prefix_may_exist = false;
return false; return false;
} }
@ -760,14 +773,45 @@ bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
return true; return true;
} else { } else {
assert(left > right); assert(left > right);
// If the next block key is larger than seek key, it is possible that
// no key shares the prefix with `target`, or all keys with the same
// prefix as `target` are smaller than prefix. In the latter case,
// we are mandated to set the position the same as the total order.
// In the latter case, either:
// (1) `target` falls into the range of the next block. In this case,
// we can place the iterator to the next block, or
// (2) `target` is larger than all block keys. In this case we can
// keep the iterator invalidate without setting `prefix_may_exist`
// to false.
// We might sometimes end up with setting the total order position
// while there is no key sharing the prefix as `target`, but it
// still follows the contract.
uint32_t right_index = block_ids[right];
assert(right_index + 1 <= num_restarts_);
if (right_index + 1 < num_restarts_) {
if (CompareBlockKey(right_index + 1, target) >= 0) {
*index = right_index + 1;
return true;
} else {
// We have to set the flag here because we are not positioning
// the iterator to the total order position.
*prefix_may_exist = false;
}
}
// Mark iterator invalid // Mark iterator invalid
current_ = restarts_; current_ = restarts_;
return false; return false;
} }
} }
bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) { bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
bool* prefix_may_exist) {
assert(index);
assert(prefix_may_exist);
assert(prefix_index_); assert(prefix_index_);
*prefix_may_exist = true;
Slice seek_key = target; Slice seek_key = target;
if (!key_includes_seq_) { if (!key_includes_seq_) {
seek_key = ExtractUserKey(target); seek_key = ExtractUserKey(target);
@ -777,9 +821,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
if (num_blocks == 0) { if (num_blocks == 0) {
current_ = restarts_; current_ = restarts_;
*prefix_may_exist = false;
return false; return false;
} else { } else {
return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index); assert(block_ids);
return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
prefix_may_exist);
} }
} }

View File

@ -539,6 +539,13 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
} }
} }
// IndexBlockIter follows a different contract for prefix iterator
// from data iterators.
// If prefix of the seek key `target` exists in the file, it must
// return the same result as total order seek.
// If the prefix of `target` doesn't exist in the file, it can either
// return the result of total order seek, or set both of Valid() = false
// and status() = NotFound().
virtual void Seek(const Slice& target) override; virtual void Seek(const Slice& target) override;
virtual void SeekForPrev(const Slice&) override { virtual void SeekForPrev(const Slice&) override {
@ -595,9 +602,16 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
std::unique_ptr<GlobalSeqnoState> global_seqno_state_; std::unique_ptr<GlobalSeqnoState> global_seqno_state_;
bool PrefixSeek(const Slice& target, uint32_t* index); // Set *prefix_may_exist to false if no key possibly share the same prefix
// as `target`. If not set, the result position should be the same as total
// order Seek.
bool PrefixSeek(const Slice& target, uint32_t* index, bool* prefix_may_exist);
// Set *prefix_may_exist to false if no key can possibly share the same
// prefix as `target`. If not set, the result position should be the same
// as total order seek.
bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
uint32_t left, uint32_t right, uint32_t* index); uint32_t left, uint32_t right, uint32_t* index,
bool* prefix_may_exist);
inline int CompareBlockKey(uint32_t block_index, const Slice& target); inline int CompareBlockKey(uint32_t block_index, const Slice& target);
inline int Compare(const Slice& a, const Slice& b) const { inline int Compare(const Slice& a, const Slice& b) const {

View File

@ -2901,12 +2901,22 @@ void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
index_iter_->Seek(target); index_iter_->Seek(target);
if (!index_iter_->Valid()) { if (!index_iter_->Valid()) {
if (!index_iter_->status().ok()) { auto seek_status = index_iter_->status();
// Check for IO error
if (!seek_status.IsNotFound() && !seek_status.ok()) {
ResetDataIter(); ResetDataIter();
return; return;
} }
index_iter_->SeekToLast(); // With prefix index, Seek() returns NotFound if the prefix doesn't exist
if (seek_status.IsNotFound()) {
// Any key less than the target is fine for prefix seek
ResetDataIter();
return;
} else {
index_iter_->SeekToLast();
}
// Check for IO error
if (!index_iter_->Valid()) { if (!index_iter_->Valid()) {
ResetDataIter(); ResetDataIter();
return; return;

View File

@ -686,7 +686,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
return block_iter_.value(); return block_iter_.value();
} }
Status status() const override { Status status() const override {
if (!index_iter_->status().ok()) { // Prefix index set status to NotFound when the prefix does not exist
if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
return index_iter_->status(); return index_iter_->status();
} else if (block_iter_points_to_real_block_) { } else if (block_iter_points_to_real_block_) {
return block_iter_.status(); return block_iter_.status();

View File

@ -1857,7 +1857,8 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) {
auto key = prefixes[i] + "9"; auto key = prefixes[i] + "9";
index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
ASSERT_OK(index_iter->status()); ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound());
ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid());
if (i == prefixes.size() - 1) { if (i == prefixes.size() - 1) {
// last key // last key
ASSERT_TRUE(!index_iter->Valid()); ASSERT_TRUE(!index_iter->Valid());
@ -1884,6 +1885,19 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) {
ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0); ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
} }
} }
for (const auto& prefix : non_exist_prefixes) {
index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode());
// regular_iter->Seek(prefix);
ASSERT_OK(index_iter->status());
// Seek to non-existing prefixes should yield either invalid, or a
// key with prefix greater than the target.
if (index_iter->Valid()) {
Slice ukey = ExtractUserKey(index_iter->key());
Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0);
}
}
c.ResetTableReader(); c.ResetTableReader();
} }