From f517d9dd09a2a9200c6578742f51b3cf505c1e57 Mon Sep 17 00:00:00 2001 From: Aaron Gao Date: Tue, 27 Sep 2016 18:20:57 -0700 Subject: [PATCH] Add SeekForPrev() to Iterator Summary: Add new Iterator API, `SeekForPrev`: find the last key that <= target key support prefix_extractor support prefix_same_as_start support upper_bound not supported in iterators without Prev() Also add tests in db_iter_test and db_iterator_test Pass all tests Cheers! Test Plan: make all check -j64 Reviewers: andrewkr, yiwu, IslamAbdelRahman, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D64149 --- db/comparator_db_test.cc | 4 + db/db_iter.cc | 58 +++++- db/db_iter.h | 1 + db/db_iter_test.cc | 37 +++- db/db_iterator_test.cc | 193 +++++++++++++++++- db/db_test.cc | 4 + db/dbformat.cc | 9 + db/dbformat.h | 10 +- db/forward_iterator.cc | 4 + db/forward_iterator.h | 4 + db/inlineskiplist.h | 19 ++ db/inlineskiplist_test.cc | 27 ++- db/managed_iterator.cc | 10 + db/managed_iterator.h | 1 + db/memtable.cc | 24 +++ db/skiplist.h | 20 +- db/skiplist_test.cc | 27 ++- db/version_set.cc | 4 + include/rocksdb/iterator.h | 5 + include/rocksdb/memtablerep.h | 4 + include/rocksdb/perf_context.h | 3 +- .../utilities/write_batch_with_index.h | 2 + memtable/hash_cuckoo_rep.cc | 13 +- memtable/hash_linklist_rep.cc | 29 +++ memtable/hash_skiplist_rep.cc | 9 + memtable/skiplistrep.cc | 19 ++ memtable/vectorrep.cc | 10 + table/block.cc | 114 +++++++---- table/block.h | 2 + table/cuckoo_table_reader.cc | 6 + table/internal_iterator.h | 17 ++ table/iterator.cc | 2 + table/iterator_wrapper.h | 5 + table/merger.cc | 25 ++- table/mock_table.h | 6 + table/plain_table_reader.cc | 8 + table/table_test.cc | 9 + table/two_level_iterator.cc | 24 ++- util/testutil.h | 10 + utilities/ttl/db_ttl_impl.h | 2 + .../write_batch_with_index.cc | 12 ++ .../write_batch_with_index_test.cc | 4 + 42 files changed, 707 insertions(+), 89 deletions(-) diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index e4e84107e..39ba14fc9 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -40,6 +40,10 @@ class KVIter : public Iterator { virtual void Seek(const Slice& k) override { iter_ = map_->lower_bound(k.ToString()); } + virtual void SeekForPrev(const Slice& k) override { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } virtual void Next() override { ++iter_; } virtual void Prev() override { if (iter_ == map_->begin()) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 2a12534f2..333fd6fd0 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -199,6 +199,7 @@ class DBIter: public Iterator { virtual void Next() override; virtual void Prev() override; virtual void Seek(const Slice& target) override; + virtual void SeekForPrev(const Slice& target) override; virtual void SeekToFirst() override; virtual void SeekToLast() override; @@ -503,11 +504,6 @@ void DBIter::Prev() { local_stats_.bytes_read_ += (key().size() + value().size()); } } - if (valid_ && prefix_extractor_ && prefix_same_as_start_ && - prefix_extractor_->Transform(saved_key_.GetKey()) - .compare(prefix_start_key_) != 0) { - valid_ = false; - } } void DBIter::ReverseToBackward() { @@ -544,6 +540,7 @@ void DBIter::PrevInternal() { } ParsedInternalKey ikey; + bool match_prefix = true; while (iter_->Valid()) { saved_key_.SetKey(ExtractUserKey(iter_->key()), @@ -557,6 +554,12 @@ void DBIter::PrevInternal() { if (user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) { FindPrevUserKey(); } + if (valid_ && prefix_extractor_ && prefix_same_as_start_ && + prefix_extractor_->Transform(saved_key_.GetKey()) + .compare(prefix_start_key_) != 0) { + match_prefix = false; + break; + } return; } if (!iter_->Valid()) { @@ -568,7 +571,8 @@ void DBIter::PrevInternal() { } } // We haven't found any key - iterator is not valid - assert(!iter_->Valid()); + // Or the prefix is different than start prefix + assert(!iter_->Valid() || !match_prefix); valid_ = false; } @@ -819,6 +823,45 @@ void DBIter::Seek(const Slice& target) { } } +void DBIter::SeekForPrev(const Slice& target) { + StopWatch sw(env_, statistics_, DB_SEEK); + ReleaseTempPinnedData(); + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekForPrev(saved_key_.GetKey()); + } + + RecordTick(statistics_, NUMBER_DB_SEEK); + if (iter_->Valid()) { + if (prefix_extractor_ && prefix_same_as_start_) { + prefix_start_key_ = prefix_extractor_->Transform(target); + } + direction_ = kReverse; + ClearSavedValue(); + PrevInternal(); + if (!valid_) { + prefix_start_key_.clear(); + } + if (statistics_ != nullptr) { + if (valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + } + } + } else { + valid_ = false; + } + if (valid_ && prefix_extractor_ && prefix_same_as_start_) { + prefix_start_buf_.SetKey(prefix_start_key_); + prefix_start_key_ = prefix_start_buf_.GetKey(); + } +} + void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. @@ -931,6 +974,9 @@ inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); } inline void ArenaWrappedDBIter::Seek(const Slice& target) { db_iter_->Seek(target); } +inline void ArenaWrappedDBIter::SeekForPrev(const Slice& target) { + db_iter_->SeekForPrev(target); +} inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); } inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } diff --git a/db/db_iter.h b/db/db_iter.h index f54e12267..1431bfac9 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -57,6 +57,7 @@ class ArenaWrappedDBIter : public Iterator { virtual void SeekToFirst() override; virtual void SeekToLast() override; virtual void Seek(const Slice& target) override; + virtual void SeekForPrev(const Slice& target) override; virtual void Next() override; virtual void Prev() override; virtual Slice key() const override; diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 30956e35c..5305d3617 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -117,6 +117,11 @@ class TestIterator : public InternalIterator { } } + virtual void SeekForPrev(const Slice& target) override { + assert(initialized_); + SeekForPrevImpl(target, &cmp); + } + virtual void Next() override { assert(initialized_); if (data_.empty() || (iter_ == data_.size() - 1)) { @@ -1726,6 +1731,15 @@ TEST_F(DBIteratorTest, DBIterator9) { ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->SeekForPrev("b"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6"); + db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "d"); @@ -1734,6 +1748,15 @@ TEST_F(DBIteratorTest, DBIterator9) { ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4"); + + db_iter->SeekForPrev("c"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6"); } } @@ -1764,6 +1787,18 @@ TEST_F(DBIteratorTest, DBIterator10) { ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); ASSERT_EQ(db_iter->value().ToString(), "3"); + + db_iter->SeekForPrev("c"); + ASSERT_TRUE(db_iter->Valid()); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "d"); + ASSERT_EQ(db_iter->value().ToString(), "4"); + + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "3"); } TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { @@ -1914,7 +1949,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) { TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) { // Test Prev() when one child iterator is at its end. - db_iter_->Seek("g"); + db_iter_->SeekForPrev("g"); ASSERT_TRUE(db_iter_->Valid()); ASSERT_EQ(db_iter_->key().ToString(), "g"); ASSERT_EQ(db_iter_->value().ToString(), "3"); diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index a971835c0..e6bb25405 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -204,6 +204,22 @@ TEST_F(DBIteratorTest, IterSeekBeforePrev) { delete iter; } +TEST_F(DBIteratorTest, IterSeekForPrevBeforeNext) { + ASSERT_OK(Put("a", "b")); + ASSERT_OK(Put("c", "d")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("0", "f")); + ASSERT_OK(Put("1", "h")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("2", "j")); + auto iter = db_->NewIterator(ReadOptions()); + iter->SeekForPrev(Slice("0")); + iter->Next(); + iter->SeekForPrev(Slice("1")); + iter->Next(); + delete iter; +} + namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); @@ -231,6 +247,13 @@ TEST_F(DBIteratorTest, IterLongKeys) { ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); iter->Next(); ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4"); + + iter->SeekForPrev(MakeLongKey(127, 3)); + ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); delete iter; iter = db_->NewIterator(ReadOptions()); @@ -261,6 +284,11 @@ TEST_F(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_EQ(IterStatus(iter), "a->b"); iter->Next(); ASSERT_EQ(IterStatus(iter), "c->d"); + iter->SeekForPrev(Slice("b")); + ASSERT_EQ(IterStatus(iter), "a->b"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->d"); + delete iter; } @@ -284,7 +312,13 @@ TEST_F(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_EQ(IterStatus(iter), "c->d"); iter->Prev(); ASSERT_EQ(IterStatus(iter), "a->b"); - + iter->Prev(); + iter->SeekForPrev(Slice("d")); + ASSERT_EQ(IterStatus(iter), "d->e"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "c->d"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "a->b"); iter->Prev(); delete iter; } @@ -294,10 +328,13 @@ TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - ASSERT_OK(Put("d", "e")); + ASSERT_OK(Put("e", "f")); auto iter = db_->NewIterator(ReadOptions()); + auto iter2 = db_->NewIterator(ReadOptions()); iter->Seek(Slice("c")); + iter2->SeekForPrev(Slice("d")); ASSERT_EQ(IterStatus(iter), "c->d"); + ASSERT_EQ(IterStatus(iter2), "c->d"); // Create a key that needs to be skipped for Seq too new for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; @@ -307,9 +344,12 @@ TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { iter->Prev(); ASSERT_EQ(IterStatus(iter), "a->b"); - iter->Prev(); + iter2->Prev(); + ASSERT_EQ(IterStatus(iter2), "a->b"); + iter2->Prev(); delete iter; + delete iter2; } TEST_F(DBIteratorTest, IterEmpty) { @@ -326,6 +366,9 @@ TEST_F(DBIteratorTest, IterEmpty) { iter->Seek("foo"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("foo"); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + delete iter; } while (ChangeCompactOptions()); } @@ -358,14 +401,24 @@ TEST_F(DBIteratorTest, IterSingle) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Next(); ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); iter->Seek("a"); ASSERT_EQ(IterStatus(iter), "a->va"); iter->Next(); ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("a"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); iter->Seek("b"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "(invalid)"); delete iter; } while (ChangeCompactOptions()); @@ -411,11 +464,21 @@ TEST_F(DBIteratorTest, IterMulti) { ASSERT_EQ(IterStatus(iter), "a->va"); iter->Seek("ax"); ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev("d"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("b"); ASSERT_EQ(IterStatus(iter), "b->vb"); iter->Seek("z"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(IterStatus(iter), "b->vb"); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter), "(invalid)"); // Switch from reverse to forward iter->SeekToLast(); @@ -658,6 +721,7 @@ TEST_F(DBIteratorTest, IterWithSnapshot) { options.snapshot = snapshot; Iterator* iter = db_->NewIterator(options, handles_[1]); + ASSERT_OK(Put(1, "key0", "val0")); // Put more values after the snapshot ASSERT_OK(Put(1, "key100", "val100")); ASSERT_OK(Put(1, "key101", "val101")); @@ -682,6 +746,26 @@ TEST_F(DBIteratorTest, IterWithSnapshot) { iter->Next(); ASSERT_TRUE(!iter->Valid()); } + + if (!CurrentOptions().merge_operator) { + // TODO(gzh): merge operator does not support backward iteration yet + if (kPlainTableAllBytesPrefix != option_config_ && + kBlockBasedTableWithWholeKeyHashIndex != option_config_ && + kHashLinkList != option_config_) { + iter->SeekForPrev("key1"); + ASSERT_EQ(IterStatus(iter), "key1->val1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key2->val2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "key3->val3"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key2->val2"); + iter->Prev(); + ASSERT_EQ(IterStatus(iter), "key1->val1"); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + } + } db_->ReleaseSnapshot(snapshot); delete iter; // skip as HashCuckooRep does not support snapshot @@ -745,6 +829,19 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + + iter->SeekForPrev("g1"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); } // testing iterate_upper_bound and forward iterator @@ -877,7 +974,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. -TEST_F(DBIteratorTest, PrevAfterMerge) { +TEST_F(DBIteratorTest, PrevAfterAndNextAfterMerge) { Options options; options.create_if_missing = true; options.merge_operator = MergeOperators::CreatePutOperator(); @@ -898,6 +995,14 @@ TEST_F(DBIteratorTest, PrevAfterMerge) { it->Prev(); ASSERT_TRUE(it->Valid()); ASSERT_EQ("1", it->key().ToString()); + + it->SeekForPrev("1"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("1", it->key().ToString()); + + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("2", it->key().ToString()); } TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { @@ -980,7 +1085,6 @@ TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { { // Test Seek to random keys - printf("Testing seek on %" ROCKSDB_PRIszt " keys\n", random_keys.size()); std::vector keys_slices; std::vector true_keys; for (auto& k : random_keys) { @@ -1002,9 +1106,31 @@ TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { } } + { + // Test SeekForPrev to random keys + std::vector keys_slices; + std::vector true_keys; + for (auto& k : random_keys) { + iter->SeekForPrev(k); + if (!iter->Valid()) { + ASSERT_EQ(true_data.upper_bound(k), true_data.begin()); + continue; + } + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); + keys_slices.push_back(iter->key()); + true_keys.push_back((--true_data.upper_bound(k))->first); + } + + for (size_t i = 0; i < keys_slices.size(); i++) { + ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]); + } + } + { // Test iterating all data forward - printf("Testing iterating forward on all keys\n"); std::vector all_keys; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { std::string prop_value; @@ -1025,7 +1151,6 @@ TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { { // Test iterating all data backward - printf("Testing iterating backward on all keys\n"); std::vector all_keys; for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { std::string prop_value; @@ -1230,6 +1355,60 @@ TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { delete iter; } +TEST_F(DBIteratorTest, IterSeekForPrevCrossingFiles) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("a1", "va1")); + ASSERT_OK(Put("a2", "va2")); + ASSERT_OK(Put("a3", "va3")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b1", "vb1")); + ASSERT_OK(Put("b2", "vb2")); + ASSERT_OK(Put("b3", "vb3")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b4", "vb4")); + ASSERT_OK(Put("d1", "vd1")); + ASSERT_OK(Put("d2", "vd2")); + ASSERT_OK(Put("d4", "vd4")); + ASSERT_OK(Flush()); + + MoveFilesToLevel(1); + { + ReadOptions ro; + Iterator* iter = db_->NewIterator(ro); + + iter->SeekForPrev("a4"); + ASSERT_EQ(iter->key().ToString(), "a3"); + ASSERT_EQ(iter->value().ToString(), "va3"); + + iter->SeekForPrev("c2"); + ASSERT_EQ(iter->key().ToString(), "b3"); + iter->SeekForPrev("d3"); + ASSERT_EQ(iter->key().ToString(), "d2"); + iter->SeekForPrev("b5"); + ASSERT_EQ(iter->key().ToString(), "b4"); + delete iter; + } + + { + ReadOptions ro; + ro.prefix_same_as_start = true; + Iterator* iter = db_->NewIterator(ro); + iter->SeekForPrev("c2"); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } +} + TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocks) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; diff --git a/db/db_test.cc b/db/db_test.cc index 0f7e0adcf..d019c41ea 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2881,6 +2881,10 @@ class ModelDB : public DB { virtual void Seek(const Slice& k) override { iter_ = map_->lower_bound(k.ToString()); } + virtual void SeekForPrev(const Slice& k) override { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } virtual void Next() override { ++iter_; } virtual void Prev() override { if (iter_ == map_->begin()) { diff --git a/db/dbformat.cc b/db/dbformat.cc index cb45cd12e..5914e6dbf 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -20,6 +20,15 @@ namespace rocksdb { +// kValueTypeForSeek defines the ValueType that should be passed when +// constructing a ParsedInternalKey object for seeking to a particular +// sequence number (since we sort sequence numbers in decreasing order +// and the value type is embedded as the low 8 bits in the sequence +// number in internal keys, we need to use the highest-numbered +// ValueType, not the lowest). +const ValueType kValueTypeForSeek = kTypeSingleDeletion; +const ValueType kValueTypeForSeekForPrev = kTypeDeletion; + uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { assert(seq <= kMaxSequenceNumber); assert(IsExtendedValueType(t)); diff --git a/db/dbformat.h b/db/dbformat.h index f7d763105..29d17b0ec 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -50,13 +50,9 @@ enum ValueType : unsigned char { kMaxValue = 0x7F // Not used for storing records. }; -// kValueTypeForSeek defines the ValueType that should be passed when -// constructing a ParsedInternalKey object for seeking to a particular -// sequence number (since we sort sequence numbers in decreasing order -// and the value type is embedded as the low 8 bits in the sequence -// number in internal keys, we need to use the highest-numbered -// ValueType, not the lowest). -static const ValueType kValueTypeForSeek = kTypeSingleDeletion; +// Defined in dbformat.cc +extern const ValueType kValueTypeForSeek; +extern const ValueType kValueTypeForSeekForPrev; // Checks whether a type is an inline value type // (i.e. a type used in memtable skiplist and sst file datablock). diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 92b5ab25e..2bd95e9f6 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -97,6 +97,10 @@ class LevelIterator : public InternalIterator { file_iter_->Seek(internal_key); valid_ = file_iter_->Valid(); } + void SeekForPrev(const Slice& internal_key) override { + status_ = Status::NotSupported("LevelIterator::SeekForPrev()"); + valid_ = false; + } void Next() override { assert(valid_); file_iter_->Next(); diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 32ebf262c..4f00f6e76 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -55,6 +55,10 @@ class ForwardIterator : public InternalIterator { ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr); virtual ~ForwardIterator(); + void SeekForPrev(const Slice& target) override { + status_ = Status::NotSupported("ForwardIterator::SeekForPrev()"); + valid_ = false; + } void SeekToLast() override { status_ = Status::NotSupported("ForwardIterator::SeekToLast()"); valid_ = false; diff --git a/db/inlineskiplist.h b/db/inlineskiplist.h index cfd47f39f..5e4ffbc7a 100644 --- a/db/inlineskiplist.h +++ b/db/inlineskiplist.h @@ -116,6 +116,9 @@ class InlineSkipList { // Advance to the first entry with a key >= target void Seek(const char* target); + // Retreat to the last entry with a key <= target + void SeekForPrev(const char* target); + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst(); @@ -167,6 +170,10 @@ class InlineSkipList { return (compare_(a, b) == 0); } + bool LessThan(const char* a, const char* b) const { + return (compare_(a, b) < 0); + } + // Return true if key is greater than the data stored in "n". Null n // is considered infinite. bool KeyIsAfterNode(const char* key, Node* n) const; @@ -310,6 +317,18 @@ inline void InlineSkipList::Iterator::Seek(const char* target) { node_ = list_->FindGreaterOrEqual(target); } +template +inline void InlineSkipList::Iterator::SeekForPrev( + const char* target) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && list_->LessThan(target, key())) { + Prev(); + } +} + template inline void InlineSkipList::Iterator::SeekToFirst() { node_ = list_->head_->Next(0); diff --git a/db/inlineskiplist_test.cc b/db/inlineskiplist_test.cc index 5743bacec..a683b332b 100644 --- a/db/inlineskiplist_test.cc +++ b/db/inlineskiplist_test.cc @@ -58,6 +58,8 @@ TEST_F(InlineSkipTest, Empty) { key = 100; iter.Seek(Encode(&key)); ASSERT_TRUE(!iter.Valid()); + iter.SeekForPrev(Encode(&key)); + ASSERT_TRUE(!iter.Valid()); iter.SeekToLast(); ASSERT_TRUE(!iter.Valid()); } @@ -97,6 +99,11 @@ TEST_F(InlineSkipTest, InsertAndLookup) { ASSERT_TRUE(iter.Valid()); ASSERT_EQ(*(keys.begin()), Decode(iter.key())); + uint64_t max_key = R - 1; + iter.SeekForPrev(Encode(&max_key)); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), Decode(iter.key())); + iter.SeekToFirst(); ASSERT_TRUE(iter.Valid()); ASSERT_EQ(*(keys.begin()), Decode(iter.key())); @@ -127,18 +134,22 @@ TEST_F(InlineSkipTest, InsertAndLookup) { } // Backward iteration test - { + for (Key i = 0; i < R; i++) { InlineSkipList::Iterator iter(&list); - iter.SeekToLast(); + iter.SeekForPrev(Encode(&i)); // Compare against model iterator - for (std::set::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, Decode(iter.key())); - iter.Prev(); + std::set::iterator model_iter = keys.upper_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.begin()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*--model_iter, Decode(iter.key())); + iter.Prev(); + } } - ASSERT_TRUE(!iter.Valid()); } } diff --git a/db/managed_iterator.cc b/db/managed_iterator.cc index 1d47f933d..e8837fce3 100644 --- a/db/managed_iterator.cc +++ b/db/managed_iterator.cc @@ -122,6 +122,16 @@ void ManagedIterator::Seek(const Slice& user_key) { SeekInternal(user_key, false); } +void ManagedIterator::SeekForPrev(const Slice& user_key) { + MILock l(&in_use_, this); + if (NeedToRebuild()) { + RebuildIterator(); + } + assert(mutable_iter_ != nullptr); + mutable_iter_->SeekForPrev(user_key); + UpdateCurrent(); +} + void ManagedIterator::SeekInternal(const Slice& user_key, bool seek_to_first) { if (NeedToRebuild()) { RebuildIterator(); diff --git a/db/managed_iterator.h b/db/managed_iterator.h index d9a87596e..bce26383e 100644 --- a/db/managed_iterator.h +++ b/db/managed_iterator.h @@ -43,6 +43,7 @@ class ManagedIterator : public Iterator { virtual bool Valid() const override; void SeekToFirst() override; virtual void Seek(const Slice& target) override; + virtual void SeekForPrev(const Slice& target) override; virtual void Next() override; virtual Slice key() const override; virtual Slice value() const override; diff --git a/db/memtable.cc b/db/memtable.cc index d8e35a289..a25c05a3c 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -222,6 +222,7 @@ class MemTableIterator : public InternalIterator { Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), + comparator_(mem.comparator_), valid_(false), arena_mode_(arena != nullptr), value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) { @@ -272,6 +273,28 @@ class MemTableIterator : public InternalIterator { iter_->Seek(k, nullptr); valid_ = iter_->Valid(); } + virtual void SeekForPrev(const Slice& k) override { + PERF_TIMER_GUARD(seek_on_memtable_time); + PERF_COUNTER_ADD(seek_on_memtable_count, 1); + if (bloom_ != nullptr) { + if (!bloom_->MayContain( + prefix_extractor_->Transform(ExtractUserKey(k)))) { + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + valid_ = false; + return; + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + } + iter_->Seek(k, nullptr); + valid_ = iter_->Valid(); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && comparator_.comparator.Compare(k, key()) < 0) { + Prev(); + } + } virtual void SeekToFirst() override { iter_->SeekToFirst(); valid_ = iter_->Valid(); @@ -315,6 +338,7 @@ class MemTableIterator : public InternalIterator { private: DynamicBloom* bloom_; const SliceTransform* const prefix_extractor_; + const MemTable::KeyComparator comparator_; MemTableRep::Iterator* iter_; bool valid_; bool arena_mode_; diff --git a/db/skiplist.h b/db/skiplist.h index 3fdbd8f54..fe69094fb 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -92,6 +92,9 @@ class SkipList { // Advance to the first entry with a key >= target void Seek(const Key& target); + // Retreat to the last entry with a key <= target + void SeekForPrev(const Key& target); + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. void SeekToFirst(); @@ -135,6 +138,9 @@ class SkipList { Node* NewNode(const Key& key, int height); int RandomHeight(); bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } + bool LessThan(const Key& a, const Key& b) const { + return (compare_(a, b) < 0); + } // Return true if key is greater than the data stored in "n" bool KeyIsAfterNode(const Key& key, Node* n) const; @@ -247,7 +253,19 @@ inline void SkipList::Iterator::Seek(const Key& target) { node_ = list_->FindGreaterOrEqual(target); } -template +template +inline void SkipList::Iterator::SeekForPrev( + const Key& target) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && list_->LessThan(target, key())) { + Prev(); + } +} + +template inline void SkipList::Iterator::SeekToFirst() { node_ = list_->head_->Next(0); } diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index b4f98e34c..80f696d29 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -45,6 +45,8 @@ TEST_F(SkipTest, Empty) { ASSERT_TRUE(!iter.Valid()); iter.Seek(100); ASSERT_TRUE(!iter.Valid()); + iter.SeekForPrev(100); + ASSERT_TRUE(!iter.Valid()); iter.SeekToLast(); ASSERT_TRUE(!iter.Valid()); } @@ -81,6 +83,10 @@ TEST_F(SkipTest, InsertAndLookup) { ASSERT_TRUE(iter.Valid()); ASSERT_EQ(*(keys.begin()), iter.key()); + iter.SeekForPrev(R - 1); + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*(keys.rbegin()), iter.key()); + iter.SeekToFirst(); ASSERT_TRUE(iter.Valid()); ASSERT_EQ(*(keys.begin()), iter.key()); @@ -111,19 +117,22 @@ TEST_F(SkipTest, InsertAndLookup) { } // Backward iteration test - { + for (int i = 0; i < R; i++) { SkipList::Iterator iter(&list); - iter.SeekToLast(); + iter.SeekForPrev(i); // Compare against model iterator - for (std::set::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); - ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - iter.Prev(); + std::set::iterator model_iter = keys.upper_bound(i); + for (int j = 0; j < 3; j++) { + if (model_iter == keys.begin()) { + ASSERT_TRUE(!iter.Valid()); + break; + } else { + ASSERT_TRUE(iter.Valid()); + ASSERT_EQ(*--model_iter, iter.key()); + iter.Prev(); + } } - ASSERT_TRUE(!iter.Valid()); } } diff --git a/db/version_set.cc b/db/version_set.cc index 0f668ca82..bda5216f2 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -446,6 +446,10 @@ class LevelFileNumIterator : public InternalIterator { virtual void Seek(const Slice& target) override { index_ = FindFile(icmp_, *flevel_, target); } + virtual void SeekForPrev(const Slice& target) override { + SeekForPrevImpl(target, &icmp_); + } + virtual void SeekToFirst() override { index_ = 0; } virtual void SeekToLast() override { index_ = (flevel_->num_files == 0) diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index c39ce4641..9d935b627 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -69,6 +69,11 @@ class Iterator : public Cleanable { // an entry that comes at or past target. virtual void Seek(const Slice& target) = 0; + // Position at the last key in the source that at or before target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + virtual void SeekForPrev(const Slice& target) = 0; + // Moves to the next entry in the source. After this call, Valid() is // true iff the iterator was not positioned at the last entry in the source. // REQUIRES: Valid() diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index f6f030946..fbf07caaf 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -154,6 +154,10 @@ class MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0; + // retreat to the first entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) = 0; + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() = 0; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 1d73fecaa..283cda9dc 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -72,7 +72,8 @@ struct PerfContext { uint64_t seek_child_seek_time; // number of seek issued in child iterators uint64_t seek_child_seek_count; - uint64_t seek_min_heap_time; // total nanos spent on the merge heap + uint64_t seek_min_heap_time; // total nanos spent on the merge min heap + uint64_t seek_max_heap_time; // total nanos spent on the merge max heap // total nanos spent on seeking the internal entries uint64_t seek_internal_seek_time; // total nanos spent on iterating internal entries to find the next user entry diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index ed25cf4b8..08852bc3d 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -60,6 +60,8 @@ class WBWIIterator { virtual void Seek(const Slice& key) = 0; + virtual void SeekForPrev(const Slice& key) = 0; + virtual void Next() = 0; virtual void Prev() = 0; diff --git a/memtable/hash_cuckoo_rep.cc b/memtable/hash_cuckoo_rep.cc index 6ae3e098b..a8799f1ca 100644 --- a/memtable/hash_cuckoo_rep.cc +++ b/memtable/hash_cuckoo_rep.cc @@ -40,8 +40,7 @@ struct CuckooStep { CuckooStep() : bucket_id_(-1), prev_step_id_(kNullStep), depth_(1) {} - // MSVC does not support = default yet - CuckooStep(CuckooStep&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); } + CuckooStep(CuckooStep&& o) = default; CuckooStep& operator=(CuckooStep&& rhs) { bucket_id_ = std::move(rhs.bucket_id_); @@ -153,6 +152,10 @@ class HashCuckooRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& user_key, const char* memtable_key) override; + // Retreat to the last entry with a key <= target + virtual void SeekForPrev(const Slice& user_key, + const char* memtable_key) override; + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() override; @@ -594,6 +597,12 @@ void HashCuckooRep::Iterator::Seek(const Slice& user_key, }).first; } +// Retreat to the last entry with a key <= target +void HashCuckooRep::Iterator::SeekForPrev(const Slice& user_key, + const char* memtable_key) { + assert(false); +} + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. void HashCuckooRep::Iterator::SeekToFirst() { diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 902c30e8a..7caf66531 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -247,8 +247,18 @@ class HashLinkListRep : public MemTableRep { return (n != nullptr) && (compare_(n->key, key) < 0); } + bool KeyIsAfterOrAtNode(const Slice& internal_key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, internal_key) <= 0); + } + + bool KeyIsAfterOrAtNode(const Key& key, const Node* n) const { + // nullptr n is considered infinite + return (n != nullptr) && (compare_(n->key, key) <= 0); + } Node* FindGreaterOrEqualInBucket(Node* head, const Slice& key) const; + Node* FindLessOrEqualInBucket(Node* head, const Slice& key) const; class FullListIterator : public MemTableRep::Iterator { public: @@ -291,6 +301,15 @@ class HashLinkListRep : public MemTableRep { iter_.Seek(encoded_key); } + // Retreat to the last entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.SeekForPrev(encoded_key); + } + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() override { iter_.SeekToFirst(); } @@ -348,6 +367,14 @@ class HashLinkListRep : public MemTableRep { internal_key); } + // Retreat to the last entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + // Since we do not support Prev() + // We simply do not support SeekForPrev + Reset(nullptr); + } + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() override { @@ -458,6 +485,8 @@ class HashLinkListRep : public MemTableRep { virtual void Prev() override {} virtual void Seek(const Slice& user_key, const char* memtable_key) override {} + virtual void SeekForPrev(const Slice& user_key, + const char* memtable_key) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index 73a917607..12b47950d 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -130,6 +130,13 @@ class HashSkipListRep : public MemTableRep { } } + // Retreat to the last entry with a key <= target + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + // not supported + assert(false); + } + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() override { @@ -214,6 +221,8 @@ class HashSkipListRep : public MemTableRep { virtual void Prev() override {} virtual void Seek(const Slice& internal_key, const char* memtable_key) override {} + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index b8c90c6d6..daed53d99 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -118,6 +118,16 @@ public: } } + // Retreat to the last entry with a key <= target + virtual void SeekForPrev(const Slice& user_key, + const char* memtable_key) override { + if (memtable_key != nullptr) { + iter_.SeekForPrev(memtable_key); + } else { + iter_.SeekForPrev(EncodeKey(&tmp_, user_key)); + } + } + // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. virtual void SeekToFirst() override { @@ -208,6 +218,15 @@ public: prev_ = iter_; } + virtual void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); + iter_.SeekForPrev(encoded_key); + prev_ = iter_; + } + virtual void SeekToFirst() override { iter_.SeekToFirst(); prev_ = iter_; diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index b9d9ebe0a..697156c3c 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -83,6 +83,10 @@ class VectorRep : public MemTableRep { // Advance to the first entry with a key >= target virtual void Seek(const Slice& user_key, const char* memtable_key) override; + // Advance to the first entry with a key <= target + virtual void SeekForPrev(const Slice& user_key, + const char* memtable_key) override; + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. virtual void SeekToFirst() override; @@ -221,6 +225,12 @@ void VectorRep::Iterator::Seek(const Slice& user_key, }).first; } +// Advance to the first entry with a key <= target +void VectorRep::Iterator::SeekForPrev(const Slice& user_key, + const char* memtable_key) { + assert(false); +} + // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. void VectorRep::Iterator::SeekToFirst() { diff --git a/table/block.cc b/table/block.cc index 32eeeceaf..dacd53d50 100644 --- a/table/block.cc +++ b/table/block.cc @@ -10,7 +10,6 @@ // Decodes the blocks generated by block_builder.cc. #include "table/block.h" - #include #include #include @@ -162,6 +161,32 @@ void BlockIter::Seek(const Slice& target) { } } +void BlockIter::SeekForPrev(const Slice& target) { + PERF_TIMER_GUARD(block_seek_nanos); + if (data_ == nullptr) { // Not init yet + return; + } + uint32_t index = 0; + bool ok = false; + ok = BinarySeek(target, 0, num_restarts_ - 1, &index); + + if (!ok) { + return; + } + SeekToRestartPoint(index); + // Linear search (within restart block) for first key >= target + + while (ParseNextKey() && Compare(key_.GetKey(), target) < 0) { + } + if (!Valid()) { + SeekToLast(); + } else { + while (Valid() && Compare(key_.GetKey(), target) > 0) { + Prev(); + } + } +} + void BlockIter::SeekToFirst() { if (data_ == nullptr) { // Not init yet return; @@ -189,55 +214,56 @@ void BlockIter::CorruptionError() { } bool BlockIter::ParseNextKey() { - current_ = NextEntryOffset(); - const char* p = data_ + current_; - const char* limit = data_ + restarts_; // Restarts come right after data - if (p >= limit) { - // No more entries to return. Mark as invalid. - current_ = restarts_; - restart_index_ = num_restarts_; - return false; - } - - // Decode next entry - uint32_t shared, non_shared, value_length; - p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); - if (p == nullptr || key_.Size() < shared) { - CorruptionError(); - return false; - } else { - if (shared == 0) { - // If this key dont share any bytes with prev key then we dont need - // to decode it and can use it's address in the block directly. - key_.SetKey(Slice(p, non_shared), false /* copy */); - key_pinned_ = true; - } else { - // This key share `shared` bytes with prev key, we need to decode it - key_.TrimAppend(shared, p, non_shared); - key_pinned_ = false; - } - value_ = Slice(p + non_shared, value_length); - while (restart_index_ + 1 < num_restarts_ && - GetRestartPoint(restart_index_ + 1) < current_) { - ++restart_index_; - } - return true; - } + current_ = NextEntryOffset(); + const char* p = data_ + current_; + const char* limit = data_ + restarts_; // Restarts come right after data + if (p >= limit) { + // No more entries to return. Mark as invalid. + current_ = restarts_; + restart_index_ = num_restarts_; + return false; } -// Binary search in restart array to find the first restart point -// with a key >= target (TODO: this comment is inaccurate) + // Decode next entry + uint32_t shared, non_shared, value_length; + p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); + if (p == nullptr || key_.Size() < shared) { + CorruptionError(); + return false; + } else { + if (shared == 0) { + // If this key dont share any bytes with prev key then we dont need + // to decode it and can use it's address in the block directly. + key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; + } else { + // This key share `shared` bytes with prev key, we need to decode it + key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; + } + value_ = Slice(p + non_shared, value_length); + while (restart_index_ + 1 < num_restarts_ && + GetRestartPoint(restart_index_ + 1) < current_) { + ++restart_index_; + } + return true; + } +} + +// Binary search in restart array to find the first restart point that +// is either the last restart point with a key less than target, +// which means the key of next restart point is larger than target, or +// the first restart point with a key = target bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index) { + uint32_t* index) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); uint32_t shared, non_shared, value_length; - const char* key_ptr = - DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, - &non_shared, &value_length); + const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, + &shared, &non_shared, &value_length); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; @@ -279,13 +305,13 @@ int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { // Binary search in block_ids to find the first block // with a key >= target bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, - uint32_t left, uint32_t right, - uint32_t* index) { + uint32_t left, uint32_t right, + uint32_t* index) { assert(left <= right); uint32_t left_bound = left; while (left <= right) { - uint32_t mid = (left + right) / 2; + uint32_t mid = (right + left) / 2; int cmp = CompareBlockKey(block_ids[mid], target); if (!status_.ok()) { diff --git a/table/block.h b/table/block.h index c6c03a300..b70ee8daf 100644 --- a/table/block.h +++ b/table/block.h @@ -271,6 +271,8 @@ class BlockIter : public InternalIterator { virtual void Seek(const Slice& target) override; + virtual void SeekForPrev(const Slice& target) override; + virtual void SeekToFirst() override; virtual void SeekToLast() override; diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f6d69154e..ba699b251 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -187,6 +187,7 @@ class CuckooTableIterator : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; void Next() override; void Prev() override; Slice key() const override; @@ -298,6 +299,11 @@ void CuckooTableIterator::Seek(const Slice& target) { PrepareKVAtCurrIdx(); } +void CuckooTableIterator::SeekForPrev(const Slice& target) { + // Not supported + assert(false); +} + bool CuckooTableIterator::Valid() const { return curr_key_idx_ < sorted_bucket_ids_.size(); } diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 2850a6773..62248007c 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -7,6 +7,7 @@ #pragma once #include +#include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -36,6 +37,11 @@ class InternalIterator : public Cleanable { // an entry that comes at or past target. virtual void Seek(const Slice& target) = 0; + // Position at the first key in the source that at or before target + // The iterator is Valid() after this call iff the source contains + // an entry that comes at or before target. + virtual void SeekForPrev(const Slice& target) = 0; + // Moves to the next entry in the source. After this call, Valid() is // true iff the iterator was not positioned at the last entry in the source. // REQUIRES: Valid() @@ -89,6 +95,17 @@ class InternalIterator : public Cleanable { return Status::NotSupported(""); } + protected: + void SeekForPrevImpl(const Slice& target, const Comparator* cmp) { + Seek(target); + if (!Valid()) { + SeekToLast(); + } + while (Valid() && cmp->Compare(target, key()) < 0) { + Prev(); + } + } + private: // No copying allowed InternalIterator(const InternalIterator&) = delete; diff --git a/table/iterator.cc b/table/iterator.cc index 09f7f8e68..93b07bfef 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -63,6 +63,7 @@ class EmptyIterator : public Iterator { explicit EmptyIterator(const Status& s) : status_(s) { } virtual bool Valid() const override { return false; } virtual void Seek(const Slice& target) override {} + virtual void SeekForPrev(const Slice& target) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} virtual void Next() override { assert(false); } @@ -86,6 +87,7 @@ class EmptyInternalIterator : public InternalIterator { explicit EmptyInternalIterator(const Status& s) : status_(s) {} virtual bool Valid() const override { return false; } virtual void Seek(const Slice& target) override {} + virtual void SeekForPrev(const Slice& target) override {} virtual void SeekToFirst() override {} virtual void SeekToLast() override {} virtual void Next() override { assert(false); } diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index e68bbf3f0..667eac82b 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -61,6 +61,11 @@ class IteratorWrapper { void Next() { assert(iter_); iter_->Next(); Update(); } void Prev() { assert(iter_); iter_->Prev(); Update(); } void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void SeekForPrev(const Slice& k) { + assert(iter_); + iter_->SeekForPrev(k); + Update(); + } void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } diff --git a/table/merger.cc b/table/merger.cc index 53aba5536..2b199fb5a 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -8,9 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "table/merger.h" - #include - #include "db/pinned_iterators_manager.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" @@ -123,6 +121,29 @@ class MergingIterator : public InternalIterator { } } + virtual void SeekForPrev(const Slice& target) override { + ClearHeaps(); + InitMaxHeap(); + + for (auto& child : children_) { + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.SeekForPrev(target); + } + PERF_COUNTER_ADD(seek_child_seek_count, 1); + + if (child.Valid()) { + PERF_TIMER_GUARD(seek_max_heap_time); + maxHeap_->push(&child); + } + } + direction_ = kReverse; + { + PERF_TIMER_GUARD(seek_max_heap_time); + current_ = CurrentReverse(); + } + } + virtual void Next() override { assert(Valid()); diff --git a/table/mock_table.h b/table/mock_table.h index d9afba46f..3af5349e6 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -80,6 +80,12 @@ class MockTableIterator : public InternalIterator { itr_ = table_.lower_bound(str_target); } + void SeekForPrev(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.upper_bound(str_target); + Prev(); + } + void Next() override { ++itr_; } void Prev() override { diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index d501606cf..e46a33a40 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -65,6 +65,8 @@ class PlainTableIterator : public InternalIterator { void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void Next() override; void Prev() override; @@ -688,6 +690,12 @@ void PlainTableIterator::Seek(const Slice& target) { } } +void PlainTableIterator::SeekForPrev(const Slice& target) { + assert(false); + status_ = + Status::NotSupported("SeekForPrev() is not supported in PlainTable"); +} + void PlainTableIterator::Next() { offset_ = next_offset_; if (offset_ < table_->file_info_.data_end_offset) { diff --git a/table/table_test.cc b/table/table_test.cc index fb1d24b06..5e2adb7f2 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -257,6 +257,12 @@ class KeyConvertingIterator : public InternalIterator { AppendInternalKey(&encoded, ikey); iter_->Seek(encoded); } + virtual void SeekForPrev(const Slice& target) override { + ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); + std::string encoded; + AppendInternalKey(&encoded, ikey); + iter_->SeekForPrev(encoded); + } virtual void SeekToFirst() override { iter_->SeekToFirst(); } virtual void SeekToLast() override { iter_->SeekToLast(); } virtual void Next() override { iter_->Next(); } @@ -465,6 +471,9 @@ class InternalIteratorFromIterator : public InternalIterator { explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {} virtual bool Valid() const override { return it_->Valid(); } virtual void Seek(const Slice& target) override { it_->Seek(target); } + virtual void SeekForPrev(const Slice& target) override { + it_->SeekForPrev(target); + } virtual void SeekToFirst() override { it_->SeekToFirst(); } virtual void SeekToLast() override { it_->SeekToLast(); } virtual void Next() override { it_->Next(); } diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index 54523fad7..1beafdd66 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "table/two_level_iterator.h" - #include "db/pinned_iterators_manager.h" #include "rocksdb/options.h" #include "rocksdb/table.h" @@ -41,6 +40,7 @@ class TwoLevelIterator : public InternalIterator { } virtual void Seek(const Slice& target) override; + virtual void SeekForPrev(const Slice& target) override; virtual void SeekToFirst() override; virtual void SeekToLast() override; virtual void Next() override; @@ -126,6 +126,28 @@ void TwoLevelIterator::Seek(const Slice& target) { SkipEmptyDataBlocksForward(); } +void TwoLevelIterator::SeekForPrev(const Slice& target) { + if (state_->check_prefix_may_match && !state_->PrefixMayMatch(target)) { + SetSecondLevelIterator(nullptr); + return; + } + first_level_iter_.Seek(target); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + if (!Valid()) { + if (!first_level_iter_.Valid()) { + first_level_iter_.SeekToLast(); + InitDataBlock(); + if (second_level_iter_.iter() != nullptr) { + second_level_iter_.SeekForPrev(target); + } + } + SkipEmptyDataBlocksBackward(); + } +} + void TwoLevelIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); diff --git a/util/testutil.h b/util/testutil.h index fddf7a37a..fb1a26c62 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -159,6 +159,16 @@ class VectorIterator : public InternalIterator { keys_.begin(); } + virtual void SeekForPrev(const Slice& target) override { + current_ = std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + if (!Valid()) { + SeekToLast(); + } else { + Prev(); + } + } + virtual void Next() override { current_++; } virtual void Prev() override { current_--; } diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index f16bcc51b..a9b445e65 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -109,6 +109,8 @@ class TtlIterator : public Iterator { void Seek(const Slice& target) override { iter_->Seek(target); } + void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); } + void Next() override { iter_->Next(); } void Prev() override { iter_->Prev(); } diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index cbd4bf868..96f0459b3 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -66,6 +66,13 @@ class BaseDeltaIterator : public Iterator { UpdateCurrent(); } + void SeekForPrev(const Slice& k) override { + forward_ = false; + base_iterator_->SeekForPrev(k); + delta_iterator_->SeekForPrev(k); + UpdateCurrent(); + } + void Next() override { if (!Valid()) { status_ = Status::NotSupported("Next() on invalid iterator"); @@ -334,6 +341,11 @@ class WBWIIteratorImpl : public WBWIIterator { skip_list_iter_.Seek(&search_entry); } + virtual void SeekForPrev(const Slice& key) override { + WriteBatchIndexEntry search_entry(&key, column_family_id_); + skip_list_iter_.SeekForPrev(&search_entry); + } + virtual void Next() override { skip_list_iter_.Next(); } virtual void Prev() override { skip_list_iter_.Prev(); } diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index c04650d5d..1898fc62f 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -515,6 +515,10 @@ class KVIter : public Iterator { } } virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); } + virtual void SeekForPrev(const Slice& k) { + iter_ = map_->upper_bound(k.ToString()); + Prev(); + } virtual void Next() { ++iter_; } virtual void Prev() { if (iter_ == map_->begin()) {