diff --git a/HISTORY.md b/HISTORY.md index 382e899f7..8074553d4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. +* Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. @@ -15,6 +16,7 @@ * Fix a race condition between WritePrepared::Get and ::Put with duplicate keys. * Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor. + ## 6.1.0 (3/27/2019) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 6a5188c77..d41d417eb 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -15,6 +15,7 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" +#include "table/flush_block_policy.h" namespace rocksdb { @@ -59,35 +60,6 @@ class DBIteratorTest : public DBTestBase, std::vector> read_callbacks_; }; -class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { - public: - bool Update(const Slice& /*key*/, const Slice& /*value*/) override { - if (!start_) { - start_ = true; - return false; - } - return true; - } - - private: - bool start_ = false; -}; - -class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { - public: - explicit FlushBlockEveryKeyPolicyFactory() {} - - const char* Name() const override { - return "FlushBlockEveryKeyPolicyFactory"; - } - - FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& /*table_options*/, - const BlockBuilder& /*data_block_builder*/) const override { - return new FlushBlockEveryKeyPolicy; - } -}; - TEST_P(DBIteratorTest, IteratorProperty) { // The test needs to be changed if kPersistedTier is supported in iterator. Options options = CurrentOptions(); @@ -1034,48 +1006,48 @@ TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) { #endif TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { - int upper_bound_hits = 0; - Options options = CurrentOptions(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", - [&upper_bound_hits](void* arg) { - assert(arg != nullptr); - upper_bound_hits += (*static_cast(arg) ? 1 : 0); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - options.env = env_; - options.create_if_missing = true; - options.prefix_extractor = nullptr; - BlockBasedTableOptions table_options; - table_options.flush_block_policy_factory = - std::make_shared(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + for (auto format_version : {2, 3, 4}) { + int upper_bound_hits = 0; + Options options = CurrentOptions(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableIterator:out_of_bound", + [&upper_bound_hits](void*) { upper_bound_hits++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + BlockBasedTableOptions table_options; + table_options.format_version = format_version; + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Flush()); + DestroyAndReopen(options); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Flush()); - Slice ub("foo3"); - ReadOptions ro; - ro.iterate_upper_bound = &ub; + Slice ub("foo3"); + ReadOptions ro; + ro.iterate_upper_bound = &ub; - std::unique_ptr iter(NewIterator(ro)); + std::unique_ptr iter(NewIterator(ro)); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); - ASSERT_EQ(upper_bound_hits, 0); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + ASSERT_EQ(upper_bound_hits, 0); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo2")), 0); - ASSERT_EQ(upper_bound_hits, 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo2")), 0); + ASSERT_EQ(upper_bound_hits, 0); - iter->Next(); - ASSERT_FALSE(iter->Valid()); - ASSERT_EQ(upper_bound_hits, 1); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(upper_bound_hits, 1); + } } // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index d1c073468..94e448ee9 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -381,9 +381,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } } - Slice user_key; + Slice target_user_key; if (!seek_to_first) { - user_key = ExtractUserKey(internal_key); + target_user_key = ExtractUserKey(internal_key); } const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); @@ -396,8 +396,8 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } else { // If the target key passes over the larget key, we are sure Next() // won't go over this file. - if (user_comparator_->Compare(user_key, - l0[i]->largest.user_key()) > 0) { + if (user_comparator_->Compare(target_user_key, + l0[i]->largest.user_key()) > 0) { if (read_options_.iterate_upper_bound != nullptr) { has_iter_trimmed_for_upper_bound_ = true; DeleteIterator(l0_iters_[i]); diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index f9d9f2feb..e3eb18908 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -407,9 +407,10 @@ bool FragmentedRangeTombstoneIterator::Valid() const { } SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum( - const Slice& user_key) { - SeekToCoveringTombstone(user_key); - return ValidPos() && ucmp_->Compare(start_key(), user_key) <= 0 ? seq() : 0; + const Slice& target_user_key) { + SeekToCoveringTombstone(target_user_key); + return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq() + : 0; } std::map> diff --git a/table/block.cc b/table/block.cc index 7c83ebb64..80bef4a91 100644 --- a/table/block.cc +++ b/table/block.cc @@ -286,9 +286,10 @@ void DataBlockIter::Seek(const Slice& target) { // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno // but larger type). bool DataBlockIter::SeekForGetImpl(const Slice& target) { - Slice user_key = ExtractUserKey(target); + Slice target_user_key = ExtractUserKey(target); uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); - uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key); + uint8_t entry = + data_block_hash_index_->Lookup(data_, map_offset, target_user_key); if (entry == kCollision) { // HashSeek not effective, falling back @@ -360,7 +361,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { return true; } - if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) { + if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } diff --git a/table/block.h b/table/block.h index 737874abd..7ef001d58 100644 --- a/table/block.h +++ b/table/block.h @@ -496,6 +496,13 @@ class IndexBlockIter final : public BlockIter { value_delta_encoded_ = !value_is_full; } + Slice user_key() const override { + if (key_includes_seq_) { + return ExtractUserKey(key()); + } + return key(); + } + virtual BlockHandle value() const override { assert(Valid()); if (value_delta_encoded_) { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index b58960a85..c6ceacdcb 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2347,6 +2347,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { block_iter_.Seek(target); FindKeyForward(); + CheckOutOfBound(); assert( !block_iter_.Valid() || (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || @@ -2410,6 +2411,7 @@ void BlockBasedTableIterator::SeekToFirst() { InitDataBlock(); block_iter_.SeekToFirst(); FindKeyForward(); + CheckOutOfBound(); } template @@ -2492,19 +2494,33 @@ void BlockBasedTableIterator::InitDataBlock() { template void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { return; } + // Whether next data block is out of upper bound, if there is one. + bool next_block_is_out_of_bound = false; + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + next_block_is_out_of_bound = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + } ResetDataIter(); - // We used to check the current index key for upperbound. - // It will only save a data reading for a small percentage of use cases, - // so for code simplicity, we removed it. We can add it back if there is a - // significnat performance regression. index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } if (index_iter_->Valid()) { InitDataBlock(); @@ -2513,25 +2529,10 @@ void BlockBasedTableIterator::FindKeyForward() { return; } } - - // Check upper bound on the current key - bool reached_upper_bound = - (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid() && - user_comparator_.Compare(ExtractUserKey(block_iter_.key()), - *read_options_.iterate_upper_bound) >= 0); - TEST_SYNC_POINT_CALLBACK( - "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", - &reached_upper_bound); - if (reached_upper_bound) { - is_out_of_bound_ = true; - return; - } } template void BlockBasedTableIterator::FindKeyBackward() { - assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { return; @@ -2552,6 +2553,15 @@ void BlockBasedTableIterator::FindKeyBackward() { // code simplicity. } +template +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && block_iter_.Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index e75a71cda..82fe3c888 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -621,6 +621,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { assert(Valid()); return block_iter_.key(); } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } TValue value() const override { assert(Valid()); return block_iter_.value(); @@ -635,6 +639,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { } } + // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -685,6 +690,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void InitDataBlock(); void FindKeyForward(); void FindKeyBackward(); + void CheckOutOfBound(); private: BlockBasedTable* table_; diff --git a/table/flush_block_policy.h b/table/flush_block_policy.h new file mode 100644 index 000000000..1cc7361a3 --- /dev/null +++ b/table/flush_block_policy.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/flush_block_policy.h" + +namespace rocksdb { + +// FlushBlockEveryKeyPolicy currently used only in tests. + +class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { + public: + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (!start_) { + start_ = true; + return false; + } + return true; + } + + private: + bool start_ = false; +}; + +class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { + public: + explicit FlushBlockEveryKeyPolicyFactory() {} + + const char* Name() const override { + return "FlushBlockEveryKeyPolicyFactory"; + } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return new FlushBlockEveryKeyPolicy; + } +}; + +} // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index a173d6069..d08fcae25 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -7,6 +7,7 @@ #pragma once #include +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -64,6 +65,10 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual Slice key() const = 0; + // Return user key for the current entry. + // REQUIRES: Valid() + virtual Slice user_key() const { return ExtractUserKey(key()); } + // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator. diff --git a/table/table_test.cc b/table/table_test.cc index f217fe50a..a62ce4255 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -38,6 +38,7 @@ #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/block_fetcher.h" +#include "table/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" @@ -276,6 +277,7 @@ class KeyConvertingIterator : public InternalIterator { void SeekToLast() override { iter_->SeekToLast(); } void Next() override { iter_->Next(); } void Prev() override { iter_->Prev(); } + bool IsOutOfBound() override { return iter_->IsOutOfBound(); } Slice key() const override { assert(Valid()); @@ -3871,6 +3873,84 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { } } +// BlockBasedTableIterator should invalidate itself and return +// OutOfBound()=true immediately after Seek(), to allow LevelIterator +// filter out corresponding level. +TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { + TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/); + c.Add("foo", "v1"); + std::vector keys; + stl_wrappers::KVMap kvmap; + Options options; + BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); + options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_opt, + GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); + auto* reader = c.GetTableReader(); + ReadOptions read_opt; + std::string upper_bound = "bar"; + Slice upper_bound_slice(upper_bound); + read_opt.iterate_upper_bound = &upper_bound_slice; + std::unique_ptr iter; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("foo"); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); +} + +// BlockBasedTableIterator should invalidate itself and return +// OutOfBound()=true after Next(), if it finds current index key is no smaller +// than upper bound, unless it is pointing to the last data block. +TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { + TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/); + c.Add("bar", "v"); + c.Add("foo", "v"); + std::vector keys; + stl_wrappers::KVMap kvmap; + Options options; + BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); + table_opt.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_opt, + GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); + auto* reader = c.GetTableReader(); + ReadOptions read_opt; + std::string ub1 = "bar_after"; + Slice ub_slice1(ub1); + read_opt.iterate_upper_bound = &ub_slice1; + std::unique_ptr iter; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); + std::string ub2 = "foo_after"; + Slice ub_slice2(ub2); + read_opt.iterate_upper_bound = &ub_slice2; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_FALSE(iter->IsOutOfBound()); +} + } // namespace rocksdb int main(int argc, char** argv) {