From f1239d5f10e604a8eecee774a1a9bf5f50e23cf4 Mon Sep 17 00:00:00 2001 From: yiwu-arbug Date: Tue, 16 Apr 2019 11:32:03 -0700 Subject: [PATCH] Avoid per-key upper bound check in BlockBasedTableIterator (#5142) Summary: This is second attempt for #5101. Original commit message: `BlockBasedTableIterator` avoid reading next block on `Next()` if it detects the iterator will be out of bound, by checking against index key. The optimization was added in #2239, and by the time it only check the bound per block. It seems later change make it a per-key check, which introduce unnecessary key comparisons. This patch come with two fixes: Fix 1: To optimize checking for bounds, we need comparing the bounds with index key as well. However BlockBasedTableIterator doesn't know whether its index iterator is internally using user keys or internal keys. The patch fixes that by extending InternalIterator with a user_key() function that is overridden by In IndexBlockIter. Fix 2: In #5101 we return `IsOutOfBound()=true` when block index key is out of bound. But the index key can be larger than smallest key of the next file on the level. That file can be within upper bound and should not be filtered out. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5142 Differential Revision: D14907113 Pulled By: siying fbshipit-source-id: ac95775c5b4e7b700f76ab43e39f45402c98fbfb --- HISTORY.md | 2 + db/db_iterator_test.cc | 102 +++++++++++------------------- db/forward_iterator.cc | 8 +-- db/range_tombstone_fragmenter.cc | 7 +- table/block.cc | 7 +- table/block.h | 7 ++ table/block_based_table_reader.cc | 50 +++++++++------ table/block_based_table_reader.h | 6 ++ table/flush_block_policy.h | 41 ++++++++++++ table/internal_iterator.h | 5 ++ table/table_test.cc | 80 +++++++++++++++++++++++ 11 files changed, 220 insertions(+), 95 deletions(-) create mode 100644 table/flush_block_policy.h diff --git a/HISTORY.md b/HISTORY.md index 382e899f7..8074553d4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. +* Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. @@ -15,6 +16,7 @@ * Fix a race condition between WritePrepared::Get and ::Put with duplicate keys. * Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor. + ## 6.1.0 (3/27/2019) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 6a5188c77..d41d417eb 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -15,6 +15,7 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" +#include "table/flush_block_policy.h" namespace rocksdb { @@ -59,35 +60,6 @@ class DBIteratorTest : public DBTestBase, std::vector> read_callbacks_; }; -class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { - public: - bool Update(const Slice& /*key*/, const Slice& /*value*/) override { - if (!start_) { - start_ = true; - return false; - } - return true; - } - - private: - bool start_ = false; -}; - -class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { - public: - explicit FlushBlockEveryKeyPolicyFactory() {} - - const char* Name() const override { - return "FlushBlockEveryKeyPolicyFactory"; - } - - FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& /*table_options*/, - const BlockBuilder& /*data_block_builder*/) const override { - return new FlushBlockEveryKeyPolicy; - } -}; - TEST_P(DBIteratorTest, IteratorProperty) { // The test needs to be changed if kPersistedTier is supported in iterator. Options options = CurrentOptions(); @@ -1034,48 +1006,48 @@ TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) { #endif TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { - int upper_bound_hits = 0; - Options options = CurrentOptions(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", - [&upper_bound_hits](void* arg) { - assert(arg != nullptr); - upper_bound_hits += (*static_cast(arg) ? 1 : 0); - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - options.env = env_; - options.create_if_missing = true; - options.prefix_extractor = nullptr; - BlockBasedTableOptions table_options; - table_options.flush_block_policy_factory = - std::make_shared(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + for (auto format_version : {2, 3, 4}) { + int upper_bound_hits = 0; + Options options = CurrentOptions(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableIterator:out_of_bound", + [&upper_bound_hits](void*) { upper_bound_hits++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + BlockBasedTableOptions table_options; + table_options.format_version = format_version; + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Flush()); + DestroyAndReopen(options); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Flush()); - Slice ub("foo3"); - ReadOptions ro; - ro.iterate_upper_bound = &ub; + Slice ub("foo3"); + ReadOptions ro; + ro.iterate_upper_bound = &ub; - std::unique_ptr iter(NewIterator(ro)); + std::unique_ptr iter(NewIterator(ro)); - iter->Seek("foo"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); - ASSERT_EQ(upper_bound_hits, 0); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + ASSERT_EQ(upper_bound_hits, 0); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("foo2")), 0); - ASSERT_EQ(upper_bound_hits, 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo2")), 0); + ASSERT_EQ(upper_bound_hits, 0); - iter->Next(); - ASSERT_FALSE(iter->Valid()); - ASSERT_EQ(upper_bound_hits, 1); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_EQ(upper_bound_hits, 1); + } } // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index d1c073468..94e448ee9 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -381,9 +381,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } } - Slice user_key; + Slice target_user_key; if (!seek_to_first) { - user_key = ExtractUserKey(internal_key); + target_user_key = ExtractUserKey(internal_key); } const VersionStorageInfo* vstorage = sv_->current->storage_info(); const std::vector& l0 = vstorage->LevelFiles(0); @@ -396,8 +396,8 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, } else { // If the target key passes over the larget key, we are sure Next() // won't go over this file. - if (user_comparator_->Compare(user_key, - l0[i]->largest.user_key()) > 0) { + if (user_comparator_->Compare(target_user_key, + l0[i]->largest.user_key()) > 0) { if (read_options_.iterate_upper_bound != nullptr) { has_iter_trimmed_for_upper_bound_ = true; DeleteIterator(l0_iters_[i]); diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index f9d9f2feb..e3eb18908 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -407,9 +407,10 @@ bool FragmentedRangeTombstoneIterator::Valid() const { } SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum( - const Slice& user_key) { - SeekToCoveringTombstone(user_key); - return ValidPos() && ucmp_->Compare(start_key(), user_key) <= 0 ? seq() : 0; + const Slice& target_user_key) { + SeekToCoveringTombstone(target_user_key); + return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq() + : 0; } std::map> diff --git a/table/block.cc b/table/block.cc index 7c83ebb64..80bef4a91 100644 --- a/table/block.cc +++ b/table/block.cc @@ -286,9 +286,10 @@ void DataBlockIter::Seek(const Slice& target) { // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno // but larger type). bool DataBlockIter::SeekForGetImpl(const Slice& target) { - Slice user_key = ExtractUserKey(target); + Slice target_user_key = ExtractUserKey(target); uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t); - uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key); + uint8_t entry = + data_block_hash_index_->Lookup(data_, map_offset, target_user_key); if (entry == kCollision) { // HashSeek not effective, falling back @@ -360,7 +361,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { return true; } - if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) { + if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) { // the key is not in this block and cannot be at the next block either. return false; } diff --git a/table/block.h b/table/block.h index 737874abd..7ef001d58 100644 --- a/table/block.h +++ b/table/block.h @@ -496,6 +496,13 @@ class IndexBlockIter final : public BlockIter { value_delta_encoded_ = !value_is_full; } + Slice user_key() const override { + if (key_includes_seq_) { + return ExtractUserKey(key()); + } + return key(); + } + virtual BlockHandle value() const override { assert(Valid()); if (value_delta_encoded_) { diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index b58960a85..c6ceacdcb 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2347,6 +2347,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { block_iter_.Seek(target); FindKeyForward(); + CheckOutOfBound(); assert( !block_iter_.Valid() || (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || @@ -2410,6 +2411,7 @@ void BlockBasedTableIterator::SeekToFirst() { InitDataBlock(); block_iter_.SeekToFirst(); FindKeyForward(); + CheckOutOfBound(); } template @@ -2492,19 +2494,33 @@ void BlockBasedTableIterator::InitDataBlock() { template void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); // TODO the while loop inherits from two-level-iterator. We don't know // whether a block can be empty so it can be replaced by an "if". while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { return; } + // Whether next data block is out of upper bound, if there is one. + bool next_block_is_out_of_bound = false; + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + next_block_is_out_of_bound = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + } ResetDataIter(); - // We used to check the current index key for upperbound. - // It will only save a data reading for a small percentage of use cases, - // so for code simplicity, we removed it. We can add it back if there is a - // significnat performance regression. index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } if (index_iter_->Valid()) { InitDataBlock(); @@ -2513,25 +2529,10 @@ void BlockBasedTableIterator::FindKeyForward() { return; } } - - // Check upper bound on the current key - bool reached_upper_bound = - (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid() && - user_comparator_.Compare(ExtractUserKey(block_iter_.key()), - *read_options_.iterate_upper_bound) >= 0); - TEST_SYNC_POINT_CALLBACK( - "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", - &reached_upper_bound); - if (reached_upper_bound) { - is_out_of_bound_ = true; - return; - } } template void BlockBasedTableIterator::FindKeyBackward() { - assert(!is_out_of_bound_); while (!block_iter_.Valid()) { if (!block_iter_.status().ok()) { return; @@ -2552,6 +2553,15 @@ void BlockBasedTableIterator::FindKeyBackward() { // code simplicity. } +template +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && block_iter_.Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index e75a71cda..82fe3c888 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -621,6 +621,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { assert(Valid()); return block_iter_.key(); } + Slice user_key() const override { + assert(Valid()); + return block_iter_.user_key(); + } TValue value() const override { assert(Valid()); return block_iter_.value(); @@ -635,6 +639,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { } } + // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { @@ -685,6 +690,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { void InitDataBlock(); void FindKeyForward(); void FindKeyBackward(); + void CheckOutOfBound(); private: BlockBasedTable* table_; diff --git a/table/flush_block_policy.h b/table/flush_block_policy.h new file mode 100644 index 000000000..1cc7361a3 --- /dev/null +++ b/table/flush_block_policy.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/flush_block_policy.h" + +namespace rocksdb { + +// FlushBlockEveryKeyPolicy currently used only in tests. + +class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { + public: + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (!start_) { + start_ = true; + return false; + } + return true; + } + + private: + bool start_ = false; +}; + +class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { + public: + explicit FlushBlockEveryKeyPolicyFactory() {} + + const char* Name() const override { + return "FlushBlockEveryKeyPolicyFactory"; + } + + FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { + return new FlushBlockEveryKeyPolicy; + } +}; + +} // namespace rocksdb diff --git a/table/internal_iterator.h b/table/internal_iterator.h index a173d6069..d08fcae25 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -7,6 +7,7 @@ #pragma once #include +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -64,6 +65,10 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual Slice key() const = 0; + // Return user key for the current entry. + // REQUIRES: Valid() + virtual Slice user_key() const { return ExtractUserKey(key()); } + // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator. diff --git a/table/table_test.cc b/table/table_test.cc index f217fe50a..a62ce4255 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -38,6 +38,7 @@ #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include "table/block_fetcher.h" +#include "table/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" @@ -276,6 +277,7 @@ class KeyConvertingIterator : public InternalIterator { void SeekToLast() override { iter_->SeekToLast(); } void Next() override { iter_->Next(); } void Prev() override { iter_->Prev(); } + bool IsOutOfBound() override { return iter_->IsOutOfBound(); } Slice key() const override { assert(Valid()); @@ -3871,6 +3873,84 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { } } +// BlockBasedTableIterator should invalidate itself and return +// OutOfBound()=true immediately after Seek(), to allow LevelIterator +// filter out corresponding level. +TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { + TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/); + c.Add("foo", "v1"); + std::vector keys; + stl_wrappers::KVMap kvmap; + Options options; + BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); + options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_opt, + GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); + auto* reader = c.GetTableReader(); + ReadOptions read_opt; + std::string upper_bound = "bar"; + Slice upper_bound_slice(upper_bound); + read_opt.iterate_upper_bound = &upper_bound_slice; + std::unique_ptr iter; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->SeekToFirst(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("foo"); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); +} + +// BlockBasedTableIterator should invalidate itself and return +// OutOfBound()=true after Next(), if it finds current index key is no smaller +// than upper bound, unless it is pointing to the last data block. +TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { + TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/); + c.Add("bar", "v"); + c.Add("foo", "v"); + std::vector keys; + stl_wrappers::KVMap kvmap; + Options options; + BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); + table_opt.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_opt, + GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); + auto* reader = c.GetTableReader(); + ReadOptions read_opt; + std::string ub1 = "bar_after"; + Slice ub_slice1(ub1); + read_opt.iterate_upper_bound = &ub_slice1; + std::unique_ptr iter; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->IsOutOfBound()); + std::string ub2 = "foo_after"; + Slice ub_slice2(ub2); + read_opt.iterate_upper_bound = &ub_slice2; + iter.reset(new KeyConvertingIterator( + reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_FALSE(iter->IsOutOfBound()); +} + } // namespace rocksdb int main(int argc, char** argv) {