From 80ade9ad830d9bc6e46388cf1ea5fb1fd623063b Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Fri, 22 Jun 2018 15:14:05 -0700
Subject: [PATCH] Pin top-level index on partitioned index/filter blocks
 (#4037)

Summary:
The top-level index in partitioned index/filter blocks is small and could be
pinned in memory. So far the only way to achieve that was to set
cache_index_and_filter_blocks to false, which makes it difficult to keep
track of the total memory usage. This patch introduces
pin_top_level_index_and_filter, which, in combination with
cache_index_and_filter_blocks=true, keeps the top-level index in the cache
but also pins it, avoiding both cache misses and cache lookup overhead.
(Usage sketches are appended after the patch.)

Closes https://github.com/facebook/rocksdb/pull/4037

Differential Revision: D8596218

Pulled By: maysamyabandeh

fbshipit-source-id: 3a5f7f9ca6b4b525b03ff6bd82354881ae974ad2
---
 HISTORY.md                               |   1 +
 db/c.cc                                  |   5 +
 examples/rocksdb_option_file_example.ini |   1 +
 include/rocksdb/c.h                      |   3 +
 include/rocksdb/table.h                  |   7 ++
 options/options_settable_test.cc         |   1 +
 table/block_based_table_factory.cc       |   3 +
 table/block_based_table_factory.h        |   4 +
 table/block_based_table_reader.cc        |  96 ++++++++++++------
 table/block_based_table_reader.h         |  14 ++-
 table/table_test.cc                      | 124 ++++++++++++-----------
 tools/db_bench_tool.cc                   |   6 ++
 util/testutil.cc                         |   1 +
 13 files changed, 168 insertions(+), 98 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index aa177d655..4d6b089f1 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -7,6 +7,7 @@
 ### New Features
 * Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used.
 * Improve the performance of iterators doing long range scans by using readahead, when using direct IO.
+* pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false.
 
 ### Bug Fixes
 * fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
diff --git a/db/c.cc b/db/c.cc
index b1c7cb21d..05076c309 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -1985,6 +1985,11 @@ void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
   options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
 }
 
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.pin_top_level_index_and_filter = v;
+}
+
 void rocksdb_options_set_block_based_table_factory(
     rocksdb_options_t *opt,
     rocksdb_block_based_table_options_t* table_options) {
diff --git a/examples/rocksdb_option_file_example.ini b/examples/rocksdb_option_file_example.ini
index 8e07131b3..351f1ed01 100644
--- a/examples/rocksdb_option_file_example.ini
+++ b/examples/rocksdb_option_file_example.ini
@@ -138,6 +138,7 @@
   block_restart_interval=16
   cache_index_and_filter_blocks=false
   pin_l0_filter_and_index_blocks_in_cache=false
+  pin_top_level_index_and_filter=false
   index_type=kBinarySearch
   hash_index_allow_collision=true
   flush_block_policy_factory=FlushBlockBySizePolicyFactory
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index f89f66898..7f5d4d724 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -700,6 +700,9 @@ rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
     rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+    rocksdb_block_based_table_options_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
     rocksdb_options_t* opt,
     rocksdb_block_based_table_options_t* table_options);
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 6ee1c6c38..413a92a49 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -77,6 +77,13 @@ struct BlockBasedTableOptions {
   // evicted from cache when the table reader is freed.
   bool pin_l0_filter_and_index_blocks_in_cache = false;
 
+  // If cache_index_and_filter_blocks is true and this is true, then the
+  // top-level index of partitioned filter and index blocks is stored in the
+  // cache, but a reference is held in the "table reader" object so the blocks
+  // are pinned and only evicted from the cache when the table reader is
+  // freed. This is not limited to L0 files in the LSM tree.
+  bool pin_top_level_index_and_filter = true;
+
   // The index type that will be used for this table.
   enum IndexType : char {
     // A space efficient index block that is optimized for
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index b88cd0215..f4c5f85ba 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -140,6 +140,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "cache_index_and_filter_blocks=1;"
       "cache_index_and_filter_blocks_with_high_priority=true;"
      "pin_l0_filter_and_index_blocks_in_cache=1;"
+      "pin_top_level_index_and_filter=1;"
       "index_type=kHashSearch;"
       "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
       "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc
index d53b9833c..700090769 100644
--- a/table/block_based_table_factory.cc
+++ b/table/block_based_table_factory.cc
@@ -151,6 +151,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
            "  pin_l0_filter_and_index_blocks_in_cache: %d\n",
            table_options_.pin_l0_filter_and_index_blocks_in_cache);
   ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  pin_top_level_index_and_filter: %d\n",
+           table_options_.pin_top_level_index_and_filter);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize, "  index_type: %d\n",
            table_options_.index_type);
   ret.append(buffer);
diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h
index b9d3a97d6..47e7d923b 100644
--- a/table/block_based_table_factory.h
+++ b/table/block_based_table_factory.h
@@ -158,6 +158,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
     {"block_align",
      {offsetof(struct BlockBasedTableOptions, block_align),
+      OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+    {"pin_top_level_index_and_filter",
+     {offsetof(struct BlockBasedTableOptions,
+               pin_top_level_index_and_filter),
       OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
 #endif  // !ROCKSDB_LITE
 }  // namespace rocksdb
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index 213b403d8..2721d4cf2 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -898,17 +898,39 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                        rep->ioptions.info_log);
   }
 
-  const bool pin =
+  // prefetch both index and filters, down to all partitions
+  const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
+  BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();
+  // prefetch the first level of index
+  const bool prefetch_index =
+      prefetch_all ||
+      (table_options.pin_top_level_index_and_filter &&
+       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+  // prefetch the first level of filter
+  const bool prefetch_filter =
+      prefetch_all || (table_options.pin_top_level_index_and_filter &&
+                       rep->filter_type == Rep::FilterType::kPartitionedFilter);
+  // Partition filters cannot be enabled without partition indexes
+  assert(!prefetch_filter || prefetch_index);
+  // pin both index and filters, down to all partitions
+  const bool pin_all =
       rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
+  // pin the first level of index
+  const bool pin_index =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+  // pin the first level of filter
+  const bool pin_filter =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  rep->filter_type == Rep::FilterType::kPartitionedFilter);
   // pre-fetching of blocks is turned on
   // Will use block cache for index/filter blocks access
   // Always prefetch index and filter for level 0
   if (table_options.cache_index_and_filter_blocks) {
-    if (prefetch_index_and_filter_in_cache || level == 0) {
-      assert(table_options.block_cache != nullptr);
+    assert(table_options.block_cache != nullptr);
+    if (prefetch_index) {
       // Hack: Call NewIndexIterator() to implicitly add index to the
       // block_cache
-      CachableEntry index_entry;
       bool prefix_extractor_changed = false;
       // check prefix_extractor match only if hash based index is used
@@ -924,27 +946,29 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
         // On success it should give us ownership of the `CachableEntry` by
         // populating `index_entry`.
         assert(index_entry.value != nullptr);
-        index_entry.value->CacheDependencies(pin);
-        if (pin) {
+        if (prefetch_all) {
+          index_entry.value->CacheDependencies(pin_all);
+        }
+        if (pin_index) {
           rep->index_entry = std::move(index_entry);
         } else {
           index_entry.Release(table_options.block_cache.get());
         }
-
-        // Hack: Call GetFilter() to implicitly add filter to the block_cache
-        auto filter_entry = new_table->GetFilter(prefix_extractor);
-        if (filter_entry.value != nullptr) {
-          filter_entry.value->CacheDependencies(pin, prefix_extractor);
-        }
-        // if pin_l0_filter_and_index_blocks_in_cache is true, and this is
-        // a level0 file, then save it in rep_->filter_entry; it will be
-        // released in the destructor only, hence it will be pinned in the
-        // cache while this reader is alive
-        if (pin) {
-          rep->filter_entry = filter_entry;
-        } else {
-          filter_entry.Release(table_options.block_cache.get());
-        }
+      }
+    }
+    if (s.ok() && prefetch_filter) {
+      // Hack: Call GetFilter() to implicitly add filter to the block_cache
+      auto filter_entry = new_table->GetFilter(prefix_extractor);
+      if (filter_entry.value != nullptr && prefetch_all) {
+        filter_entry.value->CacheDependencies(pin_all, prefix_extractor);
+      }
+      // if pin_filter is true then save it in rep_->filter_entry; it will be
+      // released in the destructor only, hence it will be pinned in the
+      // cache while this reader is alive
+      if (pin_filter) {
+        rep->filter_entry = filter_entry;
+      } else {
+        filter_entry.Release(table_options.block_cache.get());
       }
     }
   } else {
@@ -960,7 +984,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
       // are hence follow the configuration for pin and prefetch regardless of
      // the value of cache_index_and_filter_blocks
       if (prefetch_index_and_filter_in_cache || level == 0) {
-        rep->index_reader->CacheDependencies(pin);
+        rep->index_reader->CacheDependencies(pin_all);
       }
 
       // Set filter block
@@ -973,7 +997,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
         // Refer to the comment above about paritioned indexes always being
         // cached
         if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
-          filter->CacheDependencies(pin, prefix_extractor);
+          filter->CacheDependencies(pin_all, prefix_extractor);
         }
       }
     } else {
@@ -2419,18 +2443,11 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
   return in_cache;
 }
 
-// REQUIRES: The following fields of rep_ should have already been populated:
-// 1. file
-// 2. index_handle,
-// 3. options
-// 4. internal_comparator
-// 5. index_type
-Status BlockBasedTable::CreateIndexReader(
-    FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
-    InternalIterator* preloaded_meta_index_iter, int level) {
+BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
   // Some old version of block-based tables don't have index type present in
   // table properties. If that's the case we can safely use the kBinarySearch.
-  auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
+  BlockBasedTableOptions::IndexType index_type_on_file =
+      BlockBasedTableOptions::kBinarySearch;
   if (rep_->table_properties) {
     auto& props = rep_->table_properties->user_collected_properties;
     auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
@@ -2441,6 +2458,19 @@ Status BlockBasedTable::CreateIndexReader(
       rep_->index_type = index_type_on_file;
     }
   }
+  return index_type_on_file;
+}
+
+// REQUIRES: The following fields of rep_ should have already been populated:
+// 1. file
+// 2. index_handle,
+// 3. options
+// 4. internal_comparator
+// 5. index_type
+Status BlockBasedTable::CreateIndexReader(
+    FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
+    InternalIterator* preloaded_meta_index_iter, int level) {
+  auto index_type_on_file = UpdateIndexType();
 
   auto file = rep_->file.get();
   const InternalKeyComparator* icomparator = &rep_->internal_comparator;
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 8205f108b..3cbbbbc87 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -322,6 +322,9 @@ class BlockBasedTable : public TableReader {
 
   void ReadMeta(const Footer& footer);
 
+  // Figure out the index type, update it in rep_, and also return it.
+  BlockBasedTableOptions::IndexType UpdateIndexType();
+
   // Create a index reader based on the index type stored in the table.
   // Optionally, user can pass a preloaded meta_index_iter for the index that
   // need to access extra meta blocks for index construction. This parameter
@@ -478,11 +481,12 @@ struct BlockBasedTable::Rep {
   // block to extract prefix without knowing if a key is internal or not.
   unique_ptr internal_prefix_transform;
 
-  // only used in level 0 files:
-  // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
-  // LRU cache, but we always keep the filter & idndex block's handle checked
-  // out here (=we don't call Release()), plus the parsed out objects
-  // the LRU cache will never push flush them out, hence they're pinned
+  // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache
+  // is true, or in all levels when pin_top_level_index_and_filter is set in
+  // combination with partitioned index/filters: then we do use the LRU cache,
+  // but we always keep the filter & index block's handle checked out here
+  // (=we don't call Release()), plus the parsed-out objects; the LRU cache
+  // will never push them out, hence they're pinned
   CachableEntry<FilterBlockReader> filter_entry;
   CachableEntry<IndexReader> index_entry;
   // range deletion meta-block is pinned through reader's lifetime when LRU
diff --git a/table/table_test.cc b/table/table_test.cc
index 3b34eb2ee..bada1a2c4 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2330,70 +2330,74 @@ TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) {
   }
   for (bool index_and_filter_in_cache : {true, false}) {
     for (bool pin_l0 : {true, false}) {
-      if (pin_l0 && !index_and_filter_in_cache) {
-        continue;
-      }
-      // Create a table
-      Options opt;
-      unique_ptr<InternalKeyComparator> ikc;
-      ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
-      opt.compression = kNoCompression;
-      BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-      table_options.block_size = 1024;
-      table_options.index_type =
-          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-      table_options.pin_l0_filter_and_index_blocks_in_cache = pin_l0;
-      table_options.partition_filters = partition_filter;
-      table_options.cache_index_and_filter_blocks =
-          index_and_filter_in_cache;
-      // big enough so we don't ever lose cached values.
-      table_options.block_cache = std::shared_ptr<Cache>(
-          new MockCache(16 * 1024 * 1024, 4, false, 0.0));
-      table_options.filter_policy.reset(
-          rocksdb::NewBloomFilterPolicy(10, block_based_filter));
-      opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      for (bool pin_top_level : {true, false}) {
+        if (pin_l0 && !index_and_filter_in_cache) {
+          continue;
+        }
+        // Create a table
+        Options opt;
+        unique_ptr<InternalKeyComparator> ikc;
+        ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
+        opt.compression = kNoCompression;
+        BlockBasedTableOptions table_options =
+            GetBlockBasedTableOptions();
+        table_options.block_size = 1024;
+        table_options.index_type =
+            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+        table_options.pin_l0_filter_and_index_blocks_in_cache = pin_l0;
+        table_options.pin_top_level_index_and_filter = pin_top_level;
+        table_options.partition_filters = partition_filter;
+        table_options.cache_index_and_filter_blocks =
+            index_and_filter_in_cache;
+        // big enough so we don't ever lose cached values.
+        table_options.block_cache = std::shared_ptr<Cache>(
+            new MockCache(16 * 1024 * 1024, 4, false, 0.0));
+        table_options.filter_policy.reset(
+            rocksdb::NewBloomFilterPolicy(10, block_based_filter));
+        opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-      bool convert_to_internal_key = false;
-      TableConstructor c(BytewiseComparator(), convert_to_internal_key,
-                         level);
-      std::string user_key = "k01";
-      std::string key =
-          InternalKey(user_key, 0, kTypeValue).Encode().ToString();
-      c.Add(key, "hello");
-      std::vector<std::string> keys;
-      stl_wrappers::KVMap kvmap;
-      const ImmutableCFOptions ioptions(opt);
-      const MutableCFOptions moptions(opt);
-      c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys,
-               &kvmap);
+        bool convert_to_internal_key = false;
+        TableConstructor c(BytewiseComparator(), convert_to_internal_key,
+                           level);
+        std::string user_key = "k01";
+        std::string key =
+            InternalKey(user_key, 0, kTypeValue).Encode().ToString();
+        c.Add(key, "hello");
+        std::vector<std::string> keys;
+        stl_wrappers::KVMap kvmap;
+        const ImmutableCFOptions ioptions(opt);
+        const MutableCFOptions moptions(opt);
+        c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys,
+                 &kvmap);
 
-      // Doing a read to make index/filter loaded into the cache
-      auto table_reader =
-          dynamic_cast<BlockBasedTable*>(c.GetTableReader());
-      PinnableSlice value;
-      GetContext get_context(opt.comparator, nullptr, nullptr, nullptr,
-                             GetContext::kNotFound, user_key, &value,
-                             nullptr, nullptr, nullptr, nullptr);
-      InternalKey ikey(user_key, 0, kTypeValue);
-      auto s = table_reader->Get(ReadOptions(), key, &get_context,
-                                 moptions.prefix_extractor.get());
-      ASSERT_EQ(get_context.State(), GetContext::kFound);
-      ASSERT_STREQ(value.data(), "hello");
+        // Doing a read to make index/filter loaded into the cache
+        auto table_reader =
+            dynamic_cast<BlockBasedTable*>(c.GetTableReader());
+        PinnableSlice value;
+        GetContext get_context(opt.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value,
+                               nullptr, nullptr, nullptr, nullptr);
+        InternalKey ikey(user_key, 0, kTypeValue);
+        auto s = table_reader->Get(ReadOptions(), key, &get_context,
+                                   moptions.prefix_extractor.get());
+        ASSERT_EQ(get_context.State(), GetContext::kFound);
+        ASSERT_STREQ(value.data(), "hello");
 
-      // Close the table
-      c.ResetTableReader();
+        // Close the table
+        c.ResetTableReader();
 
-      auto usage = table_options.block_cache->GetUsage();
-      auto pinned_usage = table_options.block_cache->GetPinnedUsage();
-      // The only usage must be for marked data blocks
-      ASSERT_EQ(usage, MockCache::marked_size_);
-      // There must be some pinned data since PinnableSlice has not
-      // released them yet
-      ASSERT_GT(pinned_usage, 0);
-      // Release pinnable slice reousrces
-      value.Reset();
-      pinned_usage = table_options.block_cache->GetPinnedUsage();
-      ASSERT_EQ(pinned_usage, 0);
+        auto usage = table_options.block_cache->GetUsage();
+        auto pinned_usage = table_options.block_cache->GetPinnedUsage();
+        // The only usage must be for marked data blocks
+        ASSERT_EQ(usage, MockCache::marked_size_);
+        // There must be some pinned data since PinnableSlice has not
+        // released them yet
+        ASSERT_GT(pinned_usage, 0);
+        // Release pinnable slice resources
+        value.Reset();
+        pinned_usage = table_options.block_cache->GetPinnedUsage();
+        ASSERT_EQ(pinned_usage, 0);
+      }
     }
   }
 }
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index a1634ce7d..d12104e86 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -432,6 +432,10 @@ DEFINE_int32(ops_between_duration_checks, 1000,
 DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
             "Pin index/filter blocks of L0 files in block cache.");
 
+DEFINE_bool(
+    pin_top_level_index_and_filter, false,
+    "Pin top-level index of partitioned index/filter blocks in block cache.");
+
 DEFINE_int32(block_size,
              static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
              "Number of bytes in a block.");
@@ -3186,6 +3190,8 @@ void VerifyDBFromDB(std::string& truth_db_name) {
         FLAGS_cache_index_and_filter_blocks;
     block_based_options.pin_l0_filter_and_index_blocks_in_cache =
         FLAGS_pin_l0_filter_and_index_blocks_in_cache;
+    block_based_options.pin_top_level_index_and_filter =
+        FLAGS_pin_top_level_index_and_filter;
     if (FLAGS_cache_high_pri_pool_ratio > 1e-6) {  // > 0.0 + eps
       block_based_options.cache_index_and_filter_blocks_with_high_priority =
           true;
diff --git a/util/testutil.cc b/util/testutil.cc
index fa32498a2..7625d20ee 100644
--- a/util/testutil.cc
+++ b/util/testutil.cc
@@ -195,6 +195,7 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) {
   BlockBasedTableOptions opt;
   opt.cache_index_and_filter_blocks = rnd->Uniform(2);
   opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2);
+  opt.pin_top_level_index_and_filter = rnd->Uniform(2);
   opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch
                                    : BlockBasedTableOptions::kHashSearch;
   opt.hash_index_allow_collision = rnd->Uniform(2);
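
Usage sketch (not part of the patch): a minimal C++ example of how the new option is meant to be combined with cache_index_and_filter_blocks and partitioned index/filters. The DB path, cache size, and bloom bits-per-key are illustrative assumptions, not values taken from the patch.

#include <cassert>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::BlockBasedTableOptions table_options;
  // Keep index/filter blocks in the block cache so their memory is accounted
  // for by the cache instead of being allocated outside of it.
  table_options.cache_index_and_filter_blocks = true;
  // Partitioned index and filters; only their top-level index is affected by
  // the new option.
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10 /* bits_per_key */, false));
  // Added by this patch (default true): prefetch and pin the top-level index
  // of partitioned index/filter blocks in the block cache.
  table_options.pin_top_level_index_and_filter = true;
  table_options.block_cache = rocksdb::NewLRUCache(128 << 20 /* 128 MB */);

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/pin_top_level_example", &db);
  assert(s.ok());
  delete db;
  return 0;
}

The same knob is reachable through the C API setter added above (rocksdb_block_based_options_set_pin_top_level_index_and_filter) and, for benchmarking, through the new -pin_top_level_index_and_filter flag in db_bench.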
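
A second sketch, also not part of the patch: one way to observe the effect through the block-cache counters, loosely mirroring the GetUsage()/GetPinnedUsage() checks in the table_test.cc hunk above. The key, value, and path are made up for illustration, and the exact numbers reported will depend on the build and format_version.

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(64 << 20);

  rocksdb::BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;
  table_options.partition_filters = true;
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  table_options.pin_top_level_index_and_filter = true;  // option from this patch
  table_options.block_cache = cache;

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/pin_top_level_observe", &db);
  assert(s.ok());

  // Write a key and flush so an SST file with partitioned metadata exists.
  s = db->Put(rocksdb::WriteOptions(), "k01", "hello");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());
  assert(s.ok());

  // Reads go through the table reader; with the option on, its top-level
  // index/filter handles are kept checked out of the cache.
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "k01", &value);
  assert(s.ok() && value == "hello");

  std::cout << "block cache usage:        " << cache->GetUsage() << "\n"
            << "block cache pinned usage: " << cache->GetPinnedUsage()
            << std::endl;

  delete db;
  return 0;
}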