Pin top-level index on partitioned index/filter blocks (#4037)
Summary: The top-level index of partitioned index/filter blocks is small and could be pinned in memory. So far we achieve that by setting cache_index_and_filter_blocks to false. This, however, makes it difficult to keep account of the total memory usage. This patch introduces pin_top_level_index_and_filter, which in combination with cache_index_and_filter_blocks=true keeps the top-level index in cache and yet pins it to avoid cache misses and also cache lookup overhead. Closes https://github.com/facebook/rocksdb/pull/4037 Differential Revision: D8596218 Pulled By: maysamyabandeh fbshipit-source-id: 3a5f7f9ca6b4b525b03ff6bd82354881ae974ad2
This commit is contained in:
parent
c726f7fda8
commit
80ade9ad83
@ -7,6 +7,7 @@
|
||||
### New Features
|
||||
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used.
|
||||
* Improve the performance of iterators doing long range scans by using readahead, when using direct IO.
|
||||
* pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false.
|
||||
|
||||
### Bug Fixes
|
||||
* fix deadlock with enable_pipelined_write=true and max_successive_merges > 0
|
||||
|
5
db/c.cc
5
db/c.cc
@ -1985,6 +1985,11 @@ void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
|
||||
options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
|
||||
}
|
||||
|
||||
// C API setter: stores `v` into the pin_top_level_index_and_filter field of
// the table-options object held in `options->rep`.
void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
|
||||
rocksdb_block_based_table_options_t* options, unsigned char v) {
|
||||
options->rep.pin_top_level_index_and_filter = v;
|
||||
}
|
||||
|
||||
void rocksdb_options_set_block_based_table_factory(
|
||||
rocksdb_options_t *opt,
|
||||
rocksdb_block_based_table_options_t* table_options) {
|
||||
|
@ -138,6 +138,7 @@
|
||||
block_restart_interval=16
|
||||
cache_index_and_filter_blocks=false
|
||||
pin_l0_filter_and_index_blocks_in_cache=false
|
||||
pin_top_level_index_and_filter=false
|
||||
index_type=kBinarySearch
|
||||
hash_index_allow_collision=true
|
||||
flush_block_policy_factory=FlushBlockBySizePolicyFactory
|
||||
|
@ -700,6 +700,9 @@ rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority
|
||||
extern ROCKSDB_LIBRARY_API void
|
||||
rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
|
||||
rocksdb_block_based_table_options_t*, unsigned char);
|
||||
extern ROCKSDB_LIBRARY_API void
|
||||
rocksdb_block_based_options_set_pin_top_level_index_and_filter(
|
||||
rocksdb_block_based_table_options_t*, unsigned char);
|
||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
|
||||
rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);
|
||||
|
||||
|
@ -77,6 +77,13 @@ struct BlockBasedTableOptions {
|
||||
// evicted from cache when the table reader is freed.
|
||||
bool pin_l0_filter_and_index_blocks_in_cache = false;
|
||||
|
||||
// If cache_index_and_filter_blocks is true and the below is true, then
|
||||
// the top-level index of partitioned filter and index blocks are stored in
|
||||
// the cache, but a reference is held in the "table reader" object so the
|
||||
// blocks are pinned and only evicted from cache when the table reader is
|
||||
// freed. This is not limited to l0 in LSM tree.
|
||||
bool pin_top_level_index_and_filter = true;
|
||||
|
||||
// The index type that will be used for this table.
|
||||
enum IndexType : char {
|
||||
// A space efficient index block that is optimized for
|
||||
|
@ -140,6 +140,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
|
||||
"cache_index_and_filter_blocks=1;"
|
||||
"cache_index_and_filter_blocks_with_high_priority=true;"
|
||||
"pin_l0_filter_and_index_blocks_in_cache=1;"
|
||||
"pin_top_level_index_and_filter=1;"
|
||||
"index_type=kHashSearch;"
|
||||
"checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
|
||||
"block_cache=1M;block_cache_compressed=1k;block_size=1024;"
|
||||
|
@ -151,6 +151,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
|
||||
" pin_l0_filter_and_index_blocks_in_cache: %d\n",
|
||||
table_options_.pin_l0_filter_and_index_blocks_in_cache);
|
||||
ret.append(buffer);
|
||||
snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n",
|
||||
table_options_.pin_top_level_index_and_filter);
|
||||
ret.append(buffer);
|
||||
snprintf(buffer, kBufferSize, " index_type: %d\n",
|
||||
table_options_.index_type);
|
||||
ret.append(buffer);
|
||||
|
@ -158,6 +158,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||
{"block_align",
|
||||
{offsetof(struct BlockBasedTableOptions, block_align),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
|
||||
{"pin_top_level_index_and_filter",
|
||||
{offsetof(struct BlockBasedTableOptions,
|
||||
pin_top_level_index_and_filter),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}};
|
||||
#endif // !ROCKSDB_LITE
|
||||
} // namespace rocksdb
|
||||
|
@ -898,17 +898,39 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
|
||||
rep->ioptions.info_log);
|
||||
}
|
||||
|
||||
const bool pin =
|
||||
// prefetch both index and filters, down to all partitions
|
||||
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
|
||||
BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();
|
||||
// prefetch the first level of index
|
||||
const bool prefetch_index =
|
||||
prefetch_all ||
|
||||
(table_options.pin_top_level_index_and_filter &&
|
||||
index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
|
||||
// prefetch the first level of filter
|
||||
const bool prefetch_filter =
|
||||
prefetch_all || (table_options.pin_top_level_index_and_filter &&
|
||||
rep->filter_type == Rep::FilterType::kPartitionedFilter);
|
||||
// Partitioned filters cannot be enabled without partitioned indexes, so
// prefetching the top-level filter implies prefetching the top-level index.
// The reverse does not hold: a two-level index may be combined with a
// non-partitioned filter, in which case only the index is prefetched.
|
||||
assert(!prefetch_filter || prefetch_index);
|
||||
// pin both index and filters, down to all partitions
|
||||
const bool pin_all =
|
||||
rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
|
||||
// pin the first level of index
|
||||
const bool pin_index =
|
||||
pin_all || (table_options.pin_top_level_index_and_filter &&
|
||||
index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
|
||||
// pin the first level of filter
|
||||
const bool pin_filter =
|
||||
pin_all || (table_options.pin_top_level_index_and_filter &&
|
||||
rep->filter_type == Rep::FilterType::kPartitionedFilter);
|
||||
// pre-fetching of blocks is turned on
|
||||
// Will use block cache for index/filter blocks access
|
||||
// Always prefetch index and filter for level 0
|
||||
if (table_options.cache_index_and_filter_blocks) {
|
||||
if (prefetch_index_and_filter_in_cache || level == 0) {
|
||||
assert(table_options.block_cache != nullptr);
|
||||
if (prefetch_index) {
|
||||
// Hack: Call NewIndexIterator() to implicitly add index to the
|
||||
// block_cache
|
||||
|
||||
CachableEntry<IndexReader> index_entry;
|
||||
bool prefix_extractor_changed = false;
|
||||
// check prefix_extractor match only if hash based index is used
|
||||
@ -924,29 +946,31 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
|
||||
// On success it should give us ownership of the `CachableEntry` by
|
||||
// populating `index_entry`.
|
||||
assert(index_entry.value != nullptr);
|
||||
index_entry.value->CacheDependencies(pin);
|
||||
if (pin) {
|
||||
if (prefetch_all) {
|
||||
index_entry.value->CacheDependencies(pin_all);
|
||||
}
|
||||
if (pin_index) {
|
||||
rep->index_entry = std::move(index_entry);
|
||||
} else {
|
||||
index_entry.Release(table_options.block_cache.get());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
if (s.ok() && prefetch_filter) {
|
||||
// Hack: Call GetFilter() to implicitly add filter to the block_cache
|
||||
auto filter_entry = new_table->GetFilter(prefix_extractor);
|
||||
if (filter_entry.value != nullptr) {
|
||||
filter_entry.value->CacheDependencies(pin, prefix_extractor);
|
||||
if (filter_entry.value != nullptr && prefetch_all) {
|
||||
filter_entry.value->CacheDependencies(pin_all, prefix_extractor);
|
||||
}
|
||||
// if pin_l0_filter_and_index_blocks_in_cache is true, and this is
|
||||
// a level0 file, then save it in rep_->filter_entry; it will be
|
||||
// if pin_filter is true then save it in rep_->filter_entry; it will be
|
||||
// released in the destructor only, hence it will be pinned in the
|
||||
// cache while this reader is alive
|
||||
if (pin) {
|
||||
if (pin_filter) {
|
||||
rep->filter_entry = filter_entry;
|
||||
} else {
|
||||
filter_entry.Release(table_options.block_cache.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If we don't use block cache for index/filter blocks access, we'll
|
||||
// pre-load these blocks, which will be kept in member variables in Rep
|
||||
@ -960,7 +984,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
|
||||
// are hence follow the configuration for pin and prefetch regardless of
|
||||
// the value of cache_index_and_filter_blocks
|
||||
if (prefetch_index_and_filter_in_cache || level == 0) {
|
||||
rep->index_reader->CacheDependencies(pin);
|
||||
rep->index_reader->CacheDependencies(pin_all);
|
||||
}
|
||||
|
||||
// Set filter block
|
||||
@ -973,7 +997,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
|
||||
// Refer to the comment above about partitioned indexes always being
|
||||
// cached
|
||||
if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
|
||||
filter->CacheDependencies(pin, prefix_extractor);
|
||||
filter->CacheDependencies(pin_all, prefix_extractor);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -2419,18 +2443,11 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
|
||||
return in_cache;
|
||||
}
|
||||
|
||||
// REQUIRES: The following fields of rep_ should have already been populated:
|
||||
// 1. file
|
||||
// 2. index_handle,
|
||||
// 3. options
|
||||
// 4. internal_comparator
|
||||
// 5. index_type
|
||||
Status BlockBasedTable::CreateIndexReader(
|
||||
FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
|
||||
InternalIterator* preloaded_meta_index_iter, int level) {
|
||||
BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() {
|
||||
// Some old version of block-based tables don't have index type present in
|
||||
// table properties. If that's the case we can safely use the kBinarySearch.
|
||||
auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
|
||||
BlockBasedTableOptions::IndexType index_type_on_file =
|
||||
BlockBasedTableOptions::kBinarySearch;
|
||||
if (rep_->table_properties) {
|
||||
auto& props = rep_->table_properties->user_collected_properties;
|
||||
auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
|
||||
@ -2441,6 +2458,19 @@ Status BlockBasedTable::CreateIndexReader(
|
||||
rep_->index_type = index_type_on_file;
|
||||
}
|
||||
}
|
||||
return index_type_on_file;
|
||||
}
|
||||
|
||||
// REQUIRES: The following fields of rep_ should have already been populated:
|
||||
// 1. file
|
||||
// 2. index_handle,
|
||||
// 3. options
|
||||
// 4. internal_comparator
|
||||
// 5. index_type
|
||||
Status BlockBasedTable::CreateIndexReader(
|
||||
FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader,
|
||||
InternalIterator* preloaded_meta_index_iter, int level) {
|
||||
auto index_type_on_file = UpdateIndexType();
|
||||
|
||||
auto file = rep_->file.get();
|
||||
const InternalKeyComparator* icomparator = &rep_->internal_comparator;
|
||||
|
@ -322,6 +322,9 @@ class BlockBasedTable : public TableReader {
|
||||
|
||||
void ReadMeta(const Footer& footer);
|
||||
|
||||
// Figure the index type, update it in rep_, and also return it.
|
||||
BlockBasedTableOptions::IndexType UpdateIndexType();
|
||||
|
||||
// Create a index reader based on the index type stored in the table.
|
||||
// Optionally, user can pass a preloaded meta_index_iter for the index that
|
||||
// need to access extra meta blocks for index construction. This parameter
|
||||
@ -478,11 +481,12 @@ struct BlockBasedTable::Rep {
|
||||
// block to extract prefix without knowing if a key is internal or not.
|
||||
unique_ptr<SliceTransform> internal_prefix_transform;
|
||||
|
||||
// only used in level 0 files:
|
||||
// when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
|
||||
// LRU cache, but we always keep the filter & idndex block's handle checked
|
||||
// out here (=we don't call Release()), plus the parsed out objects
|
||||
// the LRU cache will never push flush them out, hence they're pinned
|
||||
// only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is
|
||||
// true or in all levels when pin_top_level_index_and_filter is set in
|
||||
// combination with partitioned index/filters: then we do use the LRU cache,
|
||||
// but we always keep the filter & index block's handle checked out here (=we
|
||||
// don't call Release()), plus the parsed out objects; the LRU cache will never
|
||||
// flush them out, hence they're pinned
|
||||
CachableEntry<FilterBlockReader> filter_entry;
|
||||
CachableEntry<IndexReader> index_entry;
|
||||
// range deletion meta-block is pinned through reader's lifetime when LRU
|
||||
|
@ -2330,6 +2330,7 @@ TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) {
|
||||
}
|
||||
for (bool index_and_filter_in_cache : {true, false}) {
|
||||
for (bool pin_l0 : {true, false}) {
|
||||
for (bool pin_top_level : {true, false}) {
|
||||
if (pin_l0 && !index_and_filter_in_cache) {
|
||||
continue;
|
||||
}
|
||||
@ -2338,11 +2339,13 @@ TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) {
|
||||
unique_ptr<InternalKeyComparator> ikc;
|
||||
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
||||
opt.compression = kNoCompression;
|
||||
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
||||
BlockBasedTableOptions table_options =
|
||||
GetBlockBasedTableOptions();
|
||||
table_options.block_size = 1024;
|
||||
table_options.index_type =
|
||||
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
||||
table_options.pin_l0_filter_and_index_blocks_in_cache = pin_l0;
|
||||
table_options.pin_top_level_index_and_filter = pin_top_level;
|
||||
table_options.partition_filters = partition_filter;
|
||||
table_options.cache_index_and_filter_blocks =
|
||||
index_and_filter_in_cache;
|
||||
@ -2399,6 +2402,7 @@ TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // level
|
||||
}
|
||||
|
||||
|
@ -432,6 +432,10 @@ DEFINE_int32(ops_between_duration_checks, 1000,
|
||||
DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false,
|
||||
"Pin index/filter blocks of L0 files in block cache.");
|
||||
|
||||
DEFINE_bool(
|
||||
pin_top_level_index_and_filter, false,
|
||||
"Pin top-level index of partitioned index/filter blocks in block cache.");
|
||||
|
||||
DEFINE_int32(block_size,
|
||||
static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
|
||||
"Number of bytes in a block.");
|
||||
@ -3186,6 +3190,8 @@ void VerifyDBFromDB(std::string& truth_db_name) {
|
||||
FLAGS_cache_index_and_filter_blocks;
|
||||
block_based_options.pin_l0_filter_and_index_blocks_in_cache =
|
||||
FLAGS_pin_l0_filter_and_index_blocks_in_cache;
|
||||
block_based_options.pin_top_level_index_and_filter =
|
||||
FLAGS_pin_top_level_index_and_filter;
|
||||
if (FLAGS_cache_high_pri_pool_ratio > 1e-6) { // > 0.0 + eps
|
||||
block_based_options.cache_index_and_filter_blocks_with_high_priority =
|
||||
true;
|
||||
|
@ -195,6 +195,7 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) {
|
||||
BlockBasedTableOptions opt;
|
||||
opt.cache_index_and_filter_blocks = rnd->Uniform(2);
|
||||
opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2);
|
||||
opt.pin_top_level_index_and_filter = rnd->Uniform(2);
|
||||
opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch
|
||||
: BlockBasedTableOptions::kHashSearch;
|
||||
opt.hash_index_allow_collision = rnd->Uniform(2);
|
||||
|
Loading…
Reference in New Issue
Block a user