Cache warming blocks during flush (#8561)

Summary:
Insert warm blocks (data, uncompressed dict, index and filter blocks) into the block cache during flush, when enabled via the option BlockBasedTableOptions.prepopulate_block_cache.
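
For illustration, a minimal sketch of how a user enables this feature (assumes a RocksDB build that includes this change; the DB path, cache size, and filter settings below are examples, not taken from this PR):

#include <rocksdb/cache.h>
#include <rocksdb/db.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/table.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = rocksdb::NewLRUCache(64 << 20);  // 64 MiB, example size
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
  // Warm the block cache with the data, dict, index and filter blocks
  // produced by each flush.
  table_options.prepopulate_block_cache =
      rocksdb::BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prepopulate_demo", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    if (s.ok()) {
      // The SST blocks written by this flush are also inserted into block_cache.
      s = db->Flush(rocksdb::FlushOptions());
    }
    delete db;
  }
  return s.ok() ? 0 : 1;
}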

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8561

Test Plan: Added unit test

Reviewed By: anand1976

Differential Revision: D29773411

Pulled By: akankshamahajan15

fbshipit-source-id: 6631123c10134340ef0bd7e90baafaa6deba0e66
Authored by Akanksha Mahajan on 2021-08-03 12:42:22 -07:00; committed by Facebook GitHub Bot
parent b278152261
commit 8b2f60b668
5 changed files with 131 additions and 57 deletions

HISTORY.md

@@ -5,6 +5,7 @@
### New Features
* Made the EventListener extend the Customizable class.
* EventListeners that have a non-empty Name() and that are registered with the ObjectRegistry can now be serialized to/from the OPTIONS file.
* Insert warm blocks (data blocks, uncompressed dict blocks, index and filter blocks) into the block cache during flush under option BlockBasedTableOptions.prepopulate_block_cache. Previously this was enabled only for data blocks.
### Performance Improvements
* Try to avoid updating DBOptions if `SetDBOptions()` does not change any option value.

db/db_block_cache_test.cc

@@ -497,12 +497,48 @@ TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) {
ASSERT_OK(Put(ToString(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(value, Get(ToString(i)));
ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
}
}
// This test caches all types of blocks during flush.
TEST_F(DBBlockCacheTest, WarmCacheWithBlocksDuringFlush) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
BlockBasedTableOptions table_options;
table_options.block_cache = NewLRUCache(1 << 25, 0, false);
table_options.cache_index_and_filter_blocks = true;
table_options.prepopulate_block_cache =
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
DestroyAndReopen(options);
std::string value(kValueSize, 'a');
for (size_t i = 1; i < 2; i++) {
ASSERT_OK(Put(ToString(i), value));
ASSERT_OK(Flush());
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
ASSERT_EQ(value, Get(ToString(i)));
ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS));
ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT));
ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
ASSERT_EQ(i * 2,
options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
}
}
#endif
namespace {

include/rocksdb/table.h

@@ -464,22 +464,18 @@ struct BlockBasedTableOptions {
// Default: 256 KB (256 * 1024).
size_t max_auto_readahead_size = 256 * 1024;
// If enabled, prepopulate warm/hot data blocks which are already in memory
// into block cache at the time of flush. On a flush, the data block that is
// in memory (in memtables) get flushed to the device. If using Direct IO,
// additional IO is incurred to read this data back into memory again, which
// is avoided by enabling this option. This further helps if the workload
// exhibits high temporal locality, where most of the reads go to recently
// written data. This also helps in case of Distributed FileSystem.
//
// Right now, this is enabled only for flush for data blocks. We plan to
// expand this option to cover compactions in the future and for other types
// of blocks.
// If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and
// filter blocks) which are already in memory into block cache at the time of
// flush. On a flush, the blocks that are in memory (in memtables) get flushed
// to the device. If using Direct IO, additional IO is incurred to read this
// data back into memory again, which is avoided by enabling this option. This
// further helps if the workload exhibits high temporal locality, where most
// of the reads go to recently written data. This also helps in the case of a
// distributed file system.
enum class PrepopulateBlockCache : char {
// Disable prepopulate block cache.
kDisable,
// Prepopulate data blocks during flush only. Plan to extend it to all block
// types.
// Prepopulate blocks during flush only.
kFlushOnly,
};
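
As a follow-on sketch, the effect this comment describes can be observed through DB statistics, using the same tickers the new unit test asserts on. This fragment continues the sketch shown under the summary above (additional includes: <rocksdb/statistics.h> and <cassert>):

// Set before DB::Open:
options.statistics = rocksdb::CreateDBStatistics();
// ... Open, Put("key", "value"), Flush() as in the earlier sketch, then:
std::string value;
rocksdb::Status s2 = db->Get(rocksdb::ReadOptions(), "key", &value);
assert(s2.ok() && value == "value");
// With kFlushOnly the freshly flushed blocks are already cached, so the
// read above should be a cache hit with no data-block miss:
assert(options.statistics->getTickerCount(rocksdb::BLOCK_CACHE_DATA_MISS) == 0);
assert(options.statistics->getTickerCount(rocksdb::BLOCK_CACHE_DATA_HIT) >= 1);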

table/block_based/block_based_table_builder.cc

@@ -36,6 +36,7 @@
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/block_based/block_builder.h"
#include "table/block_based/block_like_traits.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/filter_policy_internal.h"
#include "table/block_based/full_filter_block.h"
@@ -994,33 +995,34 @@ void BlockBasedTableBuilder::Flush() {
r->get_offset());
r->pc_rep->EmitBlock(block_rep);
} else {
WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */);
WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData);
}
}
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
BlockHandle* handle,
bool is_data_block) {
BlockType block_type) {
block->Finish();
std::string raw_block_contents;
block->SwapAndReset(raw_block_contents);
if (rep_->state == Rep::State::kBuffered) {
assert(is_data_block);
assert(block_type == BlockType::kData);
rep_->data_block_buffers.emplace_back(std::move(raw_block_contents));
rep_->data_begin_offset += rep_->data_block_buffers.back().size();
return;
}
WriteBlock(raw_block_contents, handle, is_data_block);
WriteBlock(raw_block_contents, handle, block_type);
}
void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
BlockHandle* handle,
bool is_data_block) {
BlockType block_type) {
Rep* r = rep_;
assert(r->state == Rep::State::kUnbuffered);
Slice block_contents;
CompressionType type;
Status compress_status;
bool is_data_block = block_type == BlockType::kData;
CompressAndVerifyBlock(raw_block_contents, is_data_block,
*(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
&(r->compressed_output), &(block_contents), &type,
@@ -1030,8 +1032,7 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
return;
}
WriteRawBlock(block_contents, type, handle, is_data_block,
&raw_block_contents);
WriteRawBlock(block_contents, type, handle, block_type, &raw_block_contents);
r->compressed_output.clear();
if (is_data_block) {
if (r->filter_builder != nullptr) {
@@ -1189,9 +1190,10 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
CompressionType type,
BlockHandle* handle,
bool is_data_block,
BlockType block_type,
const Slice* raw_block_contents) {
Rep* r = rep_;
bool is_data_block = block_type == BlockType::kData;
Status s = Status::OK();
IOStatus io_s = IOStatus::OK();
StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
@@ -1247,13 +1249,12 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
io_s = r->file->Append(Slice(trailer, kBlockTrailerSize));
if (io_s.ok()) {
assert(s.ok());
if (is_data_block &&
r->table_options.prepopulate_block_cache ==
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly) {
if (r->table_options.prepopulate_block_cache ==
BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly) {
if (type == kNoCompression) {
s = InsertBlockInCache(block_contents, handle);
s = InsertBlockInCacheHelper(block_contents, handle, block_type);
} else if (raw_block_contents != nullptr) {
s = InsertBlockInCache(*raw_block_contents, handle);
s = InsertBlockInCacheHelper(*raw_block_contents, handle, block_type);
}
if (!s.ok()) {
r->SetStatus(s);
@@ -1328,10 +1329,8 @@ void BlockBasedTableBuilder::BGWorkWriteRawBlock() {
}
r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size());
WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type,
&r->pending_handle, true /* is_data_block*/,
&block_rep->contents);
&r->pending_handle, BlockType::kData, &block_rep->contents);
if (!ok()) {
break;
}
@@ -1460,8 +1459,30 @@ Status BlockBasedTableBuilder::InsertBlockInCompressedCache(
return s;
}
Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
const Slice& block_contents, const BlockHandle* handle,
BlockType block_type) {
Status s;
if (block_type == BlockType::kData || block_type == BlockType::kIndex) {
s = InsertBlockInCache<Block>(block_contents, handle, block_type);
} else if (block_type == BlockType::kFilter) {
if (rep_->filter_builder->IsBlockBased()) {
s = InsertBlockInCache<Block>(block_contents, handle, block_type);
} else {
s = InsertBlockInCache<ParsedFullFilterBlock>(block_contents, handle,
block_type);
}
} else if (block_type == BlockType::kCompressionDictionary) {
s = InsertBlockInCache<UncompressionDict>(block_contents, handle,
block_type);
}
return s;
}
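
The helper exists because each on-disk block type parses into a different in-memory type (Block for data and index blocks, ParsedFullFilterBlock for full filters, UncompressionDict for dictionary blocks), so the templated InsertBlockInCache below must be instantiated once per type. A self-contained illustration of the dispatch pattern, using invented placeholder types rather than the real RocksDB classes:

#include <cstdio>
#include <string>

enum class BlockType { kData, kIndex, kFilter, kCompressionDictionary };

struct Block      { static constexpr const char* kName = "Block"; };
struct FullFilter { static constexpr const char* kName = "FullFilter"; };
struct Dict       { static constexpr const char* kName = "Dict"; };

template <typename TBlocklike>
void InsertAs(const std::string& contents) {
  // Real code would build a TBlocklike from `contents` (via BlocklikeTraits)
  // and insert it into the cache, charged at its memory usage.
  std::printf("caching %zu bytes as %s\n", contents.size(), TBlocklike::kName);
}

void InsertHelper(const std::string& contents, BlockType type) {
  switch (type) {
    case BlockType::kData:
    case BlockType::kIndex:
      InsertAs<Block>(contents);  // both parse as generic KV blocks
      break;
    case BlockType::kFilter:
      InsertAs<FullFilter>(contents);  // full filters have their own parser
      break;
    case BlockType::kCompressionDictionary:
      InsertAs<Dict>(contents);
      break;
  }
}

int main() {
  InsertHelper(std::string(4096, 'x'), BlockType::kFilter);
  return 0;
}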
template <typename TBlocklike>
Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
const BlockHandle* handle) {
const BlockHandle* handle,
BlockType block_type) {
// Uncompressed regular block cache
Cache* block_cache = rep_->table_options.block_cache.get();
Status s;
@@ -1479,15 +1500,25 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
const size_t read_amp_bytes_per_bit =
rep_->table_options.read_amp_bytes_per_bit;
Block* block = new Block(std::move(results), read_amp_bytes_per_bit);
size_t charge = block->ApproximateMemoryUsage();
s = block_cache->Insert(key, block, charge, &DeleteEntryCached<Block>);
if (s.ok()) {
BlockBasedTable::UpdateCacheInsertionMetrics(
BlockType::kData, nullptr /*get_context*/, charge,
s.IsOkOverwritten(), rep_->ioptions.stats);
} else {
RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES);
TBlocklike* block_holder = BlocklikeTraits<TBlocklike>::Create(
std::move(results), read_amp_bytes_per_bit,
rep_->ioptions.statistics.get(),
false /*rep_->blocks_definitely_zstd_compressed*/,
rep_->table_options.filter_policy.get());
if (block_holder->own_bytes()) {
size_t charge = block_holder->ApproximateMemoryUsage();
s = block_cache->Insert(key, block_holder, charge,
&DeleteEntryCached<TBlocklike>);
if (s.ok()) {
BlockBasedTable::UpdateCacheInsertionMetrics(
block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
rep_->ioptions.stats);
} else {
RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES);
}
}
}
return s;
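
The insertion itself goes through the generic Cache interface: a raw pointer, a charge equal to the object's approximate memory usage, and a deleter the cache invokes when the entry is evicted or erased. A standalone sketch of that interface (the key and payload here are invented for the example):

#include <rocksdb/cache.h>
#include <string>

void CacheDemo() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(8 << 20);
  auto* payload = new std::string(4096, 'x');
  auto deleter = [](const rocksdb::Slice& /*key*/, void* value) {
    delete static_cast<std::string*>(value);  // runs on eviction/erase
  };
  // Charge the cache for the payload's memory so capacity accounting is honest.
  rocksdb::Status s =
      cache->Insert("sst42:handle7", payload, payload->size(), deleter);
  if (!s.ok()) {
    // On a failed insert the cache disposes of the value via the deleter;
    // either way, treat `payload` as no longer owned here.
  }
}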
@@ -1507,7 +1538,8 @@ void BlockBasedTableBuilder::WriteFilterBlock(
rep_->filter_builder->Finish(filter_block_handle, &s);
assert(s.ok() || s.IsIncomplete());
rep_->props.filter_size += filter_content.size();
WriteRawBlock(filter_content, kNoCompression, &filter_block_handle);
WriteRawBlock(filter_content, kNoCompression, &filter_block_handle,
BlockType::kFilter);
}
}
if (ok() && !empty_filter_block) {
@@ -1541,7 +1573,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
if (ok()) {
for (const auto& item : index_blocks.meta_blocks) {
BlockHandle block_handle;
WriteBlock(item.second, &block_handle, false /* is_data_block */);
WriteBlock(item.second, &block_handle, BlockType::kIndex);
if (!ok()) {
break;
}
@@ -1550,10 +1582,11 @@
}
if (ok()) {
if (rep_->table_options.enable_index_compression) {
WriteBlock(index_blocks.index_block_contents, index_block_handle, false);
WriteBlock(index_blocks.index_block_contents, index_block_handle,
BlockType::kIndex);
} else {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
index_block_handle);
index_block_handle, BlockType::kIndex);
}
}
// If there are more index partitions, finish them and write them out
@@ -1567,10 +1600,10 @@
}
if (rep_->table_options.enable_index_compression) {
WriteBlock(index_blocks.index_block_contents, index_block_handle,
false);
BlockType::kIndex);
} else {
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
index_block_handle);
index_block_handle, BlockType::kIndex);
}
// The last index_block_handle will be for the partition index block
}
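
For context, the loop above emits multiple index partitions only when the table is built with a two-level (partitioned) index. A configuration sketch of the options that trigger that path (values are illustrative):

// Fragment; assumes #include <rocksdb/table.h>.
rocksdb::BlockBasedTableOptions table_options;
table_options.index_type =
    rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
table_options.partition_filters = true;    // partitioned filters require full filters
table_options.metadata_block_size = 4096;  // target size of each index/filter partition
table_options.cache_index_and_filter_blocks = true;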
@@ -1665,7 +1698,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
&property_block_builder);
WriteRawBlock(property_block_builder.Finish(), kNoCompression,
&properties_block_handle);
&properties_block_handle, BlockType::kProperties);
}
if (ok()) {
#ifndef NDEBUG
@@ -1691,7 +1724,8 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
BlockHandle compression_dict_block_handle;
if (ok()) {
WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression,
&compression_dict_block_handle);
&compression_dict_block_handle,
BlockType::kCompressionDictionary);
#ifndef NDEBUG
Slice compression_dict = rep_->compression_dict->GetRawDict();
TEST_SYNC_POINT_CALLBACK(
@@ -1711,7 +1745,7 @@ void BlockBasedTableBuilder::WriteRangeDelBlock(
if (ok() && !rep_->range_del_block.empty()) {
BlockHandle range_del_block_handle;
WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
&range_del_block_handle);
&range_del_block_handle, BlockType::kRangeDeletion);
meta_index_builder->Add(kRangeDelBlock, range_del_block_handle);
}
}
@@ -1872,8 +1906,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
}
r->index_builder->OnKeyAdded(key);
}
WriteBlock(Slice(data_block), &r->pending_handle,
true /* is_data_block */);
WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
if (ok() && i + 1 < r->data_block_buffers.size()) {
assert(next_block_iter != nullptr);
Slice first_key_in_next_block = next_block_iter->key();
@@ -1935,7 +1968,7 @@ Status BlockBasedTableBuilder::Finish() {
if (ok()) {
// flush the meta index block
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
&metaindex_block_handle);
&metaindex_block_handle, BlockType::kMetaIndex);
}
if (ok()) {
WriteFooter(metaindex_block_handle, index_block_handle);

table/block_based/block_based_table_builder.h

@@ -9,6 +9,7 @@
#pragma once
#include <stdint.h>
#include <limits>
#include <string>
#include <utility>
@@ -108,20 +109,27 @@ class BlockBasedTableBuilder : public TableBuilder {
// Call block's Finish() method and then
// - in buffered mode, buffer the uncompressed block contents.
// - in unbuffered mode, write the compressed block contents to file.
void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block);
void WriteBlock(BlockBuilder* block, BlockHandle* handle,
BlockType block_type);
// Compress and write block content to the file.
void WriteBlock(const Slice& block_contents, BlockHandle* handle,
bool is_data_block);
BlockType block_type);
// Directly write data to the file.
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle,
bool is_data_block = false,
const Slice* raw_data = nullptr);
BlockType block_type, const Slice* raw_data = nullptr);
void SetupCacheKeyPrefix(const TableBuilderOptions& tbo);
template <typename TBlocklike>
Status InsertBlockInCache(const Slice& block_contents,
const BlockHandle* handle);
const BlockHandle* handle, BlockType block_type);
Status InsertBlockInCacheHelper(const Slice& block_contents,
const BlockHandle* handle,
BlockType block_type);
Status InsertBlockInCompressedCache(const Slice& block_contents,
const CompressionType type,
const BlockHandle* handle);