Clarify caching behavior for index and filter partitions (#9068)

Summary:
Somewhat confusingly, index and filter partition blocks are
never owned by table readers, even with
cache_index_and_filter_blocks=false. They still go into block cache
(possibly pinned by the table reader) if there is a block cache. If there
is no block cache, they are only loaded transiently on demand.

This PR primarily clarifies the options APIs and some internal code
comments.

Also, this closes a hypothetical data corruption vulnerability where
some but not all index partitions are pinned, in which case a lookup in
an unpinned partition would silently get an empty iterator. I haven't
been able to reproduce a case where it can happen (the failure seems to
propagate to abort table open), but it's worth patching nonetheless.

Fixes https://github.com/facebook/rocksdb/issues/8979

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9068

Test Plan:
existing tests :-/ I could cover the new code using sync
points, but then I'd have to very carefully relax my `assert(false)`.

Reviewed By: ajkr

Differential Revision: D31898284

Pulled By: pdillinger

fbshipit-source-id: f2511a7d3a36bc04b627935d8e6cfea6422f98be
This commit is contained in:
Peter Dillinger 2021-10-27 17:21:48 -07:00 committed by Facebook GitHub Bot
parent 82846f41d3
commit 5bf9a7d5ee
6 changed files with 64 additions and 35 deletions

View File

@ -117,9 +117,10 @@ struct BlockBasedTableOptions {
// caching as they should now apply to range tombstone and compression // caching as they should now apply to range tombstone and compression
// dictionary meta-blocks, in addition to index and filter meta-blocks. // dictionary meta-blocks, in addition to index and filter meta-blocks.
// //
// Indicating if we'd put index/filter blocks to the block cache. // Whether to put index/filter blocks in the block cache. When false,
// If not specified, each "table reader" object will pre-load index/filter // each "table reader" object will pre-load index/filter blocks during
// block during table initialization. // table initialization. Index and filter partition blocks always use
// block cache regardless of this option.
bool cache_index_and_filter_blocks = false; bool cache_index_and_filter_blocks = false;
// If cache_index_and_filter_blocks is enabled, cache index and filter // If cache_index_and_filter_blocks is enabled, cache index and filter
@ -190,6 +191,8 @@ struct BlockBasedTableOptions {
kHashSearch = 0x01, kHashSearch = 0x01,
// A two-level index implementation. Both levels are binary search indexes. // A two-level index implementation. Both levels are binary search indexes.
// Second level index blocks ("partitions") use block cache even when
// cache_index_and_filter_blocks=false.
kTwoLevelIndexSearch = 0x02, kTwoLevelIndexSearch = 0x02,
// Like kBinarySearch, but index also contains first key of each block. // Like kBinarySearch, but index also contains first key of each block.
@ -285,7 +288,8 @@ struct BlockBasedTableOptions {
// well. // well.
// TODO(myabandeh): remove the note above once the limitation is lifted // TODO(myabandeh): remove the note above once the limitation is lifted
// Use partitioned full filters for each SST file. This option is // Use partitioned full filters for each SST file. This option is
// incompatible with block-based filters. // incompatible with block-based filters. Filter partition blocks use
// block cache even when cache_index_and_filter_blocks=false.
bool partition_filters = false; bool partition_filters = false;
// Option to generate Bloom/Ribbon filters that minimize memory // Option to generate Bloom/Ribbon filters that minimize memory

View File

@ -2063,9 +2063,12 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
const BlockHandle& handle) { const BlockHandle& handle) {
// Return a block iterator on the index partition // Return a block iterator on the index partition
auto block = block_map_->find(handle.offset()); auto block = block_map_->find(handle.offset());
// This is a possible scenario since block cache might not have had space // block_map_ must be exhaustive
// for the partition if (block == block_map_->end()) {
if (block != block_map_->end()) { assert(false);
// Signal problem to caller
return nullptr;
}
const Rep* rep = table_->get_rep(); const Rep* rep = table_->get_rep();
assert(rep); assert(rep);
@ -2078,10 +2081,6 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
rep->index_has_first_key, rep->index_key_includes_seq, rep->index_has_first_key, rep->index_key_includes_seq,
rep->index_value_is_full); rep->index_value_is_full);
} }
// Create an empty iterator
// TODO(ajkr): this is not the right way to handle an unpinned partition.
return new IndexBlockIter();
}
// This will be broken if the user specifies an unusual implementation // This will be broken if the user specifies an unusual implementation
// of Options.comparator, or if the user specifies an unusual // of Options.comparator, or if the user specifies an unusual

View File

@ -142,6 +142,8 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
bool index_value_is_full() const; bool index_value_is_full() const;
protected: protected:
// For partition blocks pinned in cache. Can be a subset of blocks
// in case some fail insertion on attempt to pin.
std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>> std::unordered_map<uint64_t, CachableEntry<ParsedFullFilterBlock>>
filter_map_; filter_map_;
}; };

View File

@ -114,11 +114,13 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
Statistics* kNullStats = nullptr; Statistics* kNullStats = nullptr;
CachableEntry<Block> index_block; CachableEntry<Block> index_block;
{
Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */,
&lookup_context, &index_block); &lookup_context, &index_block);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
}
// We don't return pinned data from index blocks, so no need // We don't return pinned data from index blocks, so no need
// to set `block_contents_pinned`. // to set `block_contents_pinned`.
@ -149,7 +151,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer,
false /*Implicit auto readahead*/); false /*Implicit auto readahead*/);
IOOptions opts; IOOptions opts;
s = rep->file->PrepareIOOptions(ro, opts); {
Status s = rep->file->PrepareIOOptions(ro, opts);
if (s.ok()) { if (s.ok()) {
s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
static_cast<size_t>(prefetch_len)); static_cast<size_t>(prefetch_len));
@ -157,15 +160,21 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
}
// For saving "all or nothing" to partition_map_
std::unordered_map<uint64_t, CachableEntry<Block>> map_in_progress;
// After prefetch, read the partitions one by one // After prefetch, read the partitions one by one
biter.SeekToFirst(); biter.SeekToFirst();
size_t partition_count = 0;
for (; biter.Valid(); biter.Next()) { for (; biter.Valid(); biter.Next()) {
handle = biter.value().handle; handle = biter.value().handle;
CachableEntry<Block> block; CachableEntry<Block> block;
++partition_count;
// TODO: Support counter batch update for partitioned index and // TODO: Support counter batch update for partitioned index and
// filter blocks // filter blocks
s = table()->MaybeReadBlockAndLoadToCache( Status s = table()->MaybeReadBlockAndLoadToCache(
prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
/*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr, /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr,
&lookup_context, /*contents=*/nullptr); &lookup_context, /*contents=*/nullptr);
@ -174,14 +183,22 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
return s; return s;
} }
if (block.GetValue() != nullptr) { if (block.GetValue() != nullptr) {
// Might need to "pin" some mmap-read blocks (GetOwnValue) if some
// partitions are successfully compressed (cached) and some are not
// compressed (mmap eligible)
if (block.IsCached() || block.GetOwnValue()) { if (block.IsCached() || block.GetOwnValue()) {
if (pin) { if (pin) {
partition_map_[handle.offset()] = std::move(block); map_in_progress[handle.offset()] = std::move(block);
} }
} }
} }
} }
return biter.status(); Status s = biter.status();
// Save (pin) them only if everything checks out
if (map_in_progress.size() == partition_count && s.ok()) {
std::swap(partition_map_, map_in_progress);
}
return s;
} }
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -46,6 +46,9 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon {
CachableEntry<Block>&& index_block) CachableEntry<Block>&& index_block)
: IndexReaderCommon(t, std::move(index_block)) {} : IndexReaderCommon(t, std::move(index_block)) {}
// For partition blocks pinned in cache. This is expected to be "all or
// none" so that !partition_map_.empty() can use an iterator expecting
// all partitions to be saved here.
std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_; std::unordered_map<uint64_t, CachableEntry<Block>> partition_map_;
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -201,6 +201,10 @@ void TwoLevelIndexIterator::InitDataBlock() {
state_->NewSecondaryIterator(handle); state_->NewSecondaryIterator(handle);
data_block_handle_ = handle; data_block_handle_ = handle;
SetSecondLevelIterator(iter); SetSecondLevelIterator(iter);
if (iter == nullptr) {
status_ = Status::Corruption("Missing block for partition " +
handle.ToString());
}
} }
} }
} }