From 3b66d4b617ae865c9a56f2c39e0d37840cca3a6f Mon Sep 17 00:00:00 2001 From: Maysam Yabandeh Date: Fri, 10 Aug 2018 15:14:44 -0700 Subject: [PATCH] Fix wrong partitioned index size recorded in properties block (#4259) Summary: After refactoring in https://github.com/facebook/rocksdb/pull/4158 the properties block is written after the index block. This breaks the existing logic in estimating the index size in partitioned indexes. The patch fixes that by using the accurate index block size, which is available since by the time we write the properties block, the index block is already written. The patch also fixes an issue in estimating the partition size with format_version=3 which was resulting into partitions smaller than the configured metadata_block_size. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4259 Differential Revision: D9274454 Pulled By: maysamyabandeh fbshipit-source-id: c82d045505cca3e7ed1a44ee1eaa26e4f25a4272 --- HISTORY.md | 4 +++ table/block_based_table_builder.cc | 5 +-- table/index_builder.cc | 53 ++++++++++-------------------- table/index_builder.h | 32 +++++++++++------- 4 files changed, 45 insertions(+), 49 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7e516d85a..19a2dfc16 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## 5.15.3 (8/10/2018) +### Bug Fixes +* Fix a bug in misreporting the estimated partition index size in properties block. + ## 5.15.2 (8/9/2018) ### Bug Fixes * Return correct usable_size for BlockContents. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index df094ffa2..e8db6a4bd 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -755,7 +755,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr ? rep_->table_options.filter_policy->Name() : ""; - rep_->props.index_size = rep_->index_builder->EstimatedSize() + kBlockTrailerSize; + rep_->props.index_size = + rep_->index_builder->IndexSize() + kBlockTrailerSize; rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr ? rep_->ioptions.user_comparator->Name() : "nullptr"; @@ -784,7 +785,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( assert(rep_->p_index_builder_ != nullptr); rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); rep_->props.top_level_index_size = - rep_->p_index_builder_->EstimateTopLevelIndexSize(rep_->offset); + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); } rep_->props.index_key_is_user_key = !rep_->index_builder->seperator_is_key_plus_seq(); diff --git a/table/index_builder.cc b/table/index_builder.cc index ebabbeb8d..0d8d3b8e6 100644 --- a/table/index_builder.cc +++ b/table/index_builder.cc @@ -70,6 +70,12 @@ PartitionedIndexBuilder::PartitionedIndexBuilder( table_opt.format_version), sub_index_builder_(nullptr), table_opt_(table_opt), + // We start by false. After each partition we revise the value based on + // what the sub_index_builder has decided. If the feature is disabled + // entirely, this will be set to true after switching the first + // sub_index_builder. Otherwise, it could be set to true even one of the + // sub_index_builders could not safely exclude seq from the keys, then it + // wil be enforced on all sub_index_builders on ::Finish. seperator_is_key_plus_seq_(false) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { @@ -83,7 +89,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { table_opt_.format_version); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, - sub_index_builder_->index_block_builder_)); + // Note: this is sub-optimal since sub_index_builder_ could later reset + // seperator_is_key_plus_seq_ but the probability of that is low. + sub_index_builder_->seperator_is_key_plus_seq_ + ? sub_index_builder_->index_block_builder_ + : sub_index_builder_->index_block_builder_without_seq_)); partition_cut_requested_ = false; } @@ -143,6 +153,9 @@ void PartitionedIndexBuilder::AddIndexEntry( Status PartitionedIndexBuilder::Finish( IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } // It must be set to null after last key is added assert(sub_index_builder_ == nullptr); if (finishing_indexes == true) { @@ -164,6 +177,8 @@ Status PartitionedIndexBuilder::Finish( index_blocks->index_block_contents = index_block_builder_without_seq_.Finish(); } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; return Status::OK(); } else { // Finish the next partition index in line and Incomplete() to indicate we @@ -172,45 +187,13 @@ Status PartitionedIndexBuilder::Finish( // Apply the policy to all sub-indexes entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); finishing_indexes = true; return s.ok() ? Status::Incomplete() : s; } } -// Estimate size excluding the top-level index -// It is assumed that this method is called before writing index partition -// starts -size_t PartitionedIndexBuilder::EstimatedSize() const { - size_t total = 0; - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - total += it->value->EstimatedSize(); - } - total += - sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize(); - return total; -} - -// Since when this method is called we do not know the index block offsets yet, -// the top-level index does not exist. Hence we estimate the block offsets and -// create a temporary top-level index. -size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize( - uint64_t offset) const { - BlockBuilder tmp_builder( - table_opt_.index_block_restart_interval); // tmp top-level index builder - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - std::string tmp_handle_encoding; - uint64_t size = it->value->EstimatedSize(); - BlockHandle tmp_block_handle(offset, size); - tmp_block_handle.EncodeTo(&tmp_handle_encoding); - tmp_builder.Add( - seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key), - tmp_handle_encoding); - offset += size; - } - return tmp_builder.CurrentSizeEstimate(); -} - size_t PartitionedIndexBuilder::NumPartitions() const { - return entries_.size(); + return partition_cnt_; } } // namespace rocksdb diff --git a/table/index_builder.h b/table/index_builder.h index e8c060716..d77194a8b 100644 --- a/table/index_builder.h +++ b/table/index_builder.h @@ -96,13 +96,15 @@ class IndexBuilder { virtual Status Finish(IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) = 0; - // Get the estimated size for index block. - virtual size_t EstimatedSize() const = 0; + // Get the size for index block. Must be called after ::Finish. + virtual size_t IndexSize() const = 0; virtual bool seperator_is_key_plus_seq() { return true; } protected: const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; }; // This index builder builds space-efficient index block. @@ -162,15 +164,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_blocks->index_block_contents = index_block_builder_without_seq_.Finish(); } + index_size_ = index_blocks->index_block_contents.size(); return Status::OK(); } - virtual size_t EstimatedSize() const override { - if (seperator_is_key_plus_seq_) { - return index_block_builder_.CurrentSizeEstimate(); - } else { - return index_block_builder_without_seq_.CurrentSizeEstimate(); - } + virtual size_t IndexSize() const override { + return index_size_; } virtual bool seperator_is_key_plus_seq() override { @@ -272,8 +271,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const override { - return primary_index_builder_.EstimatedSize() + prefix_block_.size() + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + prefix_meta_block_.size(); } @@ -338,8 +337,12 @@ class PartitionedIndexBuilder : public IndexBuilder { IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; - virtual size_t EstimatedSize() const override; - size_t EstimateTopLevelIndexSize(uint64_t) const; + virtual size_t IndexSize() const override { + return index_size_; + } + size_t TopLevelIndexSize(uint64_t) const { + return top_level_index_size_; + } size_t NumPartitions() const; inline bool ShouldCutFilterBlock() { @@ -362,6 +365,11 @@ class PartitionedIndexBuilder : public IndexBuilder { } private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + void MakeNewSubIndexBuilder(); struct Entry {