Fix wrong partitioned index size recorded in properties block (#4259)

Summary:
After refactoring in https://github.com/facebook/rocksdb/pull/4158 the properties block is written after the index block. This breaks the existing logic in estimating the index size in partitioned indexes. The patch fixes that by using the accurate index block size, which is available since by the time we write the properties block, the index block is already written.
The patch also fixes an issue in estimating the partition size with format_version=3 which was resulting into partitions smaller than the configured metadata_block_size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4259

Differential Revision: D9274454

Pulled By: maysamyabandeh

fbshipit-source-id: c82d045505cca3e7ed1a44ee1eaa26e4f25a4272
This commit is contained in:
Maysam Yabandeh 2018-08-10 15:14:44 -07:00
parent 421be4af42
commit 3b66d4b617
4 changed files with 45 additions and 49 deletions

View File

@ -1,4 +1,8 @@
# Rocksdb Change Log # Rocksdb Change Log
## 5.15.3 (8/10/2018)
### Bug Fixes
* Fix a bug in misreporting the estimated partition index size in properties block.
## 5.15.2 (8/9/2018) ## 5.15.2 (8/9/2018)
### Bug Fixes ### Bug Fixes
* Return correct usable_size for BlockContents. * Return correct usable_size for BlockContents.

View File

@ -755,7 +755,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr rep_->props.filter_policy_name = rep_->table_options.filter_policy != nullptr
? rep_->table_options.filter_policy->Name() ? rep_->table_options.filter_policy->Name()
: ""; : "";
rep_->props.index_size = rep_->index_builder->EstimatedSize() + kBlockTrailerSize; rep_->props.index_size =
rep_->index_builder->IndexSize() + kBlockTrailerSize;
rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr
? rep_->ioptions.user_comparator->Name() ? rep_->ioptions.user_comparator->Name()
: "nullptr"; : "nullptr";
@ -784,7 +785,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
assert(rep_->p_index_builder_ != nullptr); assert(rep_->p_index_builder_ != nullptr);
rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
rep_->props.top_level_index_size = rep_->props.top_level_index_size =
rep_->p_index_builder_->EstimateTopLevelIndexSize(rep_->offset); rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
} }
rep_->props.index_key_is_user_key = rep_->props.index_key_is_user_key =
!rep_->index_builder->seperator_is_key_plus_seq(); !rep_->index_builder->seperator_is_key_plus_seq();

View File

@ -70,6 +70,12 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
table_opt.format_version), table_opt.format_version),
sub_index_builder_(nullptr), sub_index_builder_(nullptr),
table_opt_(table_opt), table_opt_(table_opt),
// We start by false. After each partition we revise the value based on
// what the sub_index_builder has decided. If the feature is disabled
// entirely, this will be set to true after switching the first
// sub_index_builder. Otherwise, it could be set to true even one of the
// sub_index_builders could not safely exclude seq from the keys, then it
// wil be enforced on all sub_index_builders on ::Finish.
seperator_is_key_plus_seq_(false) {} seperator_is_key_plus_seq_(false) {}
PartitionedIndexBuilder::~PartitionedIndexBuilder() { PartitionedIndexBuilder::~PartitionedIndexBuilder() {
@ -83,7 +89,11 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
table_opt_.format_version); table_opt_.format_version);
flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
table_opt_.metadata_block_size, table_opt_.block_size_deviation, table_opt_.metadata_block_size, table_opt_.block_size_deviation,
sub_index_builder_->index_block_builder_)); // Note: this is sub-optimal since sub_index_builder_ could later reset
// seperator_is_key_plus_seq_ but the probability of that is low.
sub_index_builder_->seperator_is_key_plus_seq_
? sub_index_builder_->index_block_builder_
: sub_index_builder_->index_block_builder_without_seq_));
partition_cut_requested_ = false; partition_cut_requested_ = false;
} }
@ -143,6 +153,9 @@ void PartitionedIndexBuilder::AddIndexEntry(
Status PartitionedIndexBuilder::Finish( Status PartitionedIndexBuilder::Finish(
IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
if (partition_cnt_ == 0) {
partition_cnt_ = entries_.size();
}
// It must be set to null after last key is added // It must be set to null after last key is added
assert(sub_index_builder_ == nullptr); assert(sub_index_builder_ == nullptr);
if (finishing_indexes == true) { if (finishing_indexes == true) {
@ -164,6 +177,8 @@ Status PartitionedIndexBuilder::Finish(
index_blocks->index_block_contents = index_blocks->index_block_contents =
index_block_builder_without_seq_.Finish(); index_block_builder_without_seq_.Finish();
} }
top_level_index_size_ = index_blocks->index_block_contents.size();
index_size_ += top_level_index_size_;
return Status::OK(); return Status::OK();
} else { } else {
// Finish the next partition index in line and Incomplete() to indicate we // Finish the next partition index in line and Incomplete() to indicate we
@ -172,45 +187,13 @@ Status PartitionedIndexBuilder::Finish(
// Apply the policy to all sub-indexes // Apply the policy to all sub-indexes
entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
auto s = entry.value->Finish(index_blocks); auto s = entry.value->Finish(index_blocks);
index_size_ += index_blocks->index_block_contents.size();
finishing_indexes = true; finishing_indexes = true;
return s.ok() ? Status::Incomplete() : s; return s.ok() ? Status::Incomplete() : s;
} }
} }
// Estimate size excluding the top-level index
// It is assumed that this method is called before writing index partition
// starts
size_t PartitionedIndexBuilder::EstimatedSize() const {
size_t total = 0;
for (auto it = entries_.begin(); it != entries_.end(); ++it) {
total += it->value->EstimatedSize();
}
total +=
sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize();
return total;
}
// Since when this method is called we do not know the index block offsets yet,
// the top-level index does not exist. Hence we estimate the block offsets and
// create a temporary top-level index.
size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize(
uint64_t offset) const {
BlockBuilder tmp_builder(
table_opt_.index_block_restart_interval); // tmp top-level index builder
for (auto it = entries_.begin(); it != entries_.end(); ++it) {
std::string tmp_handle_encoding;
uint64_t size = it->value->EstimatedSize();
BlockHandle tmp_block_handle(offset, size);
tmp_block_handle.EncodeTo(&tmp_handle_encoding);
tmp_builder.Add(
seperator_is_key_plus_seq_ ? it->key : ExtractUserKey(it->key),
tmp_handle_encoding);
offset += size;
}
return tmp_builder.CurrentSizeEstimate();
}
size_t PartitionedIndexBuilder::NumPartitions() const { size_t PartitionedIndexBuilder::NumPartitions() const {
return entries_.size(); return partition_cnt_;
} }
} // namespace rocksdb } // namespace rocksdb

View File

@ -96,13 +96,15 @@ class IndexBuilder {
virtual Status Finish(IndexBlocks* index_blocks, virtual Status Finish(IndexBlocks* index_blocks,
const BlockHandle& last_partition_block_handle) = 0; const BlockHandle& last_partition_block_handle) = 0;
// Get the estimated size for index block. // Get the size for index block. Must be called after ::Finish.
virtual size_t EstimatedSize() const = 0; virtual size_t IndexSize() const = 0;
virtual bool seperator_is_key_plus_seq() { return true; } virtual bool seperator_is_key_plus_seq() { return true; }
protected: protected:
const InternalKeyComparator* comparator_; const InternalKeyComparator* comparator_;
// Set after ::Finish is called
size_t index_size_ = 0;
}; };
// This index builder builds space-efficient index block. // This index builder builds space-efficient index block.
@ -162,15 +164,12 @@ class ShortenedIndexBuilder : public IndexBuilder {
index_blocks->index_block_contents = index_blocks->index_block_contents =
index_block_builder_without_seq_.Finish(); index_block_builder_without_seq_.Finish();
} }
index_size_ = index_blocks->index_block_contents.size();
return Status::OK(); return Status::OK();
} }
virtual size_t EstimatedSize() const override { virtual size_t IndexSize() const override {
if (seperator_is_key_plus_seq_) { return index_size_;
return index_block_builder_.CurrentSizeEstimate();
} else {
return index_block_builder_without_seq_.CurrentSizeEstimate();
}
} }
virtual bool seperator_is_key_plus_seq() override { virtual bool seperator_is_key_plus_seq() override {
@ -272,8 +271,8 @@ class HashIndexBuilder : public IndexBuilder {
return Status::OK(); return Status::OK();
} }
virtual size_t EstimatedSize() const override { virtual size_t IndexSize() const override {
return primary_index_builder_.EstimatedSize() + prefix_block_.size() + return primary_index_builder_.IndexSize() + prefix_block_.size() +
prefix_meta_block_.size(); prefix_meta_block_.size();
} }
@ -338,8 +337,12 @@ class PartitionedIndexBuilder : public IndexBuilder {
IndexBlocks* index_blocks, IndexBlocks* index_blocks,
const BlockHandle& last_partition_block_handle) override; const BlockHandle& last_partition_block_handle) override;
virtual size_t EstimatedSize() const override; virtual size_t IndexSize() const override {
size_t EstimateTopLevelIndexSize(uint64_t) const; return index_size_;
}
size_t TopLevelIndexSize(uint64_t) const {
return top_level_index_size_;
}
size_t NumPartitions() const; size_t NumPartitions() const;
inline bool ShouldCutFilterBlock() { inline bool ShouldCutFilterBlock() {
@ -362,6 +365,11 @@ class PartitionedIndexBuilder : public IndexBuilder {
} }
private: private:
// Set after ::Finish is called
size_t top_level_index_size_ = 0;
// Set after ::Finish is called
size_t partition_cnt_ = 0;
void MakeNewSubIndexBuilder(); void MakeNewSubIndexBuilder();
struct Entry { struct Entry {