From 6d37fdb365776fae66fd88b3022dddc56fb1cd33 Mon Sep 17 00:00:00 2001 From: Fenggang Wu Date: Mon, 20 Aug 2018 23:04:08 -0700 Subject: [PATCH] DataBlockHashIndex: Remove the division from EstimateSize() (#4293) Summary: `BlockBasedTableBuilder::Add()` eventually calls `DataBlockHashIndexBuilder::EstimateSize()`. The previous implementation divides the `num_keys` by the `util_ratio_` to get an estimizted `num_buckets`. Such division is expensive as it happens in every `BlockBasedTableBuilder::Add()`. This diff estimates the `num_buckets` by double addition instead of double division. Specifically, in each `Add()`, we add `bucket_per_key_` (inverse of `util_ratio_`) to the current `estimiated_num_buckets_`. The cost is that we are gonna have the `estimated_num_buckets_` stored as one extra field in the DataBlockHashIndexBuilder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4293 Differential Revision: D9412472 Pulled By: fgwu fbshipit-source-id: 2925c8509a401e7bd3c1ab1d9e9c7244755c277a --- table/block_builder.cc | 1 - table/data_block_hash_index.cc | 8 +++++--- table/data_block_hash_index.h | 16 ++++++++++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/table/block_builder.cc b/table/block_builder.cc index d7051d2db..37b407ea5 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -106,7 +106,6 @@ size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) estimate += VarintLength(value.size()); // varint for value length. } - // TODO(fwu): add the delta of the DataBlockHashIndex return estimate; } diff --git a/table/data_block_hash_index.cc b/table/data_block_hash_index.cc index bdb784622..adb1d7b8c 100644 --- a/table/data_block_hash_index.cc +++ b/table/data_block_hash_index.cc @@ -23,12 +23,13 @@ void DataBlockHashIndexBuilder::Add(const Slice& key, uint32_t hash_value = GetSliceHash(key); hash_and_restart_pairs_.emplace_back(hash_value, static_cast(restart_index)); + estimated_num_buckets_ += bucket_per_key_; } void DataBlockHashIndexBuilder::Finish(std::string& buffer) { assert(Valid()); - uint16_t num_buckets = static_cast( - static_cast(hash_and_restart_pairs_.size()) / util_ratio_); + uint16_t num_buckets = static_cast(estimated_num_buckets_); + if (num_buckets == 0) { num_buckets = 1; // sanity check } @@ -66,8 +67,9 @@ void DataBlockHashIndexBuilder::Finish(std::string& buffer) { } void DataBlockHashIndexBuilder::Reset() { - hash_and_restart_pairs_.clear(); + estimated_num_buckets_ = 0; valid_ = true; + hash_and_restart_pairs_.clear(); } void DataBlockHashIndex::Initialize(const char* data, uint16_t size, diff --git a/table/data_block_hash_index.h b/table/data_block_hash_index.h index a25df07e8..0af8b257c 100644 --- a/table/data_block_hash_index.h +++ b/table/data_block_hash_index.h @@ -72,23 +72,26 @@ const double kDefaultUtilRatio = 0.75; class DataBlockHashIndexBuilder { public: - DataBlockHashIndexBuilder() : util_ratio_(0), valid_(false) {} + DataBlockHashIndexBuilder() + : bucket_per_key_(-1 /*uninitialized marker*/), + estimated_num_buckets_(0), + valid_(false) {} void Initialize(double util_ratio) { if (util_ratio <= 0) { util_ratio = kDefaultUtilRatio; // sanity check } - util_ratio_ = util_ratio; + bucket_per_key_ = 1 / util_ratio; valid_ = true; } - inline bool Valid() const { return valid_ && util_ratio_ > 0; } + inline bool Valid() const { return valid_ && bucket_per_key_ > 0; } void Add(const Slice& key, const size_t restart_index); void Finish(std::string& buffer); void Reset(); inline size_t EstimateSize() const { - uint16_t estimated_num_buckets = static_cast( - static_cast(hash_and_restart_pairs_.size()) / util_ratio_); + uint16_t estimated_num_buckets = + static_cast(estimated_num_buckets_); // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish. estimated_num_buckets |= 1; @@ -98,7 +101,8 @@ class DataBlockHashIndexBuilder { } private: - double util_ratio_; + double bucket_per_key_; // is the multiplicative inverse of util_ratio_ + double estimated_num_buckets_; // Now the only usage for `valid_` is to mark false when the inserted // restart_index is larger than supported. In this case HashIndex is not