DataBlockHashIndex: Remove the division from EstimateSize() (#4293)
Summary: `BlockBasedTableBuilder::Add()` eventually calls `DataBlockHashIndexBuilder::EstimateSize()`. The previous implementation divides the `num_keys` by the `util_ratio_` to get an estimizted `num_buckets`. Such division is expensive as it happens in every `BlockBasedTableBuilder::Add()`. This diff estimates the `num_buckets` by double addition instead of double division. Specifically, in each `Add()`, we add `bucket_per_key_` (inverse of `util_ratio_`) to the current `estimiated_num_buckets_`. The cost is that we are gonna have the `estimated_num_buckets_` stored as one extra field in the DataBlockHashIndexBuilder. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4293 Differential Revision: D9412472 Pulled By: fgwu fbshipit-source-id: 2925c8509a401e7bd3c1ab1d9e9c7244755c277a
This commit is contained in:
parent
7188bd34f3
commit
6d37fdb365
@ -106,7 +106,6 @@ size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
|
|||||||
estimate += VarintLength(value.size()); // varint for value length.
|
estimate += VarintLength(value.size()); // varint for value length.
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(fwu): add the delta of the DataBlockHashIndex
|
|
||||||
return estimate;
|
return estimate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,12 +23,13 @@ void DataBlockHashIndexBuilder::Add(const Slice& key,
|
|||||||
uint32_t hash_value = GetSliceHash(key);
|
uint32_t hash_value = GetSliceHash(key);
|
||||||
hash_and_restart_pairs_.emplace_back(hash_value,
|
hash_and_restart_pairs_.emplace_back(hash_value,
|
||||||
static_cast<uint8_t>(restart_index));
|
static_cast<uint8_t>(restart_index));
|
||||||
|
estimated_num_buckets_ += bucket_per_key_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
|
void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
|
||||||
assert(Valid());
|
assert(Valid());
|
||||||
uint16_t num_buckets = static_cast<uint16_t>(
|
uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
|
||||||
static_cast<double>(hash_and_restart_pairs_.size()) / util_ratio_);
|
|
||||||
if (num_buckets == 0) {
|
if (num_buckets == 0) {
|
||||||
num_buckets = 1; // sanity check
|
num_buckets = 1; // sanity check
|
||||||
}
|
}
|
||||||
@ -66,8 +67,9 @@ void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void DataBlockHashIndexBuilder::Reset() {
|
void DataBlockHashIndexBuilder::Reset() {
|
||||||
hash_and_restart_pairs_.clear();
|
estimated_num_buckets_ = 0;
|
||||||
valid_ = true;
|
valid_ = true;
|
||||||
|
hash_and_restart_pairs_.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
|
void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
|
||||||
|
@ -72,23 +72,26 @@ const double kDefaultUtilRatio = 0.75;
|
|||||||
|
|
||||||
class DataBlockHashIndexBuilder {
|
class DataBlockHashIndexBuilder {
|
||||||
public:
|
public:
|
||||||
DataBlockHashIndexBuilder() : util_ratio_(0), valid_(false) {}
|
DataBlockHashIndexBuilder()
|
||||||
|
: bucket_per_key_(-1 /*uninitialized marker*/),
|
||||||
|
estimated_num_buckets_(0),
|
||||||
|
valid_(false) {}
|
||||||
|
|
||||||
void Initialize(double util_ratio) {
|
void Initialize(double util_ratio) {
|
||||||
if (util_ratio <= 0) {
|
if (util_ratio <= 0) {
|
||||||
util_ratio = kDefaultUtilRatio; // sanity check
|
util_ratio = kDefaultUtilRatio; // sanity check
|
||||||
}
|
}
|
||||||
util_ratio_ = util_ratio;
|
bucket_per_key_ = 1 / util_ratio;
|
||||||
valid_ = true;
|
valid_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool Valid() const { return valid_ && util_ratio_ > 0; }
|
inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
|
||||||
void Add(const Slice& key, const size_t restart_index);
|
void Add(const Slice& key, const size_t restart_index);
|
||||||
void Finish(std::string& buffer);
|
void Finish(std::string& buffer);
|
||||||
void Reset();
|
void Reset();
|
||||||
inline size_t EstimateSize() const {
|
inline size_t EstimateSize() const {
|
||||||
uint16_t estimated_num_buckets = static_cast<uint16_t>(
|
uint16_t estimated_num_buckets =
|
||||||
static_cast<double>(hash_and_restart_pairs_.size()) / util_ratio_);
|
static_cast<uint16_t>(estimated_num_buckets_);
|
||||||
|
|
||||||
// Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
|
// Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
|
||||||
estimated_num_buckets |= 1;
|
estimated_num_buckets |= 1;
|
||||||
@ -98,7 +101,8 @@ class DataBlockHashIndexBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
double util_ratio_;
|
double bucket_per_key_; // is the multiplicative inverse of util_ratio_
|
||||||
|
double estimated_num_buckets_;
|
||||||
|
|
||||||
// Now the only usage for `valid_` is to mark false when the inserted
|
// Now the only usage for `valid_` is to mark false when the inserted
|
||||||
// restart_index is larger than supported. In this case HashIndex is not
|
// restart_index is larger than supported. In this case HashIndex is not
|
||||||
|
Loading…
Reference in New Issue
Block a user