reduce memory usage of cuckoo table builder
Summary:
The builder currently buffers all key/value pairs as a vector of pair<string, string>. The std::string overhead makes this far too expensive: 1B key/values (12 bytes of key plus value each) could not fit in 100GB of RAM. Switch to a single plain string that stores the key/value sequence back to back; the same data set now uses only about 12GB of RAM.

Test Plan: db_bench

Reviewers: igor, sdong, yhchiang

Reviewed By: sdong

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D23763
commit 94997eab5e
parent c6275956e2
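The change replaces the per-entry pair<string, string> buffering with one flat std::string that holds the fixed-size key/value records back to back and recovers them by offset arithmetic (the new GetKey()/GetUserKey()/GetValue() helpers in the diff). A rough estimate of why this matters: each pair<string, string> carries two std::string objects (typically 24-32 bytes each) plus their heap allocations, so 1B entries with 12 bytes of payload blow well past 100GB, while the flat layout stores essentially just the 12 bytes per entry, about 12GB. Below is a minimal standalone sketch of that layout; FlatKVBuffer and its interface are illustrative names only and are not part of RocksDB.

// Minimal standalone sketch of the flat key/value buffer idea (not RocksDB code;
// FlatKVBuffer and its members are illustrative names only).
#include <cassert>
#include <cstdint>
#include <string>

class FlatKVBuffer {
 public:
  FlatKVBuffer(uint64_t key_size, uint64_t value_size)
      : key_size_(key_size), value_size_(value_size) {}

  // Append one fixed-size key/value record; only the raw bytes are stored, so
  // there is no per-entry std::string object or heap-allocation overhead.
  void Add(const std::string& key, const std::string& value) {
    assert(key.size() == key_size_ && value.size() == value_size_);
    buf_.append(key);
    buf_.append(value);
    ++num_entries_;
  }

  // Records are recovered by offset arithmetic, mirroring the GetKey()/GetValue()
  // helpers added in this commit.
  std::string Key(uint64_t idx) const {
    return buf_.substr(idx * (key_size_ + value_size_), key_size_);
  }
  std::string Value(uint64_t idx) const {
    return buf_.substr(idx * (key_size_ + value_size_) + key_size_, value_size_);
  }

  uint64_t NumEntries() const { return num_entries_; }

 private:
  const uint64_t key_size_;
  const uint64_t value_size_;
  std::string buf_;           // key0 value0 key1 value1 ...
  uint64_t num_entries_ = 0;  // number of records appended to buf_
};

int main() {
  FlatKVBuffer buf(8, 4);  // 8-byte keys, 4-byte values (12 bytes per record)
  buf.Add("key00001", "val1");
  buf.Add("key00002", "val2");
  assert(buf.Key(1) == "key00002" && buf.Value(0) == "val1");
  return 0;
}

The real builder returns Slices into kvs_ rather than copying substrings, which keeps lookups allocation-free.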
@@ -60,6 +60,9 @@ CuckooTableBuilder::CuckooTableBuilder(
       hash_table_size_(use_module_hash ? 0 : 2),
       is_last_level_file_(false),
       has_seen_first_key_(false),
+      key_size_(0),
+      value_size_(0),
+      num_entries_(0),
       ucomp_(user_comparator),
       use_module_hash_(use_module_hash),
       identity_as_first_hash_(identity_as_first_hash),
@@ -72,7 +75,7 @@ CuckooTableBuilder::CuckooTableBuilder(
 }
 
 void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
-  if (kvs_.size() >= kMaxVectorIdx - 1) {
+  if (num_entries_ >= kMaxVectorIdx - 1) {
     status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1");
     return;
   }
@@ -90,15 +93,18 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
     has_seen_first_key_ = true;
     smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
     largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
+    key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size();
+    value_size_ = value.size();
   }
   // Even if one sequence number is non-zero, then it is not last level.
   assert(!is_last_level_file_ || ikey.sequence == 0);
   if (is_last_level_file_) {
-    kvs_.emplace_back(std::make_pair(
-          ikey.user_key.ToString(), value.ToString()));
+    kvs_.append(ikey.user_key.data(), ikey.user_key.size());
   } else {
-    kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString()));
+    kvs_.append(key.data(), key.size());
   }
+  kvs_.append(value.data(), value.size());
+  ++num_entries_;
 
   // In order to fill the empty buckets in the hash table, we identify a
   // key which is not used so far (unused_user_key). We determine this by
@@ -111,21 +117,32 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) {
     largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size());
   }
   if (!use_module_hash_) {
-    if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) {
+    if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) {
       hash_table_size_ *= 2;
     }
   }
 }
 
+Slice CuckooTableBuilder::GetKey(uint64_t idx) const {
+  return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_);
+}
+
+Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const {
+  return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx));
+}
+
+Slice CuckooTableBuilder::GetValue(uint64_t idx) const {
+  return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_);
+}
+
 Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
   buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1);
   uint64_t make_space_for_key_call_id = 0;
-  for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) {
+  for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) {
     uint64_t bucket_id;
     bool bucket_found = false;
     autovector<uint64_t> hash_vals;
-    Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first :
-        ExtractUserKey(kvs_[vector_idx].first);
+    Slice user_key = GetUserKey(vector_idx);
     for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found;
         ++hash_cnt) {
       uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_,
@@ -140,10 +157,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
          bucket_found = true;
          break;
        } else {
-          if (ucomp_->Compare(user_key, is_last_level_file_
-                ? Slice(kvs_[(*buckets)[hash_val].vector_idx].first)
-                : ExtractUserKey(
-                    kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) {
+          if (ucomp_->Compare(user_key,
+                GetUserKey((*buckets)[hash_val].vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
          }
          hash_vals.push_back(hash_val);
@@ -183,10 +198,10 @@ Status CuckooTableBuilder::Finish() {
   std::vector<CuckooBucket> buckets;
   Status s;
   std::string unused_bucket;
-  if (!kvs_.empty()) {
+  if (num_entries_ > 0) {
     // Calculate the real hash size if module hash is enabled.
     if (use_module_hash_) {
-      hash_table_size_ = kvs_.size() / max_hash_table_ratio_;
+      hash_table_size_ = num_entries_ / max_hash_table_ratio_;
     }
     s = MakeHashTable(&buckets);
     if (!s.ok()) {
@@ -224,14 +239,13 @@ Status CuckooTableBuilder::Finish() {
       AppendInternalKey(&unused_bucket, ikey);
     }
   }
-  properties_.num_entries = kvs_.size();
-  properties_.fixed_key_len = unused_bucket.size();
-  uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size();
-  uint32_t bucket_size = value_length + properties_.fixed_key_len;
+  properties_.num_entries = num_entries_;
+  properties_.fixed_key_len = key_size_;
   properties_.user_collected_properties[
         CuckooTablePropertyNames::kValueLength].assign(
-        reinterpret_cast<const char*>(&value_length), sizeof(value_length));
+        reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+
+  uint64_t bucket_size = key_size_ + value_size_;
   unused_bucket.resize(bucket_size, 'a');
   // Write the table.
   uint32_t num_added = 0;
@@ -240,9 +254,9 @@ Status CuckooTableBuilder::Finish() {
       s = file_->Append(Slice(unused_bucket));
     } else {
       ++num_added;
-      s = file_->Append(kvs_[bucket.vector_idx].first);
+      s = file_->Append(GetKey(bucket.vector_idx));
       if (s.ok()) {
-        s = file_->Append(kvs_[bucket.vector_idx].second);
+        s = file_->Append(GetValue(bucket.vector_idx));
       }
     }
     if (!s.ok()) {
@@ -251,7 +265,7 @@ Status CuckooTableBuilder::Finish() {
   }
   assert(num_added == NumEntries());
   properties_.raw_key_size = num_added * properties_.fixed_key_len;
-  properties_.raw_value_size = num_added * value_length;
+  properties_.raw_value_size = num_added * value_size_;
 
   uint64_t offset = buckets.size() * bucket_size;
   properties_.data_size = offset;
@@ -330,19 +344,18 @@ void CuckooTableBuilder::Abandon() {
 }
 
 uint64_t CuckooTableBuilder::NumEntries() const {
-  return kvs_.size();
+  return num_entries_;
 }
 
 uint64_t CuckooTableBuilder::FileSize() const {
   if (closed_) {
     return file_->GetFileSize();
-  } else if (kvs_.size() == 0) {
+  } else if (num_entries_ == 0) {
     return 0;
   }
 
   if (use_module_hash_) {
-    return (kvs_[0].first.size() + kvs_[0].second.size()) * kvs_.size() /
-        max_hash_table_ratio_;
+    return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_;
   } else {
     // Account for buckets being a power of two.
     // As elements are added, file size remains constant for a while and
@@ -350,11 +363,10 @@ uint64_t CuckooTableBuilder::FileSize() const {
     // only after it exceeds the file limit, we account for the extra element
     // being added here.
    uint64_t expected_hash_table_size = hash_table_size_;
-    if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) {
+    if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) {
       expected_hash_table_size *= 2;
     }
-    return (kvs_[0].first.size() + kvs_[0].second.size()) *
-        expected_hash_table_size - 1;
+    return (key_size_ + value_size_) * expected_hash_table_size - 1;
   }
 }
 
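As a rough sanity check of the rewritten module-hash estimate above, applied to the numbers in the commit summary: the 8 + 4 byte key/value split and the 0.9 hash-table occupancy ratio below are assumed example values, not taken from this patch.

// Back-of-the-envelope check of FileSize() under module hash, using assumed
// example parameters (8-byte keys, 4-byte values, occupancy ratio 0.9).
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t key_size = 8;
  const uint64_t value_size = 4;            // 12 bytes per entry in total
  const uint64_t num_entries = 1000000000;  // 1B key/values
  const double max_hash_table_ratio = 0.9;  // assumed occupancy target
  // Mirrors (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_.
  const double estimated_bytes =
      static_cast<double>(key_size + value_size) * num_entries / max_hash_table_ratio;
  std::printf("estimated data size: %.1f GB\n", estimated_bytes / 1e9);  // ~13.3 GB
  return 0;
}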
@@ -390,7 +402,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
   // of the method. We store this number into the nodes that we explore in
   // current method call.
   // It is unlikely for the increment operation to overflow because the maximum
-  // no. of times this will be called is <= max_num_hash_func_ + kvs_.size().
+  // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
   for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
     uint64_t bucket_id = hash_vals[hash_cnt];
     (*buckets)[bucket_id].make_space_for_key_call_id =
@@ -408,9 +420,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
     CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id];
     for (uint32_t hash_cnt = 0;
         hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
-      uint64_t child_bucket_id = CuckooHash(
-          (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first :
-           ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))),
+      uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
        hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
        get_slice_hash_);
       // Iterate inside Cuckoo Block.

@@ -75,6 +75,10 @@ class CuckooTableBuilder: public TableBuilder {
       uint64_t* bucket_id);
   Status MakeHashTable(std::vector<CuckooBucket>* buckets);
 
+  inline Slice GetKey(uint64_t idx) const;
+  inline Slice GetUserKey(uint64_t idx) const;
+  inline Slice GetValue(uint64_t idx) const;
+
   uint32_t num_hash_func_;
   WritableFile* file_;
   const double max_hash_table_ratio_;
@@ -83,10 +87,17 @@ class CuckooTableBuilder: public TableBuilder {
   const uint32_t cuckoo_block_size_;
   uint64_t hash_table_size_;
   bool is_last_level_file_;
-  Status status_;
-  std::vector<std::pair<std::string, std::string>> kvs_;
-  TableProperties properties_;
   bool has_seen_first_key_;
+  uint64_t key_size_;
+  uint64_t value_size_;
+  // A list of fixed-size key-value pairs concatenating into a string.
+  // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific
+  // key / value given an index
+  std::string kvs_;
+  // Number of key-value pairs stored in kvs_
+  uint64_t num_entries_;
+  Status status_;
+  TableProperties properties_;
   const Comparator* ucomp_;
   bool use_module_hash_;
   bool identity_as_first_hash_;