From 606a12670310a190c37c9431dfd1ed50b3d074b9 Mon Sep 17 00:00:00 2001 From: Radheshyam Balasundaram Date: Tue, 5 Aug 2014 20:55:46 -0700 Subject: [PATCH] Changing implementaiton of CuckooTableBuilder to not take file_size, key_length, value_length. Summary: - Maintain a list of key-value pairs as vectors during Add operation. - Start building hash table only when Finish() is called. - This approach takes more time and space but avoids taking file_size, key and value lengths. - Rewrote cuckoo_table_builder_test I did not know about IterKey while writing this diff. I shall change places where IterKey could be used instead of std::string tomorrow. Please review rest of the logic. Test Plan: cuckoo_table_reader_test --enable_perf cuckoo_table_builder_test valgrind_check asan_check Reviewers: sdong, igor, yhchiang, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D20907 --- table/cuckoo_table_builder.cc | 263 ++++++------ table/cuckoo_table_builder.h | 48 ++- table/cuckoo_table_builder_test.cc | 660 +++++++++++++---------------- table/cuckoo_table_reader.cc | 6 +- table/cuckoo_table_reader.h | 4 +- table/cuckoo_table_reader_test.cc | 54 +-- 6 files changed, 480 insertions(+), 555 deletions(-) diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index df0fae044..3643197ff 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -36,25 +37,18 @@ const std::string CuckooTablePropertyNames::kIsLastLevel = extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; CuckooTableBuilder::CuckooTableBuilder( - WritableFile* file, uint32_t fixed_key_length, - uint32_t fixed_value_length, double hash_table_ratio, - uint64_t file_size, uint32_t max_num_hash_table, - uint32_t max_search_depth, bool is_last_level, - uint64_t (*GetSliceHashPtr)(const Slice&, uint32_t, uint64_t)) + WritableFile* file, double hash_table_ratio, + uint32_t max_num_hash_table, uint32_t max_search_depth, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : num_hash_table_(2), file_(file), - value_length_(fixed_value_length), - // 8 is the difference between sizes of user key and InternalKey. - bucket_size_(fixed_key_length + - fixed_value_length - (is_last_level ? 8 : 0)), hash_table_ratio_(hash_table_ratio), - max_num_buckets_(file_size / bucket_size_), max_num_hash_table_(max_num_hash_table), max_search_depth_(max_search_depth), - is_last_level_file_(is_last_level), - buckets_(max_num_buckets_), - make_space_for_key_call_id_(0), - GetSliceHash(GetSliceHashPtr) { + is_last_level_file_(false), + has_seen_first_key_(false), + get_slice_hash_(get_slice_hash), + closed_(false) { properties_.num_entries = 0; // Data is in a huge block. properties_.num_data_blocks = 1; @@ -62,105 +56,120 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.filter_size = 0; } -CuckooTableBuilder::~CuckooTableBuilder() { -} - void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (NumEntries() == max_num_buckets_) { - status_ = Status::Corruption("Hash Table is full."); + if (properties_.num_entries >= kMaxVectorIdx - 1) { + status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } - uint64_t bucket_id; - bool bucket_found = false; - autovector hash_vals; ParsedInternalKey ikey; if (!ParseInternalKey(key, &ikey)) { status_ = Status::Corruption("Unable to parse key into inernal key."); return; } - Slice user_key = ikey.user_key; - for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { - uint64_t hash_val = GetSliceHash(user_key, hash_cnt, max_num_buckets_); - if (buckets_[hash_val].is_empty) { - bucket_id = hash_val; - bucket_found = true; - break; - } else { - if (user_key.compare( - is_last_level_file_ ? Slice(buckets_[hash_val].key) - : ExtractUserKey(Slice(buckets_[hash_val].key))) == 0) { - status_ = Status::Corruption("Same key is being inserted again."); - return; - } - hash_vals.push_back(hash_val); - } - } - while (!bucket_found && !MakeSpaceForKey(key, &bucket_id, hash_vals)) { - // Rehash by increashing number of hash tables. - if (num_hash_table_ >= max_num_hash_table_) { - status_ = Status::Corruption("Too many collissions. Unable to hash."); - return; - } - // We don't really need to rehash the entire table because old hashes are - // still valid and we only increased the number of hash functions. - uint64_t hash_val = GetSliceHash(user_key, - num_hash_table_, max_num_buckets_); - ++num_hash_table_; - if (buckets_[hash_val].is_empty) { - bucket_found = true; - bucket_id = hash_val; - break; - } else { - hash_vals.push_back(hash_val); - } + // Determine if we can ignore the sequence number and value type from + // internal keys by looking at sequence number from first key. We assume + // that if first key has a zero sequence number, then all the remaining + // keys will have zero seq. no. + if (!has_seen_first_key_) { + is_last_level_file_ = ikey.sequence == 0; + has_seen_first_key_ = true; } + // Even if one sequence number is non-zero, then it is not last level. + assert(!is_last_level_file_ || ikey.sequence == 0); if (is_last_level_file_) { - buckets_[bucket_id].key.assign(user_key.data(), user_key.size()); + kvs_.emplace_back(std::make_pair( + ikey.user_key.ToString(), value.ToString())); } else { - buckets_[bucket_id].key.assign(key.data(), key.size()); + kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); } - buckets_[bucket_id].value.assign(value.data(), value.size()); - buckets_[bucket_id].is_empty = false; properties_.num_entries++; - // We assume that the keys are inserted in sorted order. To identify an - // unused key, which will be used in filling empty buckets in the table, - // we try to find gaps between successive keys inserted. This is done by - // maintaining the previous key and comparing it with next key. - if (unused_user_key_.empty()) { - if (prev_key_.empty()) { - prev_key_ = user_key.ToString(); - return; - } - std::string new_user_key = prev_key_; + // We assume that the keys are inserted in sorted order as determined by + // Byte-wise comparator. To identify an unused key, which will be used in + // filling empty buckets in the table, we try to find gaps between successive + // keys inserted (ie, latest key and previous in kvs_). + if (unused_user_key_.empty() && kvs_.size() > 1) { + std::string prev_key = is_last_level_file_ ? kvs_[kvs_.size()-1].first + : ExtractUserKey(kvs_[kvs_.size()-1].first).ToString(); + std::string new_user_key = prev_key; new_user_key.back()++; // We ignore carry-overs and check that it is larger than previous key. - if ((new_user_key > prev_key_) && - (new_user_key < user_key.ToString())) { + if (Slice(new_user_key).compare(Slice(prev_key)) > 0 && + Slice(new_user_key).compare(ikey.user_key) < 0) { unused_user_key_ = new_user_key; - } else { - prev_key_ = user_key.ToString(); } } } -Status CuckooTableBuilder::status() const { return status_; } +Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { + uint64_t num_buckets = kvs_.size() / hash_table_ratio_; + buckets->resize(num_buckets); + uint64_t make_space_for_key_call_id = 0; + for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { + uint64_t bucket_id; + bool bucket_found = false; + autovector hash_vals; + Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first : + ExtractUserKey(kvs_[vector_idx].first); + for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { + uint64_t hash_val = get_slice_hash_(user_key, hash_cnt, num_buckets); + if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + bucket_id = hash_val; + bucket_found = true; + break; + } else { + if (user_key.compare(is_last_level_file_ + ? Slice(kvs_[(*buckets)[hash_val].vector_idx].first) + : ExtractUserKey( + kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { + return Status::NotSupported("Same key is being inserted again."); + } + hash_vals.push_back(hash_val); + } + } + while (!bucket_found && !MakeSpaceForKey(hash_vals, + ++make_space_for_key_call_id, buckets, &bucket_id)) { + // Rehash by increashing number of hash tables. + if (num_hash_table_ >= max_num_hash_table_) { + return Status::NotSupported("Too many collissions. Unable to hash."); + } + // We don't really need to rehash the entire table because old hashes are + // still valid and we only increased the number of hash functions. + uint64_t hash_val = get_slice_hash_(user_key, + num_hash_table_, num_buckets); + ++num_hash_table_; + if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + bucket_found = true; + bucket_id = hash_val; + break; + } else { + hash_vals.push_back(hash_val); + } + } + (*buckets)[bucket_id].vector_idx = vector_idx; + } + return Status::OK(); +} Status CuckooTableBuilder::Finish() { assert(!closed_); closed_ = true; - - if (unused_user_key_.empty()) { - if (prev_key_.empty()) { - return Status::Corruption("Unable to find unused key"); - } - // Try to find the key next to prev_key_ by handling carryovers. - std::string new_user_key = prev_key_; + std::vector buckets; + Status s = MakeHashTable(&buckets); + if (!s.ok()) { + return s; + } + if (unused_user_key_.empty() && !kvs_.empty()) { + // Try to find the key next to last key by handling carryovers. + std::string last_key = + is_last_level_file_ ? kvs_[kvs_.size()-1].first + : ExtractUserKey(kvs_[kvs_.size()-1].first).ToString(); + std::string new_user_key = last_key; int curr_pos = new_user_key.size() - 1; while (curr_pos >= 0) { ++new_user_key[curr_pos]; - if (new_user_key > prev_key_) { + if (new_user_key > last_key) { unused_user_key_ = new_user_key; break; } @@ -171,29 +180,32 @@ Status CuckooTableBuilder::Finish() { } } std::string unused_bucket; - if (is_last_level_file_) { - unused_bucket = unused_user_key_; - } else { - ParsedInternalKey ikey(unused_user_key_, 0, kTypeValue); - AppendInternalKey(&unused_bucket, ikey); + if (!kvs_.empty()) { + if (is_last_level_file_) { + unused_bucket = unused_user_key_; + } else { + ParsedInternalKey ikey(unused_user_key_, 0, kTypeValue); + AppendInternalKey(&unused_bucket, ikey); + } } properties_.fixed_key_len = unused_bucket.size(); + uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); + uint32_t bucket_size = value_length + properties_.fixed_key_len; properties_.user_collected_properties[ CuckooTablePropertyNames::kValueLength].assign( - reinterpret_cast(&value_length_), sizeof(value_length_)); + reinterpret_cast(&value_length), sizeof(value_length)); - unused_bucket.resize(bucket_size_, 'a'); + unused_bucket.resize(bucket_size, 'a'); // Write the table. uint32_t num_added = 0; - for (auto& bucket : buckets_) { - Status s; - if (bucket.is_empty) { + for (auto& bucket : buckets) { + if (bucket.vector_idx == kMaxVectorIdx) { s = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(Slice(bucket.key)); + s = file_->Append(kvs_[bucket.vector_idx].first); if (s.ok()) { - s = file_->Append(Slice(bucket.value)); + s = file_->Append(kvs_[bucket.vector_idx].second); } } if (!s.ok()) { @@ -202,17 +214,17 @@ Status CuckooTableBuilder::Finish() { } assert(num_added == NumEntries()); - uint64_t offset = buckets_.size() * bucket_size_; + uint64_t offset = buckets.size() * bucket_size; unused_bucket.resize(properties_.fixed_key_len); properties_.user_collected_properties[ CuckooTablePropertyNames::kEmptyKey] = unused_bucket; properties_.user_collected_properties[ CuckooTablePropertyNames::kNumHashTable].assign( reinterpret_cast(&num_hash_table_), sizeof(num_hash_table_)); + uint64_t num_buckets = buckets.size(); properties_.user_collected_properties[ CuckooTablePropertyNames::kMaxNumBuckets].assign( - reinterpret_cast(&max_num_buckets_), - sizeof(max_num_buckets_)); + reinterpret_cast(&num_buckets), sizeof(num_buckets)); properties_.user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].assign( reinterpret_cast(&is_last_level_file_), @@ -228,7 +240,7 @@ Status CuckooTableBuilder::Finish() { BlockHandle property_block_handle; property_block_handle.set_offset(offset); property_block_handle.set_size(property_block.size()); - Status s = file_->Append(property_block); + s = file_->Append(property_block); offset += property_block.size(); if (!s.ok()) { return s; @@ -266,12 +278,14 @@ uint64_t CuckooTableBuilder::NumEntries() const { uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else { - // This is not the actual size of the file as we need to account for - // hash table ratio. This returns the size of filled buckets in the table - // scaled up by a factor of 1/hash table ratio. - return (properties_.num_entries * bucket_size_) / hash_table_ratio_; + } else if (properties_.num_entries == 0) { + return 0; } + // This is not the actual size of the file as we need to account for + // hash table ratio. This returns the size of filled buckets in the table + // scaled up by a factor of 1/hash_table_ratio. + return ((kvs_[0].first.size() + kvs_[0].second.size()) * + properties_.num_entries) / hash_table_ratio_; } // This method is invoked when there is no place to insert the target key. @@ -284,8 +298,11 @@ uint64_t CuckooTableBuilder::FileSize() const { // move all elements along the path from first level to this empty bucket, to // make space for target key which is inserted at first level (*bucket_id). // If tree depth exceedes max depth, we return false indicating failure. -bool CuckooTableBuilder::MakeSpaceForKey(const Slice& key, - uint64_t *bucket_id, autovector hash_vals) { +bool CuckooTableBuilder::MakeSpaceForKey( + const autovector& hash_vals, + const uint64_t make_space_for_key_call_id, + std::vector* buckets, + uint64_t* bucket_id) { struct CuckooNode { uint64_t bucket_id; uint32_t depth; @@ -302,13 +319,11 @@ bool CuckooTableBuilder::MakeSpaceForKey(const Slice& key, // unique id for this invocation of the method. We store this number into // the nodes that we explore in current method call. // It is unlikely for the increment operation to overflow because the maximum - // number of times this will be called is <= max_num_hash_table_ + - // max_num_buckets_. - ++make_space_for_key_call_id_; + // no. of times this will be called is <= max_num_hash_table_ + kvs_.size(). for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { uint64_t bucket_id = hash_vals[hash_cnt]; - buckets_[bucket_id].make_space_for_key_call_id = - make_space_for_key_call_id_; + (*buckets)[bucket_id].make_space_for_key_call_id = + make_space_for_key_call_id; tree.push_back(CuckooNode(bucket_id, 0, 0)); } bool null_found = false; @@ -318,21 +333,21 @@ bool CuckooTableBuilder::MakeSpaceForKey(const Slice& key, if (curr_node.depth >= max_search_depth_) { break; } - CuckooBucket& curr_bucket = buckets_[curr_node.bucket_id]; + CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_table_; ++hash_cnt) { - uint64_t child_bucket_id = GetSliceHash( - is_last_level_file_ ? curr_bucket.key - : ExtractUserKey(Slice(curr_bucket.key)), - hash_cnt, max_num_buckets_); - if (buckets_[child_bucket_id].make_space_for_key_call_id == - make_space_for_key_call_id_) { + uint64_t child_bucket_id = get_slice_hash_( + is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first + : ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first)), + hash_cnt, buckets->size()); + if ((*buckets)[child_bucket_id].make_space_for_key_call_id == + make_space_for_key_call_id) { continue; } - buckets_[child_bucket_id].make_space_for_key_call_id = - make_space_for_key_call_id_; + (*buckets)[child_bucket_id].make_space_for_key_call_id = + make_space_for_key_call_id; tree.push_back(CuckooNode(child_bucket_id, curr_node.depth + 1, curr_pos)); - if (buckets_[child_bucket_id].is_empty) { + if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { null_found = true; break; } @@ -349,8 +364,8 @@ bool CuckooTableBuilder::MakeSpaceForKey(const Slice& key, uint32_t bucket_to_replace_pos = tree.size()-1; while (bucket_to_replace_pos >= num_hash_table_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; - buckets_[curr_node.bucket_id] = - buckets_[tree[curr_node.parent_pos].bucket_id]; + (*buckets)[curr_node.bucket_id] = + (*buckets)[tree[curr_node.parent_pos].bucket_id]; bucket_to_replace_pos = curr_node.parent_pos; } *bucket_id = tree[bucket_to_replace_pos].bucket_id; diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index d40b1f7ff..7bc9f1d89 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -6,7 +6,9 @@ #pragma once #ifndef ROCKSDB_LITE #include +#include #include +#include #include #include "rocksdb/status.h" #include "table/table_builder.h" @@ -19,14 +21,12 @@ namespace rocksdb { class CuckooTableBuilder: public TableBuilder { public: CuckooTableBuilder( - WritableFile* file, uint32_t fixed_key_length, - uint32_t fixed_value_length, double hash_table_ratio, - uint64_t file_size, uint32_t max_num_hash_table, - uint32_t max_search_depth, bool is_last_level, - uint64_t (*GetSliceHash)(const Slice&, uint32_t, uint64_t)); + WritableFile* file, double hash_table_ratio, uint32_t max_num_hash_table, + uint32_t max_search_depth, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. - ~CuckooTableBuilder(); + ~CuckooTableBuilder() {} // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. @@ -34,7 +34,7 @@ class CuckooTableBuilder: public TableBuilder { void Add(const Slice& key, const Slice& value) override; // Return non-ok iff some error has been detected. - Status status() const override; + Status status() const override { return status_; } // Finish building the table. Stops using the file passed to the // constructor after this function returns. @@ -57,35 +57,37 @@ class CuckooTableBuilder: public TableBuilder { private: struct CuckooBucket { - CuckooBucket(): is_empty(true), make_space_for_key_call_id(0) {} - std::string key; - std::string value; - bool is_empty; - uint64_t make_space_for_key_call_id; + CuckooBucket() + : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {} + uint32_t vector_idx; + // This number will not exceed kvs_.size() + max_num_hash_table_. + // We assume number of items is <= 2^32. + uint32_t make_space_for_key_call_id; }; + static const uint32_t kMaxVectorIdx = std::numeric_limits::max(); - bool MakeSpaceForKey(const Slice& key, uint64_t* bucket_id, - autovector hash_vals); + bool MakeSpaceForKey( + const autovector& hash_vals, + const uint64_t call_id, + std::vector* buckets, + uint64_t* bucket_id); + Status MakeHashTable(std::vector* buckets); uint32_t num_hash_table_; WritableFile* file_; - const uint32_t value_length_; - const uint32_t bucket_size_; const double hash_table_ratio_; - const uint64_t max_num_buckets_; const uint32_t max_num_hash_table_; const uint32_t max_search_depth_; - const bool is_last_level_file_; + bool is_last_level_file_; Status status_; - std::vector buckets_; + std::vector> kvs_; TableProperties properties_; - uint64_t make_space_for_key_call_id_; - uint64_t (*GetSliceHash)(const Slice& s, uint32_t index, + bool has_seen_first_key_; + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); std::string unused_user_key_ = ""; - std::string prev_key_; - bool closed_ = false; // Either Finish() or Abandon() has been called. + bool closed_; // Either Finish() or Abandon() has been called. // No copying allowed CuckooTableBuilder(const CuckooTableBuilder&) = delete; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 2df45fb09..f346f45c9 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -10,26 +10,15 @@ #include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" -#include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" namespace rocksdb { - extern const uint64_t kCuckooTableMagicNumber; namespace { std::unordered_map> hash_map; -void AddHashLookups(const std::string& s, uint64_t bucket_id, - uint32_t num_hash_fun) { - std::vector v; - for (uint32_t i = 0; i < num_hash_fun; i++) { - v.push_back(bucket_id + i); - } - hash_map[s] = v; -} - uint64_t GetSliceHash(const Slice& s, uint32_t index, uint64_t max_num_buckets) { return hash_map[s.ToString()][index]; @@ -40,9 +29,14 @@ class CuckooBuilderTest { public: CuckooBuilderTest() { env_ = Env::Default(); + Options options; + options.allow_mmap_reads = true; + env_options_ = EnvOptions(options); } - void CheckFileContents(const std::string& expected_data, + void CheckFileContents(const std::vector& keys, + const std::vector& values, + const std::vector& expected_locations, std::string expected_unused_bucket, uint64_t expected_max_buckets, uint32_t expected_num_hash_fun, bool expected_is_last_level) { // Read file @@ -55,18 +49,19 @@ class CuckooBuilderTest { TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, kCuckooTableMagicNumber, env_, nullptr, &props)); - ASSERT_EQ(props->num_entries, num_items); - ASSERT_EQ(props->fixed_key_len, key_length); + ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; - ASSERT_EQ(expected_unused_bucket.substr(0, key_length), unused_key); + ASSERT_EQ(expected_unused_bucket.substr(0, + props->fixed_key_len), unused_key); uint32_t value_len_found = *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kValueLength].data()); - ASSERT_EQ(value_length, value_len_found); + ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found); const uint64_t max_buckets = *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kMaxNumBuckets].data()); @@ -80,411 +75,322 @@ class CuckooBuilderTest { CuckooTablePropertyNames::kIsLastLevel].data()); ASSERT_EQ(expected_is_last_level, is_last_level_found); delete props; + // Check contents of the bucket. - std::string read_data; - read_data.resize(expected_data.size()); - Slice read_slice; - ASSERT_OK(read_file->Read(0, expected_data.size(), - &read_slice, &read_data[0])); - ASSERT_EQ(expected_data, read_data); + std::vector keys_found(keys.size(), false); + uint32_t bucket_size = expected_unused_bucket.size(); + for (uint32_t i = 0; i < max_buckets; ++i) { + Slice read_slice; + ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, + &read_slice, nullptr)); + uint32_t key_idx = std::find(expected_locations.begin(), + expected_locations.end(), i) - expected_locations.begin(); + if (key_idx == keys.size()) { + // i is not one of the expected locaitons. Empty bucket. + ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); + } else { + keys_found[key_idx] = true; + ASSERT_EQ(read_slice.compare(keys[key_idx] + values[key_idx]), 0); + } + } + for (auto key_found : keys_found) { + // Check that all keys were found. + ASSERT_TRUE(key_found); + } + } + + std::string GetInternalKey(Slice user_key, bool zero_seqno) { + IterKey ikey; + ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, kTypeValue); + return ikey.GetKey().ToString(); } Env* env_; - const EnvOptions env_options_; + EnvOptions env_options_; std::string fname; - uint64_t file_size = 100000; - uint32_t num_items = 20; - uint32_t num_hash_fun = 64; - double hash_table_ratio = 0.9; - uint32_t ikey_length; - uint32_t user_key_length; - uint32_t key_length; - uint32_t value_length; - uint32_t bucket_length; + const double kHashTableRatio = 0.9; }; -TEST(CuckooBuilderTest, NoCollision) { - hash_map.clear(); - uint32_t expected_num_hash_fun = 2; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - uint64_t bucket_ids = 0; - for (uint32_t i = 0; i < num_items; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - AddHashLookups(user_keys[i], bucket_ids, num_hash_fun); - bucket_ids += num_hash_fun; - } - - ikey_length = keys[0].size(); - key_length = ikey_length; - value_length = values[0].size(); - bucket_length = ikey_length + value_length; - uint64_t expected_max_buckets = file_size / bucket_length; - std::string expected_unused_user_key = "keys10:"; - ParsedInternalKey ikey(expected_unused_user_key, 0, kTypeValue); - std::string expected_unused_bucket; - AppendInternalKey(&expected_unused_bucket, ikey); - expected_unused_bucket.resize(bucket_length, 'a'); +TEST(CuckooBuilderTest, SuccessWithEmptyFile) { unique_ptr writable_file; - fname = test::TmpDir() + "/BasicTest_writable_file"; + fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, - file_size, num_hash_fun, 100, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - uint32_t key_idx = 0; - std::string expected_file_data = ""; - for (uint32_t i = 0; i < expected_max_buckets; i++) { - if (key_idx * num_hash_fun == i && key_idx < num_items) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - ASSERT_OK(cuckoo_builder.status()); - expected_file_data.append(keys[key_idx] + values[key_idx]); - ++key_idx; - } else { - expected_file_data.append(expected_unused_bucket); - } - } - ASSERT_OK(cuckoo_builder.Finish()); - writable_file->Close(); - CheckFileContents(expected_file_data, expected_unused_bucket, - expected_max_buckets, expected_num_hash_fun, false); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + 4, 100, GetSliceHash); + ASSERT_OK(builder.status()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + CheckFileContents({}, {}, {}, "", 0, 2, false); } -TEST(CuckooBuilderTest, NoCollisionLastLevel) { - hash_map.clear(); - uint32_t expected_num_hash_fun = 2; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - uint64_t bucket_ids = 0; - for (uint32_t i = 0; i < num_items; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - // Set zero sequence number in all keys. - ParsedInternalKey ikey(user_keys[i], 0, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - AddHashLookups(user_keys[i], bucket_ids, num_hash_fun); - bucket_ids += num_hash_fun; +TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values = {"v01", "v02", "v03", "v04"}; + hash_map = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}} + }; + std::vector expected_locations = {0, 1, 2, 3}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); } - ikey_length = keys[0].size(); - user_key_length = user_keys[0].size(); - key_length = user_key_length; - value_length = values[0].size(); - bucket_length = key_length + value_length; - uint64_t expected_max_buckets = file_size / bucket_length; - std::string expected_unused_bucket = "keys10:"; - expected_unused_bucket.resize(bucket_length, 'a'); + unique_ptr writable_file; - fname = test::TmpDir() + "/NoCollisionLastLevel_writable_file"; + fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, - file_size, num_hash_fun, 100, true, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - uint32_t key_idx = 0; - std::string expected_file_data = ""; - for (uint32_t i = 0; i < expected_max_buckets; i++) { - if (key_idx * num_hash_fun == i && key_idx < num_items) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - ASSERT_OK(cuckoo_builder.status()); - expected_file_data.append(user_keys[key_idx] + values[key_idx]); - ++key_idx; - } else { - expected_file_data.append(expected_unused_bucket); - } + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); } - ASSERT_OK(cuckoo_builder.Finish()); - writable_file->Close(); - CheckFileContents(expected_file_data, expected_unused_bucket, - expected_max_buckets, expected_num_hash_fun, true); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = keys.size() / kHashTableRatio; + std::string expected_unused_bucket = GetInternalKey("key05", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 2, false); } -TEST(CuckooBuilderTest, WithCollision) { - // Take keys with colliding hash function values. - hash_map.clear(); - num_hash_fun = 20; - num_items = num_hash_fun; - uint32_t expected_num_hash_fun = num_hash_fun; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - for (uint32_t i = 0; i < num_items; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - // Make all hash values collide. - AddHashLookups(user_keys[i], 0, num_hash_fun); +TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values = {"v01", "v02", "v03", "v04"}; + hash_map = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + std::vector expected_locations = {0, 1, 2, 3}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); } - ikey_length = keys[0].size(); - value_length = values[0].size(); - key_length = ikey_length; - bucket_length = key_length + value_length; - uint64_t expected_max_buckets = file_size / bucket_length; - std::string expected_unused_user_key = "keys10:"; - ParsedInternalKey ikey(expected_unused_user_key, 0, kTypeValue); - std::string expected_unused_bucket; - AppendInternalKey(&expected_unused_bucket, ikey); - expected_unused_bucket.resize(bucket_length, 'a'); + unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollision_writable_file"; + fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), key_length, value_length, hash_table_ratio, - file_size, num_hash_fun, 100, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - uint32_t key_idx = 0; - std::string expected_file_data = ""; - for (uint32_t i = 0; i < expected_max_buckets; i++) { - if (key_idx == i && key_idx < num_items) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - ASSERT_OK(cuckoo_builder.status()); - expected_file_data.append(keys[key_idx] + values[key_idx]); - ++key_idx; - } else { - expected_file_data.append(expected_unused_bucket); - } + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); } - ASSERT_OK(cuckoo_builder.Finish()); - writable_file->Close(); - CheckFileContents(expected_file_data, expected_unused_bucket, - expected_max_buckets, expected_num_hash_fun, false); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = keys.size() / kHashTableRatio; + std::string expected_unused_bucket = GetInternalKey("key05", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 4, false); } -TEST(CuckooBuilderTest, FailWithTooManyCollisions) { - // Take keys with colliding hash function values. - // Take more keys than the number of hash functions. - hash_map.clear(); - num_hash_fun = 20; - num_items = num_hash_fun + 1; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - for (uint32_t i = 0; i < num_items; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - // Make all hash values collide. - AddHashLookups(user_keys[i], 0, num_hash_fun); - } - ikey_length = keys[0].size(); - value_length = values[0].size(); - unique_ptr writable_file; - fname = test::TmpDir() + "/FailWithTooManyCollisions_writable"; - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, file_size, num_hash_fun, - 100, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - for (uint32_t key_idx = 0; key_idx < num_items-1; key_idx++) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_OK(cuckoo_builder.status()); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - } - cuckoo_builder.Add(Slice(keys.back()), Slice(values.back())); - ASSERT_TRUE(cuckoo_builder.status().IsCorruption()); - cuckoo_builder.Abandon(); - writable_file->Close(); -} - -TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { - hash_map.clear(); - std::string user_key = "repeatedkey"; - AddHashLookups(user_key, 0, num_hash_fun); - std::string key_to_reuse1, key_to_reuse2; - ParsedInternalKey ikey1(user_key, 1000, kTypeValue); - ParsedInternalKey ikey2(user_key, 1001, kTypeValue); - AppendInternalKey(&key_to_reuse1, ikey1); - AppendInternalKey(&key_to_reuse2, ikey2); - std::string value = "value"; - ikey_length = key_to_reuse1.size(); - value_length = value.size(); - unique_ptr writable_file; - fname = test::TmpDir() + "/FailWhenSameKeyInserted_writable_file"; - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, file_size, num_hash_fun, - 100, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - cuckoo_builder.Add(Slice(key_to_reuse1), Slice(value)); - ASSERT_OK(cuckoo_builder.status()); - ASSERT_EQ(cuckoo_builder.NumEntries(), 1U); - cuckoo_builder.Add(Slice(key_to_reuse2), Slice(value)); - ASSERT_TRUE(cuckoo_builder.status().IsCorruption()); - cuckoo_builder.Abandon(); - writable_file->Close(); -} - -TEST(CuckooBuilderTest, WithACollisionPath) { - hash_map.clear(); +TEST(CuckooBuilderTest, WithCollisionPathFullKey) { // Have two hash functions. Insert elements with overlapping hashes. // Finally insert an element with hash value somewhere in the middle // so that it displaces all the elements after that. - num_hash_fun = 2; - uint32_t expected_num_hash_fun = num_hash_fun; - uint32_t max_search_depth = 100; - num_items = 2*max_search_depth + 2; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - std::vector expected_bucket_id(num_items); - for (uint32_t i = 0; i < num_items - 1; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - // Make all hash values collide with the next element. - AddHashLookups(user_keys[i], i, num_hash_fun); - if (i <= max_search_depth) { - expected_bucket_id[i] = i; - } else { - expected_bucket_id[i] = i+1; - } - } - user_keys.back() = "keys" + std::to_string(num_items + 99); - ParsedInternalKey ikey(user_keys.back(), num_items + 1000, kTypeValue); - AppendInternalKey(&keys.back(), ikey); - values.back() = "value" + std::to_string(num_items+100); - // Make hash values collide with first and middle elements. - // Inserting at 0 will fail after exceeding search depth limit. - hash_map[user_keys.back()] = {0, max_search_depth + 1}; - expected_bucket_id.back() = max_search_depth + 1; - - ikey_length = keys[0].size(); - value_length = values[0].size(); - key_length = ikey_length; - bucket_length = key_length + value_length; - - uint64_t expected_max_buckets = file_size / bucket_length; - std::string expected_unused_user_key = "keys10:"; - ikey = ParsedInternalKey(expected_unused_user_key, 0, kTypeValue); - std::string expected_unused_bucket; - AppendInternalKey(&expected_unused_bucket, ikey); - expected_unused_bucket.resize(bucket_length, 'a'); - std::string expected_file_data = ""; - for (uint32_t i = 0; i < expected_max_buckets; i++) { - expected_file_data += expected_unused_bucket; + uint32_t num_hash_fun = 2; + std::vector user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector values = {"v01", "v02", "v03", "v04", "v05"}; + hash_map = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 2}}, + }; + std::vector expected_locations = {0, 1, 3, 4, 2}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false)); } unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionPath_writable_file"; + fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), key_length, - value_length, hash_table_ratio, file_size, - num_hash_fun, max_search_depth, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - for (uint32_t key_idx = 0; key_idx < num_items; key_idx++) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_OK(cuckoo_builder.status()); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - expected_file_data.replace(expected_bucket_id[key_idx]*bucket_length, - bucket_length, keys[key_idx] + values[key_idx]); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); } - ASSERT_OK(cuckoo_builder.Finish()); - writable_file->Close(); - CheckFileContents(expected_file_data, expected_unused_bucket, - expected_max_buckets, expected_num_hash_fun, false); + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = keys.size() / kHashTableRatio; + std::string expected_unused_bucket = GetInternalKey("key06", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 2, false); +} + +TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values = {"v01", "v02", "v03", "v04"}; + hash_map = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}} + }; + std::vector expected_locations = {0, 1, 2, 3}; + + unique_ptr writable_file; + fname = test::TmpDir() + "/NoCollisionUserKey"; + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; + std::string expected_unused_bucket = "key05"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 2, true); +} + +TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector values = {"v01", "v02", "v03", "v04"}; + hash_map = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {0, 1, 2, 3}}, + {user_keys[2], {0, 1, 2, 3}}, + {user_keys[3], {0, 1, 2, 3}}, + }; + std::vector expected_locations = {0, 1, 2, 3}; + + unique_ptr writable_file; + fname = test::TmpDir() + "/WithCollisionUserKey"; + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; + std::string expected_unused_bucket = "key05"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 4, true); +} + +TEST(CuckooBuilderTest, WithCollisionPathUserKey) { + uint32_t num_hash_fun = 2; + std::vector user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + std::vector values = {"v01", "v02", "v03", "v04", "v05"}; + hash_map = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 2}}, + }; + std::vector expected_locations = {0, 1, 3, 4, 2}; + + unique_ptr writable_file; + fname = test::TmpDir() + "/WithCollisionPathUserKey"; + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 2, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + ASSERT_OK(builder.Finish()); + ASSERT_OK(writable_file->Close()); + + uint32_t expected_max_buckets = user_keys.size() / kHashTableRatio; + std::string expected_unused_bucket = "key06"; + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(user_keys, values, expected_locations, + expected_unused_bucket, expected_max_buckets, 2, true); } TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { - hash_map.clear(); // Have two hash functions. Insert elements with overlapping hashes. - // Finally insert an element which will displace all the current elements. - num_hash_fun = 2; + // Finally try inserting an element with hash value somewhere in the middle + // and it should fail because the no. of elements to displace is too high. + uint32_t num_hash_fun = 2; + std::vector user_keys = {"key01", "key02", "key03", + "key04", "key05"}; + hash_map = { + {user_keys[0], {0, 1}}, + {user_keys[1], {1, 2}}, + {user_keys[2], {2, 3}}, + {user_keys[3], {3, 4}}, + {user_keys[4], {0, 1}}, + }; - uint32_t max_search_depth = 100; - num_items = 2*max_search_depth + 3; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - for (uint32_t i = 0; i < num_items - 1; i++) { - user_keys[i] = "keys" + std::to_string(i+100); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - // Make all hash values collide with the next element. - AddHashLookups(user_keys[i], i, num_hash_fun); - } - user_keys.back() = "keys" + std::to_string(num_items + 99); - ParsedInternalKey ikey(user_keys.back(), num_items + 1000, kTypeValue); - AppendInternalKey(&keys.back(), ikey); - values.back() = "value" + std::to_string(num_items+100); - // Make hash values collide with middle element. - hash_map[user_keys.back()] = {0, max_search_depth + 1}; - - ikey_length = keys[0].size(); - value_length = values[0].size(); unique_ptr writable_file; - fname = test::TmpDir() + "/FailWhenCollisionPathTooLong_writable"; + fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, file_size, num_hash_fun, - max_search_depth, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - for (uint32_t key_idx = 0; key_idx < num_items-1; key_idx++) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_OK(cuckoo_builder.status()); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 2, GetSliceHash); + ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); } - cuckoo_builder.Add(Slice(keys.back()), Slice(values.back())); - ASSERT_TRUE(cuckoo_builder.status().IsCorruption()); - cuckoo_builder.Abandon(); - writable_file->Close(); + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(writable_file->Close()); } -TEST(CuckooBuilderTest, FailWhenTableIsFull) { - hash_map.clear(); - file_size = 160; - - num_items = 7; - std::vector user_keys(num_items); - std::vector keys(num_items); - std::vector values(num_items); - for (uint32_t i = 0; i < num_items; i++) { - user_keys[i] = "keys" + std::to_string(i+1000); - ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); - AppendInternalKey(&keys[i], ikey); - values[i] = "value" + std::to_string(i+100); - AddHashLookups(user_keys[i], i, num_hash_fun); - } - ikey_length = keys[0].size(); - value_length = values[0].size(); - bucket_length = ikey_length + value_length; - // Check that number of items is tight. - ASSERT_GT(bucket_length * num_items, file_size); - ASSERT_LE(bucket_length * (num_items-1), file_size); +TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { + hash_map = {{"repeatedkey", {0, 1, 2, 3}}}; + uint32_t num_hash_fun = 4; + std::string user_key = "repeatedkey"; unique_ptr writable_file; - fname = test::TmpDir() + "/FailWhenTabelIsFull_writable"; + fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder cuckoo_builder( - writable_file.get(), ikey_length, - value_length, hash_table_ratio, file_size, num_hash_fun, - 100, false, GetSliceHash); - ASSERT_OK(cuckoo_builder.status()); - for (uint32_t key_idx = 0; key_idx < num_items-1; key_idx++) { - cuckoo_builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); - ASSERT_OK(cuckoo_builder.status()); - ASSERT_EQ(cuckoo_builder.NumEntries(), key_idx + 1); - } - cuckoo_builder.Add(Slice(keys.back()), Slice(values.back())); - ASSERT_TRUE(cuckoo_builder.status().IsCorruption()); - cuckoo_builder.Abandon(); - writable_file->Close(); + CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, + num_hash_fun, 100, GetSliceHash); + ASSERT_OK(builder.status()); + + builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); + ASSERT_EQ(builder.NumEntries(), 1); + ASSERT_OK(builder.status()); + builder.Add(Slice(GetInternalKey(user_key, true)), Slice("value2")); + ASSERT_EQ(builder.NumEntries(), 2); + ASSERT_OK(builder.status()); + + ASSERT_TRUE(builder.Finish().IsNotSupported()); + ASSERT_OK(writable_file->Close()); } } // namespace rocksdb diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index bde472418..f15213d58 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -28,9 +28,9 @@ CuckooTableReader::CuckooTableReader( const Options& options, std::unique_ptr&& file, uint64_t file_size, - uint64_t (*GetSliceHashPtr)(const Slice&, uint32_t, uint64_t)) + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : file_(std::move(file)), - GetSliceHash(GetSliceHashPtr) { + get_slice_hash_(get_slice_hash) { if (!options.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } @@ -90,7 +90,7 @@ Status CuckooTableReader::Get( return Status::Corruption("Unable to parse key into inernal key."); } for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) { - uint64_t hash_val = GetSliceHash(ikey.user_key, hash_cnt, num_buckets_); + uint64_t hash_val = get_slice_hash_(ikey.user_key, hash_cnt, num_buckets_); assert(hash_val < num_buckets_); uint64_t offset = hash_val * bucket_length_; const char* bucket = &file_data_.data()[offset]; diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 4e99fc72e..a4b5b72f9 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -29,7 +29,7 @@ class CuckooTableReader: public TableReader { const Options& options, std::unique_ptr&& file, uint64_t file_size, - uint64_t (*GetSliceHash)(const Slice&, uint32_t, uint64_t)); + uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); ~CuckooTableReader() {} std::shared_ptr GetTableProperties() const override { @@ -67,7 +67,7 @@ class CuckooTableReader: public TableReader { uint32_t value_length_; uint32_t bucket_length_; uint64_t num_buckets_; - uint64_t (*GetSliceHash)(const Slice& s, uint32_t index, + uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); }; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 81746aa62..214603909 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -101,12 +101,11 @@ class CuckooReaderTest { return std::string(reinterpret_cast(&i), sizeof(i)); } - void CreateCuckooFile(bool is_last_level) { + void CreateCuckooFileAndCheckReader() { unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), keys[0].size(), values[0].size(), 0.9, - 10000, kNumHashFunc, 100, is_last_level, GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -117,9 +116,8 @@ class CuckooReaderTest { ASSERT_EQ(num_items, builder.NumEntries()); file_size = builder.FileSize(); ASSERT_OK(writable_file->Close()); - } - void CheckReader() { + // Check reader now. unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); CuckooTableReader reader( @@ -135,6 +133,14 @@ class CuckooReaderTest { ASSERT_EQ(1, v.call_count); } } + void UpdateKeys(bool with_zero_seqno) { + for (uint32_t i = 0; i < num_items; i++) { + ParsedInternalKey ikey(user_keys[i], + with_zero_seqno ? 0 : i + 1000, kTypeValue); + keys[i].clear(); + AppendInternalKey(&keys[i], ikey); + } + } void CheckIterator() { unique_ptr read_file; @@ -216,23 +222,22 @@ TEST(CuckooReaderTest, WhenKeyExists) { AppendInternalKey(&keys[i], ikey); values[i] = "value" + NumToStr(i); // Give disjoint hash values. - AddHashLookups(user_keys[i], i * kNumHashFunc, kNumHashFunc); + AddHashLookups(user_keys[i], i, kNumHashFunc); } - CreateCuckooFile(false); - CheckReader(); + CreateCuckooFileAndCheckReader(); // Last level file. - CreateCuckooFile(true); - CheckReader(); + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); // Test with collision. Make all hash values collide. hash_map.clear(); for (uint32_t i = 0; i < num_items; i++) { AddHashLookups(user_keys[i], 0, kNumHashFunc); } - CreateCuckooFile(false); - CheckReader(); + UpdateKeys(false); + CreateCuckooFileAndCheckReader(); // Last level file. - CreateCuckooFile(true); - CheckReader(); + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); } TEST(CuckooReaderTest, CheckIterator) { @@ -244,18 +249,19 @@ TEST(CuckooReaderTest, CheckIterator) { AppendInternalKey(&keys[i], ikey); values[i] = "value" + NumToStr(i); // Give disjoint hash values, in reverse order. - AddHashLookups(user_keys[i], (num_items-i-1)*kNumHashFunc, kNumHashFunc); + AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc); } - CreateCuckooFile(false); + CreateCuckooFileAndCheckReader(); CheckIterator(); // Last level file. - CreateCuckooFile(true); + UpdateKeys(true); + CreateCuckooFileAndCheckReader(); CheckIterator(); } TEST(CuckooReaderTest, WhenKeyNotFound) { // Add keys with colliding hash values. - SetUp(kNumHashFunc / 2); + SetUp(kNumHashFunc); fname = test::TmpDir() + "/CuckooReader_WhenKeyNotFound"; for (uint64_t i = 0; i < num_items; i++) { user_keys[i] = "key" + NumToStr(i); @@ -265,8 +271,7 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Make all hash values collide. AddHashLookups(user_keys[i], 0, kNumHashFunc); } - CreateCuckooFile(false); - CheckReader(); + CreateCuckooFileAndCheckReader(); unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); CuckooTableReader reader( @@ -351,20 +356,17 @@ void BM_CuckooRead(uint64_t num, uint32_t key_length, } std::string fname = FLAGS_file_dir + "/cuckoo_read_benchmark"; - uint64_t predicted_file_size = - num * (key_length + value_length) / hash_ratio + 1024; - unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), key_length + 8, value_length, hash_ratio, - predicted_file_size, kMaxNumHashTable, 1000, true, GetSliceMurmurHash); + writable_file.get(), hash_ratio, + kMaxNumHashTable, 1000, GetSliceMurmurHash); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. std::string new_key(reinterpret_cast(&key_idx), sizeof(key_idx)); new_key = std::string(key_length - new_key.size(), 'k') + new_key; - ParsedInternalKey ikey(new_key, num, kTypeValue); + ParsedInternalKey ikey(new_key, 0, kTypeValue); std::string full_key; AppendInternalKey(&full_key, ikey); builder.Add(Slice(full_key), Slice(&full_key[0], value_length));