2014-07-26 01:37:32 +02:00
|
|
|
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include "table/cuckoo_table_reader.h"
|
|
|
|
|
2014-08-06 01:35:02 +02:00
|
|
|
#include <algorithm>
|
|
|
|
#include <limits>
|
2014-07-26 01:37:32 +02:00
|
|
|
#include <string>
|
2014-08-06 01:35:02 +02:00
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
|
|
|
#include "rocksdb/iterator.h"
|
2014-07-26 01:37:32 +02:00
|
|
|
#include "table/meta_blocks.h"
|
2014-08-06 01:35:02 +02:00
|
|
|
#include "util/arena.h"
|
2014-07-26 01:37:32 +02:00
|
|
|
#include "util/coding.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
extern const uint64_t kCuckooTableMagicNumber;
|
|
|
|
|
|
|
|
// Opens a cuckoo table file for reading.
//
// Reads the table-properties block of `file` and pulls the table geometry
// out of the user-collected properties: number of hash functions, the
// "empty bucket" key, value length, number of buckets and the last-level
// flag. Finally the whole file is read into file_data_.
//
// The constructor never throws; any failure is recorded in status_ and the
// remaining members are left unset.
//
// @param options        Supplies allow_mmap_reads, env and info_log.
// @param file           File to read from; ownership moves into the reader.
// @param file_size      Size of `file` in bytes.
// @param get_slice_hash Hash callback, called as (user key, hash function
//                       index, number of buckets); the result is used as a
//                       bucket index.
CuckooTableReader::CuckooTableReader(
    const Options& options,
    std::unique_ptr<RandomAccessFile>&& file,
    uint64_t file_size,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
    : file_(std::move(file)),
      get_slice_hash_(get_slice_hash) {
  if (!options.allow_mmap_reads) {
    status_ = Status::InvalidArgument("File is not mmaped");
    // Stop here: falling through would let ReadTableProperties() below
    // overwrite this error and the reader would look usable.
    return;
  }
  TableProperties* props = nullptr;
  status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
                                options.env, options.info_log.get(), &props);
  if (!status_.ok()) {
    return;
  }
  table_props_.reset(props);
  auto& user_props = props->user_collected_properties;

  // Number of hash functions used when the table was built.
  auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashTable);
  if (hash_funs == user_props.end()) {
    status_ = Status::InvalidArgument("Number of hash functions not found");
    return;
  }
  // NOTE(review): the property payloads are reinterpreted as raw integers
  // without checking their size; presumably the writer stores them in
  // exactly this width and byte order - confirm against the table builder.
  num_hash_fun_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());

  // Key value that marks a bucket as empty.
  auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
  if (unused_key == user_props.end()) {
    status_ = Status::InvalidArgument("Empty bucket value not found");
    return;
  }
  unused_key_ = unused_key->second;

  key_length_ = props->fixed_key_len;
  auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
  if (value_length == user_props.end()) {
    status_ = Status::InvalidArgument("Value length not found");
    return;
  }
  value_length_ = *reinterpret_cast<const uint32_t*>(
      value_length->second.data());
  // Every bucket is a fixed-size key immediately followed by its value.
  bucket_length_ = key_length_ + value_length_;

  auto num_buckets = user_props.find(CuckooTablePropertyNames::kMaxNumBuckets);
  if (num_buckets == user_props.end()) {
    status_ = Status::InvalidArgument("Num buckets not found");
    return;
  }
  num_buckets_ = *reinterpret_cast<const uint64_t*>(num_buckets->second.data());

  auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
  if (is_last_level == user_props.end()) {
    status_ = Status::InvalidArgument("Is last level not found");
    return;
  }
  is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());

  // No scratch buffer is supplied, so file_data_ refers to storage owned by
  // the file object.
  status_ = file_->Read(0, file_size, &file_data_, nullptr);
}
|
|
|
|
|
|
|
|
// Point lookup of `key` in the cuckoo table.
//
// The user key is hashed with each of the num_hash_fun_ hash functions in
// turn and the bucket it maps to is inspected. The probe stops at the first
// empty bucket (key not present) or at the first bucket whose key matches,
// in which case the entry is handed to `result_handler`.
//
// @param readOptions    Not used by this implementation.
// @param key            Internal key to look up.
// @param handle_context Opaque pointer passed through to `result_handler`.
// @param result_handler Called at most once, with the entry that was found.
//                       Its return value is ignored.
// @param mark_key_may_exist_handler Not used by this implementation.
// @return Corruption if `key`, or the matching key stored in the table,
//         cannot be parsed as an internal key; InvalidArgument if `key` has
//         the wrong length; OK otherwise, whether or not the key was found.
Status CuckooTableReader::Get(
    const ReadOptions& readOptions, const Slice& key, void* handle_context,
    bool (*result_handler)(void* arg, const ParsedInternalKey& k,
                           const Slice& v),
    void (*mark_key_may_exist_handler)(void* handle_context)) {
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    return Status::Corruption("Unable to parse key into inernal key.");
  }
  // In a last-level file the lookup key must be 8 bytes longer than the
  // stored key; otherwise both must have the same length.
  // NOTE(review): presumably last-level files store bare user keys and the
  // 8 bytes are the internal-key trailer - confirm against the builder.
  if ((is_last_level_ && key.size() != key_length_ + 8) ||
      (!is_last_level_ && key.size() != key_length_)) {
    return Status::InvalidArgument("Length of key is invalid.");
  }
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) {
    uint64_t hash_val = get_slice_hash_(ikey.user_key, hash_cnt, num_buckets_);
    assert(hash_val < num_buckets_);
    const char* bucket = &file_data_.data()[hash_val * bucket_length_];
    if (unused_key_.compare(0, key_length_, bucket, key_length_) == 0) {
      // Empty bucket: the key is not in this table.
      return Status::OK();
    }
    // Here, we compare only the user key part as we support only one entry
    // per user key and we don't support snapshot.
    if (ikey.user_key.compare(Slice(bucket, ikey.user_key.size())) == 0) {
      Slice value = Slice(&bucket[key_length_], value_length_);
      if (is_last_level_) {
        // The stored key is used as the user key, with sequence number 0
        // and type kTypeValue.
        ParsedInternalKey found_ikey(Slice(bucket, key_length_), 0, kTypeValue);
        result_handler(handle_context, found_ikey, value);
      } else {
        Slice full_key(bucket, key_length_);
        ParsedInternalKey found_ikey;
        // Do not hand an unparsed key to the handler if the stored bytes
        // are not a valid internal key.
        if (!ParseInternalKey(full_key, &found_ikey)) {
          return Status::Corruption(
              "Unable to parse key stored in table into internal key.");
        }
        result_handler(handle_context, found_ikey, value);
      }
      // We don't support merge operations. So, we return here.
      return Status::OK();
    }
  }
  return Status::OK();
}
|
|
|
|
|
Implement Prepare method in CuckooTableReader
Summary:
- Implement Prepare method
- Rewrite performance tests in cuckoo_table_reader_test to write new file only if one doesn't already exist.
- Add performance tests for batch lookup along with prefetching.
Test Plan:
./cuckoo_table_reader_test --enable_perf
Results (We get better results if we used int64 comparator instead of string comparator (TBD in future diffs)):
With 100000000 items and hash table ratio 0.500000, number of hash functions used: 2.
Time taken per op is 0.208us (4.8 Mqps) with batch size of 0
With 100000000 items and hash table ratio 0.500000, number of hash functions used: 2.
Time taken per op is 0.182us (5.5 Mqps) with batch size of 10
With 100000000 items and hash table ratio 0.500000, number of hash functions used: 2.
Time taken per op is 0.161us (6.2 Mqps) with batch size of 25
With 100000000 items and hash table ratio 0.500000, number of hash functions used: 2.
Time taken per op is 0.161us (6.2 Mqps) with batch size of 50
With 100000000 items and hash table ratio 0.500000, number of hash functions used: 2.
Time taken per op is 0.163us (6.1 Mqps) with batch size of 100
With 100000000 items and hash table ratio 0.600000, number of hash functions used: 3.
Time taken per op is 0.252us (4.0 Mqps) with batch size of 0
With 100000000 items and hash table ratio 0.600000, number of hash functions used: 3.
Time taken per op is 0.192us (5.2 Mqps) with batch size of 10
With 100000000 items and hash table ratio 0.600000, number of hash functions used: 3.
Time taken per op is 0.195us (5.1 Mqps) with batch size of 25
With 100000000 items and hash table ratio 0.600000, number of hash functions used: 3.
Time taken per op is 0.191us (5.2 Mqps) with batch size of 50
With 100000000 items and hash table ratio 0.600000, number of hash functions used: 3.
Time taken per op is 0.194us (5.1 Mqps) with batch size of 100
With 100000000 items and hash table ratio 0.750000, number of hash functions used: 3.
Time taken per op is 0.228us (4.4 Mqps) with batch size of 0
With 100000000 items and hash table ratio 0.750000, number of hash functions used: 3.
Time taken per op is 0.185us (5.4 Mqps) with batch size of 10
With 100000000 items and hash table ratio 0.750000, number of hash functions used: 3.
Time taken per op is 0.186us (5.4 Mqps) with batch size of 25
With 100000000 items and hash table ratio 0.750000, number of hash functions used: 3.
Time taken per op is 0.189us (5.3 Mqps) with batch size of 50
With 100000000 items and hash table ratio 0.750000, number of hash functions used: 3.
Time taken per op is 0.188us (5.3 Mqps) with batch size of 100
With 100000000 items and hash table ratio 0.900000, number of hash functions used: 3.
Time taken per op is 0.325us (3.1 Mqps) with batch size of 0
With 100000000 items and hash table ratio 0.900000, number of hash functions used: 3.
Time taken per op is 0.196us (5.1 Mqps) with batch size of 10
With 100000000 items and hash table ratio 0.900000, number of hash functions used: 3.
Time taken per op is 0.199us (5.0 Mqps) with batch size of 25
With 100000000 items and hash table ratio 0.900000, number of hash functions used: 3.
Time taken per op is 0.196us (5.1 Mqps) with batch size of 50
With 100000000 items and hash table ratio 0.900000, number of hash functions used: 3.
Time taken per op is 0.209us (4.8 Mqps) with batch size of 100
Reviewers: sdong, yhchiang, igor, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22167
2014-08-21 03:35:35 +02:00
|
|
|
void CuckooTableReader::Prepare(const Slice& key) {
|
|
|
|
// Prefetching first location also helps improve Get performance.
|
|
|
|
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_fun_; ++hash_cnt) {
|
|
|
|
uint64_t hash_val = get_slice_hash_(ExtractUserKey(key),
|
|
|
|
hash_cnt, num_buckets_);
|
|
|
|
PREFETCH(&file_data_.data()[hash_val * bucket_length_], 0, 3);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-06 01:35:02 +02:00
|
|
|
class CuckooTableIterator : public Iterator {
|
|
|
|
public:
|
|
|
|
explicit CuckooTableIterator(CuckooTableReader* reader);
|
|
|
|
~CuckooTableIterator() {}
|
|
|
|
bool Valid() const override;
|
|
|
|
void SeekToFirst() override;
|
|
|
|
void SeekToLast() override;
|
|
|
|
void Seek(const Slice& target) override;
|
|
|
|
void Next() override;
|
|
|
|
void Prev() override;
|
|
|
|
Slice key() const override;
|
|
|
|
Slice value() const override;
|
|
|
|
Status status() const override { return status_; }
|
|
|
|
void LoadKeysFromReader();
|
|
|
|
|
|
|
|
private:
|
|
|
|
struct {
|
|
|
|
bool operator()(const std::pair<Slice, uint32_t>& first,
|
|
|
|
const std::pair<Slice, uint32_t>& second) const {
|
|
|
|
return first.first.compare(second.first) < 0;
|
|
|
|
}
|
|
|
|
} CompareKeys;
|
|
|
|
void PrepareKVAtCurrIdx();
|
|
|
|
CuckooTableReader* reader_;
|
|
|
|
Status status_;
|
|
|
|
// Contains a map of keys to bucket_id sorted in key order.
|
|
|
|
// We assume byte-wise comparison for key ordering.
|
|
|
|
std::vector<std::pair<Slice, uint32_t>> key_to_bucket_id_;
|
|
|
|
// We assume that the number of items can be stored in uint32 (4 Billion).
|
|
|
|
uint32_t curr_key_idx_;
|
|
|
|
Slice curr_value_;
|
|
|
|
IterKey curr_key_;
|
|
|
|
// No copying allowed
|
|
|
|
CuckooTableIterator(const CuckooTableIterator&) = delete;
|
|
|
|
void operator=(const Iterator&) = delete;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Creates an iterator bound to `reader` that starts out not valid: it has
// no keys and no current entry until LoadKeysFromReader() and one of the
// Seek*() methods have been called.
//
// @param reader Table reader to iterate over; not owned.
CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
    : reader_(reader),
      // curr_key_idx_ is a uint32_t, so use the largest uint32_t as the
      // "no position" sentinel. The previous int32_t maximum would be a
      // legal index for a table with more than 2^31 entries.
      curr_key_idx_(std::numeric_limits<uint32_t>::max()) {
  key_to_bucket_id_.clear();
  curr_value_.clear();
  curr_key_.Clear();
}
|
|
|
|
|
|
|
|
void CuckooTableIterator::LoadKeysFromReader() {
|
|
|
|
key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries);
|
|
|
|
for (uint32_t bucket_id = 0; bucket_id < reader_->num_buckets_; bucket_id++) {
|
|
|
|
Slice read_key;
|
|
|
|
status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_,
|
|
|
|
reader_->key_length_, &read_key, nullptr);
|
|
|
|
if (read_key != Slice(reader_->unused_key_)) {
|
|
|
|
key_to_bucket_id_.push_back(std::make_pair(read_key, bucket_id));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(key_to_bucket_id_.size() ==
|
|
|
|
reader_->GetTableProperties()->num_entries);
|
|
|
|
std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), CompareKeys);
|
|
|
|
curr_key_idx_ = key_to_bucket_id_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Moves to the first entry of the sorted key index and loads it. If the
// index is empty the iterator ends up not valid.
void CuckooTableIterator::SeekToFirst() {
  curr_key_idx_ = 0U;
  PrepareKVAtCurrIdx();
}
|
|
|
|
|
|
|
|
// Moves to the last entry of the sorted key index and loads it.
void CuckooTableIterator::SeekToLast() {
  // With an empty index the subtraction wraps around; the resulting position
  // is not below the index size, so Valid() reports false.
  const auto last_idx = key_to_bucket_id_.size() - 1;
  curr_key_idx_ = last_idx;
  PrepareKVAtCurrIdx();
}
|
|
|
|
|
|
|
|
void CuckooTableIterator::Seek(const Slice& target) {
|
|
|
|
// We assume that the target is an internal key. If this is last level file,
|
|
|
|
// we need to take only the user key part to seek.
|
|
|
|
Slice target_to_search = reader_->is_last_level_ ?
|
|
|
|
ExtractUserKey(target) : target;
|
|
|
|
auto seek_it = std::lower_bound(key_to_bucket_id_.begin(),
|
|
|
|
key_to_bucket_id_.end(),
|
|
|
|
std::make_pair(target_to_search, 0),
|
|
|
|
CompareKeys);
|
|
|
|
curr_key_idx_ = std::distance(key_to_bucket_id_.begin(), seek_it);
|
|
|
|
PrepareKVAtCurrIdx();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool CuckooTableIterator::Valid() const {
|
|
|
|
return curr_key_idx_ < key_to_bucket_id_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
void CuckooTableIterator::PrepareKVAtCurrIdx() {
|
|
|
|
if (!Valid()) {
|
|
|
|
curr_value_.clear();
|
|
|
|
curr_key_.Clear();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
uint64_t offset = ((uint64_t) key_to_bucket_id_[curr_key_idx_].second
|
|
|
|
* reader_->bucket_length_) + reader_->key_length_;
|
|
|
|
status_ = reader_->file_->Read(offset, reader_->value_length_,
|
|
|
|
&curr_value_, nullptr);
|
|
|
|
if (reader_->is_last_level_) {
|
|
|
|
// Always return internal key.
|
|
|
|
curr_key_.SetInternalKey(
|
|
|
|
key_to_bucket_id_[curr_key_idx_].first, 0, kTypeValue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void CuckooTableIterator::Next() {
|
|
|
|
if (!Valid()) {
|
|
|
|
curr_value_.clear();
|
|
|
|
curr_key_.Clear();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
++curr_key_idx_;
|
|
|
|
PrepareKVAtCurrIdx();
|
|
|
|
}
|
|
|
|
|
|
|
|
void CuckooTableIterator::Prev() {
|
|
|
|
if (curr_key_idx_ == 0) {
|
|
|
|
curr_key_idx_ = key_to_bucket_id_.size();
|
|
|
|
}
|
|
|
|
if (!Valid()) {
|
|
|
|
curr_value_.clear();
|
|
|
|
curr_key_.Clear();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
--curr_key_idx_;
|
|
|
|
PrepareKVAtCurrIdx();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the key of the current entry. The iterator must be valid.
//
// For last-level files this is the internal key held in curr_key_; for all
// other files it is the key stored in the sorted index.
Slice CuckooTableIterator::key() const {
  assert(Valid());
  if (!reader_->is_last_level_) {
    return key_to_bucket_id_[curr_key_idx_].first;
  }
  return curr_key_.GetKey();
}
|
|
|
|
|
|
|
|
// Returns the value of the current entry, as loaded by the most recent
// positioning call. The iterator must be valid.
Slice CuckooTableIterator::value() const {
  assert(Valid());
  return curr_value_;
}
|
|
|
|
|
2014-08-26 01:14:30 +02:00
|
|
|
extern Iterator* NewErrorIterator(const Status& status, Arena* arena);
|
|
|
|
|
|
|
|
// Creates an iterator over this table in sorted key order.
//
// @param read_options Only total_order_seek is consulted; when it is set an
//                     error iterator is returned.
// @param arena        If non-null, the iterator is placement-constructed in
//                     memory obtained from the arena; otherwise it is
//                     allocated with new.
// @return An error iterator if the reader's status is not OK or
//         total_order_seek is requested; otherwise a CuckooTableIterator
//         whose key index has already been loaded.
Iterator* CuckooTableReader::NewIterator(
    const ReadOptions& read_options, Arena* arena) {
  if (!status().ok()) {
    return NewErrorIterator(
        Status::Corruption("CuckooTableReader status is not okay."), arena);
  }
  if (read_options.total_order_seek) {
    return NewErrorIterator(
        Status::InvalidArgument("total_order_seek is not supported."), arena);
  }
  CuckooTableIterator* iter = nullptr;
  if (arena != nullptr) {
    auto* iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
    iter = new (iter_mem) CuckooTableIterator(this);
  } else {
    iter = new CuckooTableIterator(this);
  }
  // Build the sorted key index up front.
  if (iter->status().ok()) {
    iter->LoadKeysFromReader();
  }
  return iter;
}
|
2014-08-05 20:27:34 +02:00
|
|
|
|
|
|
|
// Always reports 0; this reader does not account for any memory usage.
size_t CuckooTableReader::ApproximateMemoryUsage() const {
  return 0;
}
|
|
|
|
|
2014-07-26 01:37:32 +02:00
|
|
|
} // namespace rocksdb
|
|
|
|
#endif
|