e45673dece
Summary: Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype. Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling. It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. 
The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas. Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621 Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats. Reviewed By: siying Differential Revision: D20786930 Pulled By: al13n321 fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
403 lines
14 KiB
C++
403 lines
14 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
#include "table/cuckoo/cuckoo_table_reader.h"
|
|
|
|
#include <algorithm>
#include <cstring>
#include <limits>
#include <string>
#include <utility>
#include <vector>

#include "memory/arena.h"
#include "rocksdb/iterator.h"
#include "rocksdb/table.h"
#include "table/cuckoo/cuckoo_table_factory.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace {
|
|
const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
|
|
const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
|
|
}
|
|
|
|
extern const uint64_t kCuckooTableMagicNumber;
|
|
|
|
CuckooTableReader::CuckooTableReader(
|
|
const ImmutableCFOptions& ioptions,
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
const Comparator* comparator,
|
|
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
|
|
: file_(std::move(file)),
|
|
is_last_level_(false),
|
|
identity_as_first_hash_(false),
|
|
use_module_hash_(false),
|
|
num_hash_func_(0),
|
|
unused_key_(""),
|
|
key_length_(0),
|
|
user_key_length_(0),
|
|
value_length_(0),
|
|
bucket_length_(0),
|
|
cuckoo_block_size_(0),
|
|
cuckoo_block_bytes_minus_one_(0),
|
|
table_size_(0),
|
|
ucomp_(comparator),
|
|
get_slice_hash_(get_slice_hash) {
|
|
if (!ioptions.allow_mmap_reads) {
|
|
status_ = Status::InvalidArgument("File is not mmaped");
|
|
return;
|
|
}
|
|
TableProperties* props = nullptr;
|
|
status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
|
|
ioptions, &props, true /* compression_type_missing */);
|
|
if (!status_.ok()) {
|
|
return;
|
|
}
|
|
table_props_.reset(props);
|
|
auto& user_props = props->user_collected_properties;
|
|
auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
|
|
if (hash_funs == user_props.end()) {
|
|
status_ = Status::Corruption("Number of hash functions not found");
|
|
return;
|
|
}
|
|
num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
|
|
auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
|
|
if (unused_key == user_props.end()) {
|
|
status_ = Status::Corruption("Empty bucket value not found");
|
|
return;
|
|
}
|
|
unused_key_ = unused_key->second;
|
|
|
|
key_length_ = static_cast<uint32_t>(props->fixed_key_len);
|
|
auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
|
|
if (user_key_len == user_props.end()) {
|
|
status_ = Status::Corruption("User key length not found");
|
|
return;
|
|
}
|
|
user_key_length_ = *reinterpret_cast<const uint32_t*>(
|
|
user_key_len->second.data());
|
|
|
|
auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
|
|
if (value_length == user_props.end()) {
|
|
status_ = Status::Corruption("Value length not found");
|
|
return;
|
|
}
|
|
value_length_ = *reinterpret_cast<const uint32_t*>(
|
|
value_length->second.data());
|
|
bucket_length_ = key_length_ + value_length_;
|
|
|
|
auto hash_table_size = user_props.find(
|
|
CuckooTablePropertyNames::kHashTableSize);
|
|
if (hash_table_size == user_props.end()) {
|
|
status_ = Status::Corruption("Hash table size not found");
|
|
return;
|
|
}
|
|
table_size_ = *reinterpret_cast<const uint64_t*>(
|
|
hash_table_size->second.data());
|
|
|
|
auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
|
|
if (is_last_level == user_props.end()) {
|
|
status_ = Status::Corruption("Is last level not found");
|
|
return;
|
|
}
|
|
is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
|
|
|
|
auto identity_as_first_hash = user_props.find(
|
|
CuckooTablePropertyNames::kIdentityAsFirstHash);
|
|
if (identity_as_first_hash == user_props.end()) {
|
|
status_ = Status::Corruption("identity as first hash not found");
|
|
return;
|
|
}
|
|
identity_as_first_hash_ = *reinterpret_cast<const bool*>(
|
|
identity_as_first_hash->second.data());
|
|
|
|
auto use_module_hash = user_props.find(
|
|
CuckooTablePropertyNames::kUseModuleHash);
|
|
if (use_module_hash == user_props.end()) {
|
|
status_ = Status::Corruption("hash type is not found");
|
|
return;
|
|
}
|
|
use_module_hash_ = *reinterpret_cast<const bool*>(
|
|
use_module_hash->second.data());
|
|
auto cuckoo_block_size = user_props.find(
|
|
CuckooTablePropertyNames::kCuckooBlockSize);
|
|
if (cuckoo_block_size == user_props.end()) {
|
|
status_ = Status::Corruption("Cuckoo block size not found");
|
|
return;
|
|
}
|
|
cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
|
|
cuckoo_block_size->second.data());
|
|
cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
|
|
status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr,
|
|
nullptr);
|
|
}
|
|
|
|
// Point lookup of `key` (an internal key) in the Cuckoo table.
//
// Probes one Cuckoo block per hash function. The first bucket whose user key
// matches is handed to `get_context`; an empty bucket ends the search early.
// Returns OK whether or not the key was found, and Corruption if a matching
// bucket holds an internal key that cannot be parsed.
Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
                              const Slice& key, GetContext* get_context,
                              const SliceTransform* /* prefix_extractor */,
                              bool /*skip_filters*/) {
  assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
  Slice user_key = ExtractUserKey(key);
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t offset =
        bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
                                    table_size_, identity_as_first_hash_,
                                    get_slice_hash_);
    const char* bucket = &file_data_.data()[offset];
    for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
         ++block_idx, bucket += bucket_length_) {
      // Reaching an empty bucket ends the lookup: the key is reported as
      // absent by returning without saving a value.
      if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
                        Slice(bucket, user_key.size()))) {
        return Status::OK();
      }
      // Here, we compare only the user key part as we support only one entry
      // per user key and we don't support snapshot.
      if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
        Slice value(bucket + key_length_, value_length_);
        if (is_last_level_) {
          // Sequence number is not stored at the last level, so we will use
          // kMaxSequenceNumber since it is unknown. This could cause some
          // transactions to fail to lock a key due to unknown sequence number.
          // However, it is not expected for anyone to use a CuckooTable in a
          // TransactionDB.
          get_context->SaveValue(value, kMaxSequenceNumber);
        } else {
          Slice full_key(bucket, key_length_);
          ParsedInternalKey found_ikey;
          // Do not hand an unparsed (uninitialized) key to the GetContext.
          if (!ParseInternalKey(full_key, &found_ikey)) {
            return Status::Corruption(
                "Unable to parse internal key in Cuckoo table");
          }
          bool dont_care __attribute__((__unused__));
          get_context->SaveValue(found_ikey, value, &dont_care);
        }
        // We don't support merge operations. So, we return here.
        return Status::OK();
      }
    }
  }
  return Status::OK();
}
|
|
|
|
void CuckooTableReader::Prepare(const Slice& key) {
|
|
// Prefetch the first Cuckoo Block.
|
|
Slice user_key = ExtractUserKey(key);
|
|
uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
|
|
bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
|
|
identity_as_first_hash_, nullptr);
|
|
uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
|
|
for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
|
|
PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
|
|
}
|
|
}
|
|
|
|
class CuckooTableIterator : public InternalIterator {
|
|
public:
|
|
explicit CuckooTableIterator(CuckooTableReader* reader);
|
|
// No copying allowed
|
|
CuckooTableIterator(const CuckooTableIterator&) = delete;
|
|
void operator=(const Iterator&) = delete;
|
|
~CuckooTableIterator() override {}
|
|
bool Valid() const override;
|
|
void SeekToFirst() override;
|
|
void SeekToLast() override;
|
|
void Seek(const Slice& target) override;
|
|
void SeekForPrev(const Slice& target) override;
|
|
void Next() override;
|
|
void Prev() override;
|
|
Slice key() const override;
|
|
Slice value() const override;
|
|
Status status() const override { return Status::OK(); }
|
|
void InitIfNeeded();
|
|
|
|
private:
|
|
struct BucketComparator {
|
|
BucketComparator(const Slice& file_data, const Comparator* ucomp,
|
|
uint32_t bucket_len, uint32_t user_key_len,
|
|
const Slice& target = Slice())
|
|
: file_data_(file_data),
|
|
ucomp_(ucomp),
|
|
bucket_len_(bucket_len),
|
|
user_key_len_(user_key_len),
|
|
target_(target) {}
|
|
bool operator()(const uint32_t first, const uint32_t second) const {
|
|
const char* first_bucket =
|
|
(first == kInvalidIndex) ? target_.data() :
|
|
&file_data_.data()[first * bucket_len_];
|
|
const char* second_bucket =
|
|
(second == kInvalidIndex) ? target_.data() :
|
|
&file_data_.data()[second * bucket_len_];
|
|
return ucomp_->Compare(Slice(first_bucket, user_key_len_),
|
|
Slice(second_bucket, user_key_len_)) < 0;
|
|
}
|
|
private:
|
|
const Slice file_data_;
|
|
const Comparator* ucomp_;
|
|
const uint32_t bucket_len_;
|
|
const uint32_t user_key_len_;
|
|
const Slice target_;
|
|
};
|
|
|
|
const BucketComparator bucket_comparator_;
|
|
void PrepareKVAtCurrIdx();
|
|
CuckooTableReader* reader_;
|
|
bool initialized_;
|
|
// Contains a map of keys to bucket_id sorted in key order.
|
|
std::vector<uint32_t> sorted_bucket_ids_;
|
|
// We assume that the number of items can be stored in uint32 (4 Billion).
|
|
uint32_t curr_key_idx_;
|
|
Slice curr_value_;
|
|
IterKey curr_key_;
|
|
};
|
|
|
|
// Creates an unpositioned iterator over `reader`. The sorted bucket list is
// not built here; that is deferred to InitIfNeeded().
CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
    : bucket_comparator_(reader->file_data_, reader->ucomp_,
                         reader->bucket_length_, reader->user_key_length_),
      reader_(reader),
      initialized_(false),
      curr_key_idx_(kInvalidIndex) {
  // Start from an explicitly empty state.
  curr_key_.Clear();
  curr_value_.clear();
  sorted_bucket_ids_.clear();
}
|
|
|
|
void CuckooTableIterator::InitIfNeeded() {
|
|
if (initialized_) {
|
|
return;
|
|
}
|
|
sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
|
|
uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
|
|
assert(num_buckets < kInvalidIndex);
|
|
const char* bucket = reader_->file_data_.data();
|
|
for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
|
|
if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
|
|
sorted_bucket_ids_.push_back(bucket_id);
|
|
}
|
|
bucket += reader_->bucket_length_;
|
|
}
|
|
assert(sorted_bucket_ids_.size() ==
|
|
reader_->GetTableProperties()->num_entries);
|
|
std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
|
|
bucket_comparator_);
|
|
curr_key_idx_ = kInvalidIndex;
|
|
initialized_ = true;
|
|
}
|
|
|
|
// Positions the iterator on the first entry of the sorted bucket list
// (invalid if the list is empty).
void CuckooTableIterator::SeekToFirst() {
  InitIfNeeded();
  const uint32_t first_idx = 0;
  curr_key_idx_ = first_idx;
  PrepareKVAtCurrIdx();
}
|
|
|
|
// Positions the iterator on the last entry of the sorted bucket list. With an
// empty list the unsigned subtraction wraps, leaving the iterator invalid.
void CuckooTableIterator::SeekToLast() {
  InitIfNeeded();
  const uint32_t entry_count =
      static_cast<uint32_t>(sorted_bucket_ids_.size());
  curr_key_idx_ = entry_count - 1;
  PrepareKVAtCurrIdx();
}
|
|
|
|
void CuckooTableIterator::Seek(const Slice& target) {
|
|
InitIfNeeded();
|
|
const BucketComparator seek_comparator(
|
|
reader_->file_data_, reader_->ucomp_,
|
|
reader_->bucket_length_, reader_->user_key_length_,
|
|
ExtractUserKey(target));
|
|
auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
|
|
sorted_bucket_ids_.end(),
|
|
kInvalidIndex,
|
|
seek_comparator);
|
|
curr_key_idx_ =
|
|
static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
|
|
PrepareKVAtCurrIdx();
|
|
}
|
|
|
|
// SeekForPrev is not supported by this iterator; calling it trips an
// assertion in builds with assertions enabled.
void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
  assert(false);
}
|
|
|
|
bool CuckooTableIterator::Valid() const {
|
|
return curr_key_idx_ < sorted_bucket_ids_.size();
|
|
}
|
|
|
|
void CuckooTableIterator::PrepareKVAtCurrIdx() {
|
|
if (!Valid()) {
|
|
curr_value_.clear();
|
|
curr_key_.Clear();
|
|
return;
|
|
}
|
|
uint32_t id = sorted_bucket_ids_[curr_key_idx_];
|
|
const char* offset = reader_->file_data_.data() +
|
|
id * reader_->bucket_length_;
|
|
if (reader_->is_last_level_) {
|
|
// Always return internal key.
|
|
curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
|
|
0, kTypeValue);
|
|
} else {
|
|
curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
|
|
}
|
|
curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
|
|
}
|
|
|
|
void CuckooTableIterator::Next() {
|
|
if (!Valid()) {
|
|
curr_value_.clear();
|
|
curr_key_.Clear();
|
|
return;
|
|
}
|
|
++curr_key_idx_;
|
|
PrepareKVAtCurrIdx();
|
|
}
|
|
|
|
void CuckooTableIterator::Prev() {
|
|
if (curr_key_idx_ == 0) {
|
|
curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
|
|
}
|
|
if (!Valid()) {
|
|
curr_value_.clear();
|
|
curr_key_.Clear();
|
|
return;
|
|
}
|
|
--curr_key_idx_;
|
|
PrepareKVAtCurrIdx();
|
|
}
|
|
|
|
// Returns the internal key cached for the current position.
// Must only be called while the iterator is valid.
Slice CuckooTableIterator::key() const {
  assert(Valid());
  const Slice internal_key = curr_key_.GetInternalKey();
  return internal_key;
}
|
|
|
|
// Returns the value cached for the current position.
// Must only be called while the iterator is valid.
Slice CuckooTableIterator::value() const {
  assert(Valid());
  const Slice& cached_value = curr_value_;
  return cached_value;
}
|
|
|
|
// Creates an iterator over this table.
//
// If the reader is in an error state, an error iterator carrying a Corruption
// status is returned instead. When `arena` is non-null the iterator is
// constructed inside arena-allocated memory; otherwise it is heap-allocated.
InternalIterator* CuckooTableReader::NewIterator(
    const ReadOptions& /*read_options*/,
    const SliceTransform* /* prefix_extractor */, Arena* arena,
    bool /*skip_filters*/, TableReaderCaller /*caller*/,
    size_t /*compaction_readahead_size*/,
    bool /* allow_unprepared_value */) {
  if (!status().ok()) {
    return NewErrorInternalIterator<Slice>(
        Status::Corruption("CuckooTableReader status is not okay."), arena);
  }
  if (arena != nullptr) {
    // Placement-construct into storage owned by the arena.
    auto storage = arena->AllocateAligned(sizeof(CuckooTableIterator));
    return new (storage) CuckooTableIterator(this);
  }
  return new CuckooTableIterator(this);
}
|
|
|
|
// Always reports zero bytes of memory usage for this reader.
size_t CuckooTableReader::ApproximateMemoryUsage() const {
  const size_t usage = 0;
  return usage;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
#endif
|