Refactoring
Summary: This is the first split of https://github.com/facebook/rocksdb/pull/1891 and will be needed for the upcoming partitioned filter patch. Closes https://github.com/facebook/rocksdb/pull/1949 Differential Revision: D4652152 Pulled By: maysamyabandeh fbshipit-source-id: 9801778
This commit is contained in:
parent
2a5daa06f0
commit
a2f7a514d1
@ -336,10 +336,12 @@ set(SOURCES
|
||||
table/format.cc
|
||||
table/full_filter_block.cc
|
||||
table/get_context.cc
|
||||
table/index_builder.cc
|
||||
table/iterator.cc
|
||||
table/merging_iterator.cc
|
||||
table/sst_file_writer.cc
|
||||
table/meta_blocks.cc
|
||||
table/partitioned_filter_block.cc
|
||||
table/plain_table_builder.cc
|
||||
table/plain_table_factory.cc
|
||||
table/plain_table_index.cc
|
||||
|
2
src.mk
2
src.mk
@ -72,10 +72,12 @@ LIB_SOURCES = \
|
||||
table/format.cc \
|
||||
table/full_filter_block.cc \
|
||||
table/get_context.cc \
|
||||
table/index_builder.cc \
|
||||
table/iterator.cc \
|
||||
table/merging_iterator.cc \
|
||||
table/meta_blocks.cc \
|
||||
table/sst_file_writer.cc \
|
||||
table/partitioned_filter_block.cc \
|
||||
table/plain_table_builder.cc \
|
||||
table/plain_table_factory.cc \
|
||||
table/plain_table_index.cc \
|
||||
|
@ -31,14 +31,16 @@
|
||||
#include "rocksdb/table.h"
|
||||
|
||||
#include "table/block.h"
|
||||
#include "table/block_based_filter_block.h"
|
||||
#include "table/block_based_table_factory.h"
|
||||
#include "table/block_based_table_reader.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/block_based_filter_block.h"
|
||||
#include "table/block_based_table_factory.h"
|
||||
#include "table/full_filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/full_filter_block.h"
|
||||
#include "table/index_builder.h"
|
||||
#include "table/meta_blocks.h"
|
||||
#include "table/partitioned_filter_block.h"
|
||||
#include "table/table_builder.h"
|
||||
|
||||
#include "util/string_util.h"
|
||||
@ -54,384 +56,10 @@ extern const std::string kHashIndexPrefixesBlock;
|
||||
extern const std::string kHashIndexPrefixesMetadataBlock;
|
||||
|
||||
typedef BlockBasedTableOptions::IndexType IndexType;
|
||||
class IndexBuilder;
|
||||
|
||||
namespace {
|
||||
rocksdb::IndexBuilder* CreateIndexBuilder(
|
||||
IndexType index_type, const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor, int index_block_restart_interval,
|
||||
uint64_t index_per_partition);
|
||||
}
|
||||
|
||||
// The interface for building index.
|
||||
// Instruction for adding a new concrete IndexBuilder:
|
||||
// 1. Create a subclass instantiated from IndexBuilder.
|
||||
// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
|
||||
// 3. Add a create function for the new subclass in CreateIndexBuilder.
|
||||
// Note: we can devise more advanced design to simplify the process for adding
|
||||
// new subclass, which will, on the other hand, increase the code complexity and
|
||||
// catch unwanted attention from readers. Given that we won't add/change
|
||||
// indexes frequently, it makes sense to just embrace a more straightforward
|
||||
// design that just works.
|
||||
class IndexBuilder {
|
||||
public:
|
||||
// Index builder will construct a set of blocks which contain:
|
||||
// 1. One primary index block.
|
||||
// 2. (Optional) a set of metablocks that contains the metadata of the
|
||||
// primary index.
|
||||
struct IndexBlocks {
|
||||
Slice index_block_contents;
|
||||
std::unordered_map<std::string, Slice> meta_blocks;
|
||||
};
|
||||
explicit IndexBuilder(const InternalKeyComparator* comparator)
|
||||
: comparator_(comparator) {}
|
||||
|
||||
virtual ~IndexBuilder() {}
|
||||
|
||||
// Add a new index entry to index block.
|
||||
// To allow further optimization, we provide `last_key_in_current_block` and
|
||||
// `first_key_in_next_block`, based on which the specific implementation can
|
||||
// determine the best index key to be used for the index block.
|
||||
// @last_key_in_current_block: this parameter maybe overridden with the value
|
||||
// "substitute key".
|
||||
// @first_key_in_next_block: it will be nullptr if the entry being added is
|
||||
// the last one in the table
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) = 0;
|
||||
|
||||
// This method will be called whenever a key is added. The subclasses may
|
||||
// override OnKeyAdded() if they need to collect additional information.
|
||||
virtual void OnKeyAdded(const Slice& key) {}
|
||||
|
||||
// Inform the index builder that all entries has been written. Block builder
|
||||
// may therefore perform any operation required for block finalization.
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
inline Status Finish(IndexBlocks* index_blocks) {
|
||||
// Throw away the changes to last_partition_block_handle. It has no effect
|
||||
// on the first call to Finish anyway.
|
||||
BlockHandle last_partition_block_handle;
|
||||
return Finish(index_blocks, last_partition_block_handle);
|
||||
}
|
||||
|
||||
// This override of Finish can be utilized to build the 2nd level index in
|
||||
// PartitionIndexBuilder.
|
||||
//
|
||||
// index_blocks will be filled with the resulting index data. If the return
|
||||
// value is Status::InComplete() then it means that the index is partitioned
|
||||
// and the callee should keep calling Finish until Status::OK() is returned.
|
||||
// In that case, last_partition_block_handle is pointer to the block written
|
||||
// with the result of the last call to Finish. This can be utilized to build
|
||||
// the second level index pointing to each block of partitioned indexes. The
|
||||
// last call to Finish() that returns Status::OK() populates index_blocks with
|
||||
// the 2nd level index content.
|
||||
virtual Status Finish(IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) = 0;
|
||||
|
||||
// Get the estimated size for index block.
|
||||
virtual size_t EstimatedSize() const = 0;
|
||||
|
||||
protected:
|
||||
const InternalKeyComparator* comparator_;
|
||||
};
|
||||
|
||||
// This index builder builds space-efficient index block.
|
||||
//
|
||||
// Optimizations:
|
||||
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
|
||||
// search when doing index lookup (can be disabled by setting
|
||||
// index_block_restart_interval).
|
||||
// 2. Shorten the key length for index block. Other than honestly using the
|
||||
// last key in the data block as the index key, we instead find a shortest
|
||||
// substitute key that serves the same function.
|
||||
class ShortenedIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
|
||||
int index_block_restart_interval)
|
||||
: IndexBuilder(comparator),
|
||||
index_block_builder_(index_block_restart_interval) {}
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
if (first_key_in_next_block != nullptr) {
|
||||
comparator_->FindShortestSeparator(last_key_in_current_block,
|
||||
*first_key_in_next_block);
|
||||
} else {
|
||||
comparator_->FindShortSuccessor(last_key_in_current_block);
|
||||
}
|
||||
|
||||
std::string handle_encoding;
|
||||
block_handle.EncodeTo(&handle_encoding);
|
||||
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
|
||||
}
|
||||
|
||||
virtual Status Finish(
|
||||
IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) override {
|
||||
index_blocks->index_block_contents = index_block_builder_.Finish();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const override {
|
||||
return index_block_builder_.CurrentSizeEstimate();
|
||||
}
|
||||
|
||||
private:
|
||||
BlockBuilder index_block_builder_;
|
||||
};
|
||||
|
||||
/**
|
||||
* IndexBuilder for two-level indexing. Internally it creates a new index for
|
||||
* each partition and Finish then in order when Finish is called on it
|
||||
* continiously until Status::OK() is returned.
|
||||
*
|
||||
* The format on the disk would be I I I I I I IP where I is block containing a
|
||||
* partition of indexes built using ShortenedIndexBuilder and IP is a block
|
||||
* containing a secondary index on the partitions, built using
|
||||
* ShortenedIndexBuilder.
|
||||
*/
|
||||
class PartitionIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit PartitionIndexBuilder(const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor,
|
||||
const uint64_t index_per_partition,
|
||||
int index_block_restart_interval)
|
||||
: IndexBuilder(comparator),
|
||||
prefix_extractor_(prefix_extractor),
|
||||
index_block_builder_(index_block_restart_interval),
|
||||
index_per_partition_(index_per_partition),
|
||||
index_block_restart_interval_(index_block_restart_interval) {
|
||||
sub_index_builder_ =
|
||||
CreateIndexBuilder(sub_type_, comparator_, prefix_extractor_,
|
||||
index_block_restart_interval_, index_per_partition_);
|
||||
}
|
||||
|
||||
virtual ~PartitionIndexBuilder() { delete sub_index_builder_; }
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
|
||||
first_key_in_next_block, block_handle);
|
||||
num_indexes++;
|
||||
if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
|
||||
entries_.push_back({std::string(*last_key_in_current_block),
|
||||
std::unique_ptr<IndexBuilder>(sub_index_builder_)});
|
||||
sub_index_builder_ = nullptr;
|
||||
} else if (num_indexes % index_per_partition_ == 0) {
|
||||
entries_.push_back({std::string(*last_key_in_current_block),
|
||||
std::unique_ptr<IndexBuilder>(sub_index_builder_)});
|
||||
sub_index_builder_ = CreateIndexBuilder(
|
||||
sub_type_, comparator_, prefix_extractor_,
|
||||
index_block_restart_interval_, index_per_partition_);
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Finish(
|
||||
IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) override {
|
||||
assert(!entries_.empty());
|
||||
// It must be set to null after last key is added
|
||||
assert(sub_index_builder_ == nullptr);
|
||||
if (finishing == true) {
|
||||
Entry& last_entry = entries_.front();
|
||||
std::string handle_encoding;
|
||||
last_partition_block_handle.EncodeTo(&handle_encoding);
|
||||
index_block_builder_.Add(last_entry.key, handle_encoding);
|
||||
entries_.pop_front();
|
||||
}
|
||||
// If there is no sub_index left, then return the 2nd level index.
|
||||
if (UNLIKELY(entries_.empty())) {
|
||||
index_blocks->index_block_contents = index_block_builder_.Finish();
|
||||
return Status::OK();
|
||||
} else {
|
||||
// Finish the next partition index in line and Incomplete() to indicate we
|
||||
// expect more calls to Finish
|
||||
Entry& entry = entries_.front();
|
||||
auto s = entry.value->Finish(index_blocks);
|
||||
finishing = true;
|
||||
return s.ok() ? Status::Incomplete() : s;
|
||||
}
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const override {
|
||||
size_t total = 0;
|
||||
for (auto it = entries_.begin(); it != entries_.end(); ++it) {
|
||||
total += it->value->EstimatedSize();
|
||||
}
|
||||
total += index_block_builder_.CurrentSizeEstimate();
|
||||
total +=
|
||||
sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize();
|
||||
return total;
|
||||
}
|
||||
|
||||
private:
|
||||
static const IndexType sub_type_ = BlockBasedTableOptions::kBinarySearch;
|
||||
struct Entry {
|
||||
std::string key;
|
||||
std::unique_ptr<IndexBuilder> value;
|
||||
};
|
||||
std::list<Entry> entries_; // list of partitioned indexes and their keys
|
||||
const SliceTransform* prefix_extractor_;
|
||||
BlockBuilder index_block_builder_; // top-level index builder
|
||||
IndexBuilder* sub_index_builder_; // the active partition index builder
|
||||
uint64_t index_per_partition_;
|
||||
int index_block_restart_interval_;
|
||||
uint64_t num_indexes = 0;
|
||||
bool finishing =
|
||||
false; // true if Finish is called once but not complete yet.
|
||||
};
|
||||
|
||||
// HashIndexBuilder contains a binary-searchable primary index and the
|
||||
// metadata for secondary hash index construction.
|
||||
// The metadata for hash index consists two parts:
|
||||
// - a metablock that compactly contains a sequence of prefixes. All prefixes
|
||||
// are stored consecutively without any metadata (like, prefix sizes) being
|
||||
// stored, which is kept in the other metablock.
|
||||
// - a metablock contains the metadata of the prefixes, including prefix size,
|
||||
// restart index and number of block it spans. The format looks like:
|
||||
//
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix 1
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix 2
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// | |
|
||||
// | .... |
|
||||
// | |
|
||||
// +-----------------+---------------------------+---------------------+ <=prefix n
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
//
|
||||
// The reason for separating these two metablocks is to enable efficient
|
||||
// reuse of the first metablock during hash index construction without unnecessary
|
||||
// data copy or small heap allocations for prefixes.
|
||||
class HashIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit HashIndexBuilder(const InternalKeyComparator* comparator,
|
||||
const SliceTransform* hash_key_extractor,
|
||||
int index_block_restart_interval)
|
||||
: IndexBuilder(comparator),
|
||||
primary_index_builder_(comparator, index_block_restart_interval),
|
||||
hash_key_extractor_(hash_key_extractor) {}
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
++current_restart_index_;
|
||||
primary_index_builder_.AddIndexEntry(last_key_in_current_block,
|
||||
first_key_in_next_block, block_handle);
|
||||
}
|
||||
|
||||
virtual void OnKeyAdded(const Slice& key) override {
|
||||
auto key_prefix = hash_key_extractor_->Transform(key);
|
||||
bool is_first_entry = pending_block_num_ == 0;
|
||||
|
||||
// Keys may share the prefix
|
||||
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
|
||||
if (!is_first_entry) {
|
||||
FlushPendingPrefix();
|
||||
}
|
||||
|
||||
// need a hard copy otherwise the underlying data changes all the time.
|
||||
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
|
||||
// copy.
|
||||
pending_entry_prefix_ = key_prefix.ToString();
|
||||
pending_block_num_ = 1;
|
||||
pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
|
||||
} else {
|
||||
// entry number increments when keys share the prefix reside in
|
||||
// different data blocks.
|
||||
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
|
||||
assert(last_restart_index <= current_restart_index_);
|
||||
if (last_restart_index != current_restart_index_) {
|
||||
++pending_block_num_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Finish(
|
||||
IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) override {
|
||||
FlushPendingPrefix();
|
||||
primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const override {
|
||||
return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
|
||||
prefix_meta_block_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
void FlushPendingPrefix() {
|
||||
prefix_block_.append(pending_entry_prefix_.data(),
|
||||
pending_entry_prefix_.size());
|
||||
PutVarint32Varint32Varint32(
|
||||
&prefix_meta_block_,
|
||||
static_cast<uint32_t>(pending_entry_prefix_.size()),
|
||||
pending_entry_index_, pending_block_num_);
|
||||
}
|
||||
|
||||
ShortenedIndexBuilder primary_index_builder_;
|
||||
const SliceTransform* hash_key_extractor_;
|
||||
|
||||
// stores a sequence of prefixes
|
||||
std::string prefix_block_;
|
||||
// stores the metadata of prefixes
|
||||
std::string prefix_meta_block_;
|
||||
|
||||
// The following 3 variables keeps unflushed prefix and its metadata.
|
||||
// The details of block_num and entry_index can be found in
|
||||
// "block_hash_index.{h,cc}"
|
||||
uint32_t pending_block_num_ = 0;
|
||||
uint32_t pending_entry_index_ = 0;
|
||||
std::string pending_entry_prefix_;
|
||||
|
||||
uint64_t current_restart_index_ = 0;
|
||||
};
|
||||
|
||||
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
|
||||
namespace {
|
||||
|
||||
// Create a index builder based on its type.
|
||||
IndexBuilder* CreateIndexBuilder(IndexType index_type,
|
||||
const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor,
|
||||
int index_block_restart_interval,
|
||||
uint64_t index_per_partition) {
|
||||
switch (index_type) {
|
||||
case BlockBasedTableOptions::kBinarySearch: {
|
||||
return new ShortenedIndexBuilder(comparator,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
case BlockBasedTableOptions::kHashSearch: {
|
||||
return new HashIndexBuilder(comparator, prefix_extractor,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
|
||||
return new PartitionIndexBuilder(comparator, prefix_extractor,
|
||||
index_per_partition,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
default: {
|
||||
assert(!"Do not recognize the index type ");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
// impossible.
|
||||
assert(false);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Create a filter block builder from the given options.
|
||||
FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt,
|
||||
const BlockBasedTableOptions& table_opt) {
|
||||
@ -649,11 +277,11 @@ struct BlockBasedTableBuilder::Rep {
|
||||
table_options.use_delta_encoding),
|
||||
range_del_block(1), // TODO(andrewkr): restart_interval unnecessary
|
||||
internal_prefix_transform(_ioptions.prefix_extractor),
|
||||
index_builder(
|
||||
CreateIndexBuilder(table_options.index_type, &internal_comparator,
|
||||
&this->internal_prefix_transform,
|
||||
table_options.index_block_restart_interval,
|
||||
table_options.index_per_partition)),
|
||||
index_builder(IndexBuilder::CreateIndexBuilder(
|
||||
table_options.index_type, &internal_comparator,
|
||||
&this->internal_prefix_transform,
|
||||
table_options.index_block_restart_interval,
|
||||
table_options.index_per_partition)),
|
||||
compression_type(_compression_type),
|
||||
compression_opts(_compression_opts),
|
||||
compression_dict(_compression_dict),
|
||||
|
@ -57,6 +57,11 @@ using std::unique_ptr;
|
||||
|
||||
typedef BlockBasedTable::IndexReader IndexReader;
|
||||
|
||||
// Destructor: calls Close() first, then frees the Rep owned through rep_.
BlockBasedTable::~BlockBasedTable() {
  this->Close();
  delete rep_;
}
|
||||
|
||||
namespace {
|
||||
// Read the block identified by "handle" from "file".
|
||||
// The only relevant option is options.verify_checksums for now.
|
||||
@ -143,42 +148,6 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
|
||||
|
||||
} // namespace
|
||||
|
||||
// -- IndexReader and its subclasses
|
||||
// IndexReader is the interface that provide the functionality for index access.
|
||||
class BlockBasedTable::IndexReader {
|
||||
public:
|
||||
explicit IndexReader(const Comparator* comparator, Statistics* stats)
|
||||
: comparator_(comparator), statistics_(stats) {}
|
||||
|
||||
virtual ~IndexReader() {}
|
||||
|
||||
// Create an iterator for index access.
|
||||
// If iter is null then a new object is created on heap and the callee will
|
||||
// have the ownership. If a non-null iter is passed in it will be used, and
|
||||
// the returned value is either the same as iter or a new on-heap object that
|
||||
// wrapps the passed iter. In the latter case the return value would point to
|
||||
// a different object then iter and the callee has the ownership of the
|
||||
// returned object.
|
||||
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
|
||||
bool total_order_seek = true) = 0;
|
||||
|
||||
// The size of the index.
|
||||
virtual size_t size() const = 0;
|
||||
// Memory usage of the index block
|
||||
virtual size_t usable_size() const = 0;
|
||||
// return the statistics pointer
|
||||
virtual Statistics* statistics() const { return statistics_; }
|
||||
// Report an approximation of how much memory has been used other than memory
|
||||
// that was allocated in block cache.
|
||||
virtual size_t ApproximateMemoryUsage() const = 0;
|
||||
|
||||
protected:
|
||||
const Comparator* comparator_;
|
||||
|
||||
private:
|
||||
Statistics* statistics_;
|
||||
};
|
||||
|
||||
// Index that allows binary search lookup in a two-level index structure.
|
||||
class PartitionIndexReader : public IndexReader {
|
||||
public:
|
||||
@ -397,118 +366,6 @@ class HashIndexReader : public IndexReader {
|
||||
BlockContents prefixes_contents_;
|
||||
};
|
||||
|
||||
// CachableEntry represents the entries that *may* be fetched from block cache.
|
||||
// field `value` is the item we want to get.
|
||||
// field `cache_handle` is the cache handle to the block cache. If the value
|
||||
// was not read from cache, `cache_handle` will be nullptr.
|
||||
template <class TValue>
|
||||
struct BlockBasedTable::CachableEntry {
|
||||
CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
|
||||
: value(_value), cache_handle(_cache_handle) {}
|
||||
CachableEntry() : CachableEntry(nullptr, nullptr) {}
|
||||
void Release(Cache* cache) {
|
||||
if (cache_handle) {
|
||||
cache->Release(cache_handle);
|
||||
value = nullptr;
|
||||
cache_handle = nullptr;
|
||||
}
|
||||
}
|
||||
bool IsSet() const { return cache_handle != nullptr; }
|
||||
|
||||
TValue* value = nullptr;
|
||||
// if the entry is from the cache, cache_handle will be populated.
|
||||
Cache::Handle* cache_handle = nullptr;
|
||||
};
|
||||
|
||||
// State of a BlockBasedTable reader, held by the table through `rep_`.
struct BlockBasedTable::Rep {
  // Binds the option/comparator references and sets the filter defaults.
  // When `skip_filters` is true, filter_policy is left null regardless of what
  // the table options specify.
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        range_del_handle(BlockHandle::NullBlockHandle()),
        global_seqno(kDisableGlobalSequenceNumber) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions& table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
  // index_reader and filter will be populated and used only when
  // options.block_cache is nullptr; otherwise we will get the index block via
  // the block cache.
  unique_ptr<IndexReader> index_reader;
  unique_ptr<FilterBlockReader> filter;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;

  std::shared_ptr<const TableProperties> table_properties;
  // Block containing the data for the compression dictionary. We take ownership
  // for the entire block struct, even though we only use its Slice member. This
  // is easier because the Slice member depends on the continued existence of
  // another member ("allocation").
  std::unique_ptr<const BlockContents> compression_dict_block;
  // The constructor does not set the next two fields, so they carry explicit
  // defaults; previously they held indeterminate values until assigned.
  BlockBasedTableOptions::IndexType index_type =
      BlockBasedTableOptions::kBinarySearch;
  bool hash_index_allow_collision = false;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  unique_ptr<SliceTransform> internal_prefix_transform;

  // only used in level 0 files:
  // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
  // LRU cache, but we always keep the filter & index block's handle checked
  // out here (=we don't call Release()), plus the parsed out objects.
  // The LRU cache will never flush them out, hence they're pinned.
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  // range deletion meta-block is pinned through reader's lifetime when LRU
  // cache is enabled.
  CachableEntry<Block> range_del_entry;
  BlockHandle range_del_handle;

  // If global_seqno is used, all Keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is disabled
  // and every key has its own seqno.
  SequenceNumber global_seqno;
};
|
||||
|
||||
// Destructor: calls Close() first, then frees the Rep owned through rep_.
BlockBasedTable::~BlockBasedTable() {
  this->Close();
  delete rep_;
}
|
||||
|
||||
// Helper function to setup the cache key's prefix for the Table.
|
||||
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) {
|
||||
assert(kMaxCacheKeyPrefixSize >= 10);
|
||||
@ -850,7 +707,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
|
||||
|
||||
// Set filter block
|
||||
if (rep->filter_policy) {
|
||||
rep->filter.reset(ReadFilter(rep));
|
||||
rep->filter.reset(new_table->ReadFilter(rep));
|
||||
}
|
||||
} else {
|
||||
delete index_reader;
|
||||
@ -1087,7 +944,7 @@ Status BlockBasedTable::PutDataBlockToCache(
|
||||
return s;
|
||||
}
|
||||
|
||||
FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) {
|
||||
FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep) const {
|
||||
// TODO: We might want to unify with ReadBlockFromFile() if we start
|
||||
// requiring checksum verification in Table::Open.
|
||||
if (rep->filter_type == Rep::FilterType::kNoFilter) {
|
||||
|
@ -20,6 +20,9 @@
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/persistent_cache_helper.h"
|
||||
#include "table/table_properties_internal.h"
|
||||
#include "table/table_reader.h"
|
||||
#include "table/two_level_iterator.h"
|
||||
@ -139,8 +142,45 @@ class BlockBasedTable : public TableReader {
|
||||
|
||||
bool TEST_filter_block_preloaded() const;
|
||||
bool TEST_index_reader_preloaded() const;
|
||||
// Implementation of IndexReader will be exposed to internal cc file only.
|
||||
class IndexReader;
|
||||
|
||||
// IndexReader is the interface that provide the functionality for index
|
||||
// access.
|
||||
class IndexReader {
|
||||
public:
|
||||
explicit IndexReader(const Comparator* comparator, Statistics* stats)
|
||||
: comparator_(comparator), statistics_(stats) {}
|
||||
|
||||
virtual ~IndexReader() {}
|
||||
|
||||
// Create an iterator for index access.
|
||||
// If iter is null then a new object is created on heap and the callee will
|
||||
// have the ownership. If a non-null iter is passed in it will be used, and
|
||||
// the returned value is either the same as iter or a new on-heap object
|
||||
// that
|
||||
// wrapps the passed iter. In the latter case the return value would point
|
||||
// to
|
||||
// a different object then iter and the callee has the ownership of the
|
||||
// returned object.
|
||||
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
|
||||
bool total_order_seek = true) = 0;
|
||||
|
||||
// The size of the index.
|
||||
virtual size_t size() const = 0;
|
||||
// Memory usage of the index block
|
||||
virtual size_t usable_size() const = 0;
|
||||
// return the statistics pointer
|
||||
virtual Statistics* statistics() const { return statistics_; }
|
||||
// Report an approximation of how much memory has been used other than
|
||||
// memory
|
||||
// that was allocated in block cache.
|
||||
virtual size_t ApproximateMemoryUsage() const = 0;
|
||||
|
||||
protected:
|
||||
const Comparator* comparator_;
|
||||
|
||||
private:
|
||||
Statistics* statistics_;
|
||||
};
|
||||
|
||||
static Slice GetCacheKey(const char* cache_key_prefix,
|
||||
size_t cache_key_prefix_size,
|
||||
@ -155,7 +195,6 @@ class BlockBasedTable : public TableReader {
|
||||
private:
|
||||
template <class TValue>
|
||||
struct CachableEntry;
|
||||
|
||||
struct Rep;
|
||||
Rep* rep_;
|
||||
bool compaction_optimized_;
|
||||
@ -251,7 +290,7 @@ class BlockBasedTable : public TableReader {
|
||||
std::unique_ptr<InternalIterator>* iter);
|
||||
|
||||
// Create the filter from the filter block.
|
||||
static FilterBlockReader* ReadFilter(Rep* rep);
|
||||
FilterBlockReader* ReadFilter(Rep* rep) const;
|
||||
|
||||
static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size);
|
||||
|
||||
@ -290,4 +329,112 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
|
||||
bool skip_filters_;
|
||||
};
|
||||
|
||||
// CachableEntry represents the entries that *may* be fetched from block cache.
|
||||
// field `value` is the item we want to get.
|
||||
// field `cache_handle` is the cache handle to the block cache. If the value
|
||||
// was not read from cache, `cache_handle` will be nullptr.
|
||||
template <class TValue>
|
||||
struct BlockBasedTable::CachableEntry {
|
||||
CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
|
||||
: value(_value), cache_handle(_cache_handle) {}
|
||||
CachableEntry() : CachableEntry(nullptr, nullptr) {}
|
||||
void Release(Cache* cache) {
|
||||
if (cache_handle) {
|
||||
cache->Release(cache_handle);
|
||||
value = nullptr;
|
||||
cache_handle = nullptr;
|
||||
}
|
||||
}
|
||||
bool IsSet() const { return cache_handle != nullptr; }
|
||||
|
||||
TValue* value = nullptr;
|
||||
// if the entry is from the cache, cache_handle will be populated.
|
||||
Cache::Handle* cache_handle = nullptr;
|
||||
};
|
||||
|
||||
// Internal state of a BlockBasedTable reader.
struct BlockBasedTable::Rep {
  // Binds the option/comparator references and sets the scalar defaults.
  // When `skip_filters` is true the filter policy is forced to nullptr, so no
  // filter policy from `_table_opt` is retained.
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        range_del_handle(BlockHandle::NullBlockHandle()),
        global_seqno(kDisableGlobalSequenceNumber) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions& table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  // The prefix buffers below are only meaningful up to their matching
  // `*_size` field, which starts at 0.
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  uint64_t dummy_index_reader_offset =
      0;  // ID that is unique for the block cache.
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;
  // index_reader and filter will be populated and used only when
  // options.block_cache is nullptr; otherwise we will get the index block via
  // the block cache.
  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;

  std::shared_ptr<const TableProperties> table_properties;
  // Block containing the data for the compression dictionary. We take ownership
  // for the entire block struct, even though we only use its Slice member. This
  // is easier because the Slice member depends on the continued existence of
  // another member ("allocation").
  std::unique_ptr<const BlockContents> compression_dict_block;
  // The constructor does not set the next two fields, so they carry default
  // member initializers to guarantee they never hold an indeterminate value.
  // NOTE(review): both are expected to be overwritten by the table-open path
  // before use -- confirm against the reader implementation.
  BlockBasedTableOptions::IndexType index_type =
      BlockBasedTableOptions::kBinarySearch;
  bool hash_index_allow_collision = false;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  std::unique_ptr<SliceTransform> internal_prefix_transform;

  // only used in level 0 files:
  // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the
  // LRU cache, but we always keep the filter & index block's handle checked
  // out here (=we don't call Release()), plus the parsed out objects;
  // the LRU cache will never flush them out, hence they're pinned
  CachableEntry<FilterBlockReader> filter_entry;
  CachableEntry<IndexReader> index_entry;
  // range deletion meta-block is pinned through reader's lifetime when LRU
  // cache is enabled.
  CachableEntry<Block> range_del_entry;
  BlockHandle range_del_handle;

  // If global_seqno is used, all keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is disabled
  // and every key has its own seqno.
  SequenceNumber global_seqno;
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -15,8 +15,8 @@
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/table.h"
|
||||
|
||||
#include "port/port.h" // noexcept
|
||||
#include "table/persistent_cache_helper.h"
|
||||
#include "port/port.h" // noexcept
|
||||
#include "table/persistent_cache_options.h"
|
||||
#include "util/cf_options.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
52
table/index_builder.cc
Normal file
52
table/index_builder.cc
Normal file
@ -0,0 +1,52 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#include "table/index_builder.h"
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
#include <list>
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "table/format.h"
|
||||
#include "table/partitioned_filter_block.h"
|
||||
|
||||
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
|
||||
namespace rocksdb {
|
||||
// using namespace rocksdb;
|
||||
// Create a index builder based on its type.
|
||||
IndexBuilder* IndexBuilder::CreateIndexBuilder(
|
||||
BlockBasedTableOptions::IndexType index_type,
|
||||
const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor, int index_block_restart_interval,
|
||||
uint64_t index_per_partition) {
|
||||
switch (index_type) {
|
||||
case BlockBasedTableOptions::kBinarySearch: {
|
||||
return new ShortenedIndexBuilder(comparator,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
case BlockBasedTableOptions::kHashSearch: {
|
||||
return new HashIndexBuilder(comparator, prefix_extractor,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
|
||||
return new PartitionIndexBuilder(comparator, prefix_extractor,
|
||||
index_per_partition,
|
||||
index_block_restart_interval);
|
||||
}
|
||||
default: {
|
||||
assert(!"Do not recognize the index type ");
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
// impossible.
|
||||
assert(false);
|
||||
return nullptr;
|
||||
}
|
||||
} // namespace rocksdb
|
265
table/index_builder.h
Normal file
265
table/index_builder.h
Normal file
@ -0,0 +1,265 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
//
|
||||
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "rocksdb/comparator.h"
|
||||
#include "table/block_based_table_factory.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/format.h"
|
||||
|
||||
namespace rocksdb {
|
||||
// The interface for building index.
|
||||
// Instruction for adding a new concrete IndexBuilder:
|
||||
//  1. Create a subclass derived from IndexBuilder.
|
||||
//  2. Add a new entry associated with that subclass in
//     BlockBasedTableOptions::IndexType.
|
||||
// 3. Add a create function for the new subclass in CreateIndexBuilder.
|
||||
// Note: we can devise more advanced design to simplify the process for adding
|
||||
// new subclass, which will, on the other hand, increase the code complexity and
|
||||
// catch unwanted attention from readers. Given that we won't add/change
|
||||
// indexes frequently, it makes sense to just embrace a more straightforward
|
||||
// design that just works.
|
||||
class IndexBuilder {
|
||||
public:
|
||||
static IndexBuilder* CreateIndexBuilder(
|
||||
BlockBasedTableOptions::IndexType index_type,
|
||||
const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor, int index_block_restart_interval,
|
||||
uint64_t index_per_partition);
|
||||
|
||||
// Index builder will construct a set of blocks which contain:
|
||||
// 1. One primary index block.
|
||||
// 2. (Optional) a set of metablocks that contains the metadata of the
|
||||
// primary index.
|
||||
struct IndexBlocks {
|
||||
Slice index_block_contents;
|
||||
std::unordered_map<std::string, Slice> meta_blocks;
|
||||
};
|
||||
explicit IndexBuilder(const InternalKeyComparator* comparator)
|
||||
: comparator_(comparator) {}
|
||||
|
||||
virtual ~IndexBuilder() {}
|
||||
|
||||
// Add a new index entry to index block.
|
||||
// To allow further optimization, we provide `last_key_in_current_block` and
|
||||
// `first_key_in_next_block`, based on which the specific implementation can
|
||||
// determine the best index key to be used for the index block.
|
||||
// @last_key_in_current_block: this parameter maybe overridden with the value
|
||||
// "substitute key".
|
||||
// @first_key_in_next_block: it will be nullptr if the entry being added is
|
||||
// the last one in the table
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) = 0;
|
||||
|
||||
// This method will be called whenever a key is added. The subclasses may
|
||||
// override OnKeyAdded() if they need to collect additional information.
|
||||
virtual void OnKeyAdded(const Slice& key) {}
|
||||
|
||||
// Inform the index builder that all entries has been written. Block builder
|
||||
// may therefore perform any operation required for block finalization.
|
||||
//
|
||||
// REQUIRES: Finish() has not yet been called.
|
||||
inline Status Finish(IndexBlocks* index_blocks) {
|
||||
// Throw away the changes to last_partition_block_handle. It has no effect
|
||||
// on the first call to Finish anyway.
|
||||
BlockHandle last_partition_block_handle;
|
||||
return Finish(index_blocks, last_partition_block_handle);
|
||||
}
|
||||
|
||||
// This override of Finish can be utilized to build the 2nd level index in
|
||||
// PartitionIndexBuilder.
|
||||
//
|
||||
// index_blocks will be filled with the resulting index data. If the return
|
||||
// value is Status::InComplete() then it means that the index is partitioned
|
||||
// and the callee should keep calling Finish until Status::OK() is returned.
|
||||
// In that case, last_partition_block_handle is pointer to the block written
|
||||
// with the result of the last call to Finish. This can be utilized to build
|
||||
// the second level index pointing to each block of partitioned indexes. The
|
||||
// last call to Finish() that returns Status::OK() populates index_blocks with
|
||||
// the 2nd level index content.
|
||||
virtual Status Finish(IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) = 0;
|
||||
|
||||
// Get the estimated size for index block.
|
||||
virtual size_t EstimatedSize() const = 0;
|
||||
|
||||
protected:
|
||||
const InternalKeyComparator* comparator_;
|
||||
};
|
||||
|
||||
// This index builder builds space-efficient index block.
|
||||
//
|
||||
// Optimizations:
|
||||
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
|
||||
// search when doing index lookup (can be disabled by setting
|
||||
// index_block_restart_interval).
|
||||
// 2. Shorten the key length for index block. Other than honestly using the
|
||||
// last key in the data block as the index key, we instead find a shortest
|
||||
// substitute key that serves the same function.
|
||||
class ShortenedIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
|
||||
int index_block_restart_interval)
|
||||
: IndexBuilder(comparator),
|
||||
index_block_builder_(index_block_restart_interval) {}
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
if (first_key_in_next_block != nullptr) {
|
||||
comparator_->FindShortestSeparator(last_key_in_current_block,
|
||||
*first_key_in_next_block);
|
||||
} else {
|
||||
comparator_->FindShortSuccessor(last_key_in_current_block);
|
||||
}
|
||||
|
||||
std::string handle_encoding;
|
||||
block_handle.EncodeTo(&handle_encoding);
|
||||
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
|
||||
}
|
||||
|
||||
virtual Status Finish(
|
||||
IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) override {
|
||||
index_blocks->index_block_contents = index_block_builder_.Finish();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const override {
|
||||
return index_block_builder_.CurrentSizeEstimate();
|
||||
}
|
||||
|
||||
private:
|
||||
BlockBuilder index_block_builder_;
|
||||
};
|
||||
|
||||
// HashIndexBuilder contains a binary-searchable primary index and the
|
||||
// metadata for secondary hash index construction.
|
||||
// The metadata for hash index consists of two parts:
|
||||
// - a metablock that compactly contains a sequence of prefixes. All prefixes
|
||||
//    are stored consecutively without any metadata (like, prefix sizes) being
|
||||
// stored, which is kept in the other metablock.
|
||||
// - a metablock contains the metadata of the prefixes, including prefix size,
|
||||
// restart index and number of block it spans. The format looks like:
|
||||
//
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// <=prefix 1
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// <=prefix 2
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// | |
|
||||
// | .... |
|
||||
// | |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
// <=prefix n
|
||||
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
|
||||
// +-----------------+---------------------------+---------------------+
|
||||
//
|
||||
// The reason for separating these two metablocks is to efficiently
|
||||
// reuse the first metablock during hash index construction without unnecessary
|
||||
// data copy or small heap allocations for prefixes.
|
||||
class HashIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit HashIndexBuilder(const InternalKeyComparator* comparator,
|
||||
const SliceTransform* hash_key_extractor,
|
||||
int index_block_restart_interval)
|
||||
: IndexBuilder(comparator),
|
||||
primary_index_builder_(comparator, index_block_restart_interval),
|
||||
hash_key_extractor_(hash_key_extractor) {}
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle) override {
|
||||
++current_restart_index_;
|
||||
primary_index_builder_.AddIndexEntry(last_key_in_current_block,
|
||||
first_key_in_next_block, block_handle);
|
||||
}
|
||||
|
||||
virtual void OnKeyAdded(const Slice& key) override {
|
||||
auto key_prefix = hash_key_extractor_->Transform(key);
|
||||
bool is_first_entry = pending_block_num_ == 0;
|
||||
|
||||
// Keys may share the prefix
|
||||
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
|
||||
if (!is_first_entry) {
|
||||
FlushPendingPrefix();
|
||||
}
|
||||
|
||||
// need a hard copy otherwise the underlying data changes all the time.
|
||||
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
|
||||
// copy.
|
||||
pending_entry_prefix_ = key_prefix.ToString();
|
||||
pending_block_num_ = 1;
|
||||
pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
|
||||
} else {
|
||||
// entry number increments when keys share the prefix reside in
|
||||
// different data blocks.
|
||||
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
|
||||
assert(last_restart_index <= current_restart_index_);
|
||||
if (last_restart_index != current_restart_index_) {
|
||||
++pending_block_num_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status Finish(
|
||||
IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle) override {
|
||||
FlushPendingPrefix();
|
||||
primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
|
||||
index_blocks->meta_blocks.insert(
|
||||
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual size_t EstimatedSize() const override {
|
||||
return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
|
||||
prefix_meta_block_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
void FlushPendingPrefix() {
|
||||
prefix_block_.append(pending_entry_prefix_.data(),
|
||||
pending_entry_prefix_.size());
|
||||
PutVarint32Varint32Varint32(
|
||||
&prefix_meta_block_,
|
||||
static_cast<uint32_t>(pending_entry_prefix_.size()),
|
||||
pending_entry_index_, pending_block_num_);
|
||||
}
|
||||
|
||||
ShortenedIndexBuilder primary_index_builder_;
|
||||
const SliceTransform* hash_key_extractor_;
|
||||
|
||||
// stores a sequence of prefixes
|
||||
std::string prefix_block_;
|
||||
// stores the metadata of prefixes
|
||||
std::string prefix_meta_block_;
|
||||
|
||||
// The following 3 variables keeps unflushed prefix and its metadata.
|
||||
// The details of block_num and entry_index can be found in
|
||||
// "block_hash_index.{h,cc}"
|
||||
uint32_t pending_block_num_ = 0;
|
||||
uint32_t pending_entry_index_ = 0;
|
||||
std::string pending_entry_prefix_;
|
||||
|
||||
uint64_t current_restart_index_ = 0;
|
||||
};
|
||||
} // namespace rocksdb
|
84
table/partitioned_filter_block.cc
Normal file
84
table/partitioned_filter_block.cc
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/partitioned_filter_block.h"
|
||||
|
||||
#include "port/port.h"
|
||||
#include "util/coding.h"
|
||||
|
||||
namespace rocksdb {
|
||||
// Stores the configuration and creates the builder for the first partition.
PartitionIndexBuilder::PartitionIndexBuilder(
    const InternalKeyComparator* comparator,
    const SliceTransform* prefix_extractor, const uint64_t index_per_partition,
    int index_block_restart_interval)
    : IndexBuilder(comparator),
      prefix_extractor_(prefix_extractor),
      index_block_builder_(index_block_restart_interval),
      // Built from the constructor arguments rather than from members, so it
      // does not depend on members that are initialized after it.
      sub_index_builder_(CreateIndexBuilder(
          sub_type_, comparator, prefix_extractor,
          index_block_restart_interval, index_per_partition)),
      index_per_partition_(index_per_partition),
      index_block_restart_interval_(index_block_restart_interval) {}
|
||||
|
||||
// Frees the partition builder that is still active, if there is one.
PartitionIndexBuilder::~PartitionIndexBuilder() {
  delete sub_index_builder_;
}
|
||||
|
||||
void PartitionIndexBuilder::AddIndexEntry(
|
||||
std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block, const BlockHandle& block_handle) {
|
||||
sub_index_builder_->AddIndexEntry(last_key_in_current_block,
|
||||
first_key_in_next_block, block_handle);
|
||||
num_indexes++;
|
||||
if (UNLIKELY(first_key_in_next_block == nullptr)) { // no more keys
|
||||
entries_.push_back({std::string(*last_key_in_current_block),
|
||||
std::unique_ptr<IndexBuilder>(sub_index_builder_)});
|
||||
sub_index_builder_ = nullptr;
|
||||
} else if (num_indexes % index_per_partition_ == 0) {
|
||||
entries_.push_back({std::string(*last_key_in_current_block),
|
||||
std::unique_ptr<IndexBuilder>(sub_index_builder_)});
|
||||
sub_index_builder_ =
|
||||
CreateIndexBuilder(sub_type_, comparator_, prefix_extractor_,
|
||||
index_block_restart_interval_, index_per_partition_);
|
||||
}
|
||||
}
|
||||
|
||||
// Emits the queued partitions one call at a time, then the top-level index.
//
// Each call except the first records `last_partition_block_handle` in the
// top-level index under the key of the partition returned by the previous
// call, and drops that partition from the queue. If partitions remain, the
// next one is finished into `index_blocks` and Status::Incomplete() is
// returned (or the partition's own status if it failed). Once the queue is
// empty, `index_blocks` receives the top-level index and Status::OK() is
// returned.
Status PartitionIndexBuilder::Finish(
    IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
  assert(!entries_.empty());
  // The active builder is reset to null once the last key has been added.
  assert(sub_index_builder_ == nullptr);

  if (finishing) {
    std::string encoded_handle;
    last_partition_block_handle.EncodeTo(&encoded_handle);
    index_block_builder_.Add(entries_.front().key, encoded_handle);
    entries_.pop_front();
  }

  // No partition left: hand out the 2nd level index.
  if (UNLIKELY(entries_.empty())) {
    index_blocks->index_block_contents = index_block_builder_.Finish();
    return Status::OK();
  }

  // Finish the next partition in line; more calls to Finish are expected.
  Status partition_status = entries_.front().value->Finish(index_blocks);
  finishing = true;
  if (!partition_status.ok()) {
    return partition_status;
  }
  return Status::Incomplete();
}
|
||||
|
||||
// Sum of the estimates of all queued partitions, the top-level index block
// and, when present, the partition that is still being filled.
size_t PartitionIndexBuilder::EstimatedSize() const {
  size_t estimate = 0;
  for (const auto& entry : entries_) {
    estimate += entry.value->EstimatedSize();
  }
  estimate += index_block_builder_.CurrentSizeEstimate();
  if (sub_index_builder_ != nullptr) {
    estimate += sub_index_builder_->EstimatedSize();
  }
  return estimate;
}
|
||||
|
||||
} // namespace rocksdb
|
65
table/partitioned_filter_block.h
Normal file
65
table/partitioned_filter_block.h
Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include "rocksdb/options.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "util/hash.h"
|
||||
|
||||
#include "table/index_builder.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
/**
|
||||
* IndexBuilder for two-level indexing. Internally it creates a new index for
|
||||
* each partition and Finish then in order when Finish is called on it
|
||||
* continiously until Status::OK() is returned.
|
||||
*
|
||||
* The format on the disk would be I I I I I I IP where I is block containing a
|
||||
* partition of indexes built using ShortenedIndexBuilder and IP is a block
|
||||
* containing a secondary index on the partitions, built using
|
||||
* ShortenedIndexBuilder.
|
||||
*/
|
||||
class PartitionIndexBuilder : public IndexBuilder {
|
||||
public:
|
||||
explicit PartitionIndexBuilder(const InternalKeyComparator* comparator,
|
||||
const SliceTransform* prefix_extractor,
|
||||
const uint64_t index_per_partition,
|
||||
int index_block_restart_interval);
|
||||
|
||||
virtual ~PartitionIndexBuilder();
|
||||
|
||||
virtual void AddIndexEntry(std::string* last_key_in_current_block,
|
||||
const Slice* first_key_in_next_block,
|
||||
const BlockHandle& block_handle);
|
||||
|
||||
virtual Status Finish(IndexBlocks* index_blocks,
|
||||
const BlockHandle& last_partition_block_handle);
|
||||
|
||||
virtual size_t EstimatedSize() const;
|
||||
|
||||
private:
|
||||
static const BlockBasedTableOptions::IndexType sub_type_ =
|
||||
BlockBasedTableOptions::kBinarySearch;
|
||||
struct Entry {
|
||||
std::string key;
|
||||
std::unique_ptr<IndexBuilder> value;
|
||||
};
|
||||
std::list<Entry> entries_; // list of partitioned indexes and their keys
|
||||
const SliceTransform* prefix_extractor_;
|
||||
BlockBuilder index_block_builder_; // top-level index builder
|
||||
IndexBuilder* sub_index_builder_; // the active partition index builder
|
||||
uint64_t index_per_partition_;
|
||||
int index_block_restart_interval_;
|
||||
uint64_t num_indexes = 0;
|
||||
bool finishing =
|
||||
false; // true if Finish is called once but not complete yet.
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
@ -2,7 +2,9 @@
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/persistent_cache_helper.h"
|
||||
#include "table/block_based_table_reader.h"
|
||||
#include "table/format.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
@ -6,33 +6,14 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "table/block_based_table_reader.h"
|
||||
#include "table/format.h"
|
||||
#include "table/persistent_cache_options.h"
|
||||
#include "util/statistics.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
struct BlockContents;
|
||||
|
||||
// PersistentCacheOptions
|
||||
//
|
||||
// This describe the caching behavior for page cache
|
||||
// This is used to pass the context for caching and the cache handle
|
||||
struct PersistentCacheOptions {
|
||||
PersistentCacheOptions() {}
|
||||
explicit PersistentCacheOptions(
|
||||
const std::shared_ptr<PersistentCache>& _persistent_cache,
|
||||
const std::string _key_prefix, Statistics* const _statistics)
|
||||
: persistent_cache(_persistent_cache),
|
||||
key_prefix(_key_prefix),
|
||||
statistics(_statistics) {}
|
||||
|
||||
virtual ~PersistentCacheOptions() {}
|
||||
|
||||
std::shared_ptr<PersistentCache> persistent_cache;
|
||||
std::string key_prefix;
|
||||
Statistics* statistics = nullptr;
|
||||
};
|
||||
|
||||
// PersistentCacheHelper
|
||||
//
|
||||
// Encapsulates some of the helper logic for read and writing from the cache
|
||||
|
34
table/persistent_cache_options.h
Normal file
34
table/persistent_cache_options.h
Normal file
@ -0,0 +1,34 @@
|
||||
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "include/rocksdb/persistent_cache.h"
|
||||
#include "util/statistics.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// PersistentCacheOptions
|
||||
//
|
||||
// This describe the caching behavior for page cache
|
||||
// This is used to pass the context for caching and the cache handle
|
||||
struct PersistentCacheOptions {
|
||||
PersistentCacheOptions() {}
|
||||
explicit PersistentCacheOptions(
|
||||
const std::shared_ptr<PersistentCache>& _persistent_cache,
|
||||
const std::string _key_prefix, Statistics* const _statistics)
|
||||
: persistent_cache(_persistent_cache),
|
||||
key_prefix(_key_prefix),
|
||||
statistics(_statistics) {}
|
||||
|
||||
virtual ~PersistentCacheOptions() {}
|
||||
|
||||
std::shared_ptr<PersistentCache> persistent_cache;
|
||||
std::string key_prefix;
|
||||
Statistics* statistics = nullptr;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
Loading…
Reference in New Issue
Block a user