Make the block-based table's index pluggable
Summary: This patch introduces a new table option that allows us to make the block-based table's index pluggable. To support that new feature: * Code has been refactored to be more flexible and to support this option well. * More documentation is added for the existing obscure functionalities. * Major surgery on DataBlockReader(), where the logic was really convoluted. * Other small code cleanups. The pluggability will mostly affect development of internal modules and won't change frequently; as a result I intentionally avoid heavy-weight patterns (like factory) and try to keep it simple. Test Plan: make all check Reviewers: haobo, sdong Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D16395
This commit is contained in:
parent
bf86af5174
commit
74939a9e13
@ -54,6 +54,21 @@ struct BlockBasedTableOptions {
|
|||||||
// If not specified, each "table reader" object will pre-load index/filter
|
// If not specified, each "table reader" object will pre-load index/filter
|
||||||
// block during table initialization.
|
// block during table initialization.
|
||||||
bool cache_index_and_filter_blocks = false;
|
bool cache_index_and_filter_blocks = false;
|
||||||
|
|
||||||
|
// The index type that will be used for this table.
|
||||||
|
enum IndexType : char {
|
||||||
|
// A space efficient index block that is optimized for
|
||||||
|
// binary-search-based index.
|
||||||
|
kBinarySearch,
|
||||||
|
};
|
||||||
|
|
||||||
|
IndexType index_type = kBinarySearch;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Table properties that are specific to block-based tables.
|
||||||
|
struct BlockBasedTablePropertyNames {
|
||||||
|
// The value of this property is a fixed int32 number.
|
||||||
|
static const std::string kIndexType;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create default block based table factory.
|
// Create default block based table factory.
|
||||||
|
@ -11,23 +11,29 @@
|
|||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <map>
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "rocksdb/flush_block_policy.h"
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "db/dbformat.h"
|
||||||
|
|
||||||
#include "rocksdb/cache.h"
|
#include "rocksdb/cache.h"
|
||||||
#include "rocksdb/comparator.h"
|
#include "rocksdb/comparator.h"
|
||||||
#include "table/table_builder.h"
|
|
||||||
#include "rocksdb/env.h"
|
#include "rocksdb/env.h"
|
||||||
#include "rocksdb/filter_policy.h"
|
#include "rocksdb/filter_policy.h"
|
||||||
|
#include "rocksdb/flush_block_policy.h"
|
||||||
#include "rocksdb/options.h"
|
#include "rocksdb/options.h"
|
||||||
#include "db/dbformat.h"
|
#include "rocksdb/table.h"
|
||||||
#include "table/block_based_table_reader.h"
|
|
||||||
#include "table/block.h"
|
#include "table/block.h"
|
||||||
|
#include "table/block_based_table_reader.h"
|
||||||
#include "table/block_builder.h"
|
#include "table/block_builder.h"
|
||||||
#include "table/filter_block.h"
|
#include "table/filter_block.h"
|
||||||
#include "table/format.h"
|
#include "table/format.h"
|
||||||
#include "table/meta_blocks.h"
|
#include "table/meta_blocks.h"
|
||||||
|
#include "table/table_builder.h"
|
||||||
|
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/crc32c.h"
|
#include "util/crc32c.h"
|
||||||
#include "util/stop_watch.h"
|
#include "util/stop_watch.h"
|
||||||
@ -36,11 +42,167 @@ namespace rocksdb {
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
|
typedef BlockBasedTableOptions::IndexType IndexType;
|
||||||
|
|
||||||
|
// The interface for building index.
|
||||||
|
// Instruction for adding a new concrete IndexBuilder:
|
||||||
|
// 1. Create a subclass instantiated from IndexBuilder.
|
||||||
|
// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
|
||||||
|
// 3. Add a create function for the new subclass in CreateIndexBuilder.
|
||||||
|
// Note: we can devise more advanced design to simplify the process for adding
|
||||||
|
// new subclass, which will, on the other hand, increase the code complexity and
|
||||||
|
// catch unwanted attention from readers. Given that we won't add/change
|
||||||
|
// indexes frequently, it makes sense to just embrace a more straightforward
|
||||||
|
// design that just works.
|
||||||
|
class IndexBuilder {
|
||||||
|
public:
|
||||||
|
explicit IndexBuilder(const Comparator* comparator)
|
||||||
|
: comparator_(comparator) {}
|
||||||
|
|
||||||
|
virtual ~IndexBuilder() {}
|
||||||
|
|
||||||
|
// Add a new index entry to index block.
|
||||||
|
// To allow further optimization, we provide `last_key_in_current_block` and
|
||||||
|
// `first_key_in_next_block`, based on which the specific implementation can
|
||||||
|
// determine the best index key to be used for the index block.
|
||||||
|
// @last_key_in_current_block: this parameter maybe overridden with the value
|
||||||
|
// "substitute key".
|
||||||
|
// @first_key_in_next_block: it will be nullptr if the entry being added is
|
||||||
|
// the last one in the table
|
||||||
|
//
|
||||||
|
// REQUIRES: Finish() has not yet been called.
|
||||||
|
virtual void AddEntry(std::string* last_key_in_current_block,
|
||||||
|
const Slice* first_key_in_next_block,
|
||||||
|
const BlockHandle& block_handle) = 0;
|
||||||
|
|
||||||
|
// Inform the index builder that all entries has been written. Block builder
|
||||||
|
// may therefore perform any operation required for block finalization.
|
||||||
|
//
|
||||||
|
// REQUIRES: Finish() has not yet been called.
|
||||||
|
virtual Slice Finish() = 0;
|
||||||
|
|
||||||
|
// Get the estimated size for index block.
|
||||||
|
virtual size_t EstimatedSize() const = 0;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
const Comparator* comparator_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// This index builder builds space-efficient index block for binary-search-based
|
||||||
|
// index.
|
||||||
|
//
|
||||||
|
// Optimizations:
|
||||||
|
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
|
||||||
|
// search when doing index lookup.
|
||||||
|
// 2. Shorten the key length for index block. Other than honestly using the
|
||||||
|
// last key in the data block as the index key, we instead find a shortest
|
||||||
|
// substitute key that serves the same function.
|
||||||
|
class BinarySearchIndexBuilder : public IndexBuilder {
|
||||||
|
public:
|
||||||
|
explicit BinarySearchIndexBuilder(const Comparator* comparator)
|
||||||
|
: IndexBuilder(comparator),
|
||||||
|
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
|
||||||
|
|
||||||
|
virtual void AddEntry(std::string* last_key_in_current_block,
|
||||||
|
const Slice* first_key_in_next_block,
|
||||||
|
const BlockHandle& block_handle) override {
|
||||||
|
if (first_key_in_next_block != nullptr) {
|
||||||
|
comparator_->FindShortestSeparator(last_key_in_current_block,
|
||||||
|
*first_key_in_next_block);
|
||||||
|
} else {
|
||||||
|
comparator_->FindShortSuccessor(last_key_in_current_block);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string handle_encoding;
|
||||||
|
block_handle.EncodeTo(&handle_encoding);
|
||||||
|
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual Slice Finish() override { return index_block_builder_.Finish(); }
|
||||||
|
|
||||||
|
virtual size_t EstimatedSize() const {
|
||||||
|
return index_block_builder_.CurrentSizeEstimate();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
BlockBuilder index_block_builder_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create a index builder based on its type.
|
||||||
|
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
|
||||||
|
switch (type) {
|
||||||
|
case BlockBasedTableOptions::kBinarySearch: {
|
||||||
|
return new BinarySearchIndexBuilder(comparator);
|
||||||
|
}
|
||||||
|
default: {
|
||||||
|
assert(!"Do not recognize the index type ");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// impossible.
|
||||||
|
assert(false);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
|
||||||
// Check to see if compressed less than 12.5%
|
// Check to see if compressed less than 12.5%
|
||||||
return compressed_size < raw_size - (raw_size / 8u);
|
return compressed_size < raw_size - (raw_size / 8u);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compresses `raw` with the method requested in `*type`.
//
// The compressed contents are returned only if (1) the compression method is
// supported on this platform and (2) the compression ratio is "good enough".
// In every other case `*type` is set to kNoCompression and `raw` is returned
// unchanged.
//
// @raw:                 the uncompressed block contents.
// @compression_options: forwarded to the platform compression routine.
// @type:                in: requested compression type;
//                       out: compression type of the returned contents.
// @compressed_output:   buffer that receives the compressed bytes; the
//                       returned Slice is built from it on success.
Slice CompressBlock(const Slice& raw,
                    const CompressionOptions& compression_options,
                    CompressionType* type, std::string* compressed_output) {
  if (*type == kNoCompression) {
    return raw;
  }

  // Run the requested compressor and remember whether it succeeded.
  bool compressed = false;
  switch (*type) {
    case kSnappyCompression:
      compressed = port::Snappy_Compress(compression_options, raw.data(),
                                         raw.size(), compressed_output);
      break;
    case kZlibCompression:
      compressed = port::Zlib_Compress(compression_options, raw.data(),
                                       raw.size(), compressed_output);
      break;
    case kBZip2Compression:
      compressed = port::BZip2_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    case kLZ4Compression:
      compressed = port::LZ4_Compress(compression_options, raw.data(),
                                      raw.size(), compressed_output);
      break;
    case kLZ4HCCompression:
      compressed = port::LZ4HC_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    default:
      break;  // Do not recognize this compression type
  }

  // The ratio is only examined when compression actually succeeded.
  if (compressed &&
      GoodCompressionRatio(compressed_output->size(), raw.size())) {
    return *compressed_output;
  }

  // Compression method is not supported, or not good compression ratio, so
  // just fall back to uncompressed form.
  *type = kNoCompression;
  return raw;
}
|
||||||
|
|
||||||
} // anonymous namespace
|
} // anonymous namespace
|
||||||
|
|
||||||
// kBlockBasedTableMagicNumber was picked by running
|
// kBlockBasedTableMagicNumber was picked by running
|
||||||
@ -51,6 +213,46 @@ static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
|
|||||||
extern const uint64_t kBlockBasedTableMagicNumber
|
extern const uint64_t kBlockBasedTableMagicNumber
|
||||||
= 0xdb4775248b80fb57ull;
|
= 0xdb4775248b80fb57ull;
|
||||||
|
|
||||||
|
// A collector that collects properties of interest to block-based table.
|
||||||
|
// For now this class looks heavy-weight since we only write one additional
|
||||||
|
// property.
|
||||||
|
// But in the forseeable future, we will add more and more properties that are
|
||||||
|
// specific to block-based table.
|
||||||
|
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
||||||
|
: public TablePropertiesCollector {
|
||||||
|
public:
|
||||||
|
BlockBasedTablePropertiesCollector(
|
||||||
|
BlockBasedTableOptions::IndexType index_type)
|
||||||
|
: index_type_(index_type) {}
|
||||||
|
|
||||||
|
virtual Status Add(const Slice& key, const Slice& value) {
|
||||||
|
// Intentionally left blank. Have no interest in collecting stats for
|
||||||
|
// individual key/value pairs.
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual Status Finish(UserCollectedProperties* properties) {
|
||||||
|
std::string val;
|
||||||
|
PutFixed32(&val, static_cast<uint32_t>(index_type_));
|
||||||
|
properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
// The name of the properties collector can be used for debugging purpose.
|
||||||
|
virtual const char* Name() const {
|
||||||
|
return "BlockBasedTablePropertiesCollector";
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual UserCollectedProperties GetReadableProperties() const {
|
||||||
|
// Intentionally left blank.
|
||||||
|
return UserCollectedProperties();
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
BlockBasedTableOptions::IndexType index_type_;
|
||||||
|
};
|
||||||
|
|
||||||
struct BlockBasedTableBuilder::Rep {
|
struct BlockBasedTableBuilder::Rep {
|
||||||
Options options;
|
Options options;
|
||||||
const InternalKeyComparator& internal_comparator;
|
const InternalKeyComparator& internal_comparator;
|
||||||
@ -58,7 +260,8 @@ struct BlockBasedTableBuilder::Rep {
|
|||||||
uint64_t offset = 0;
|
uint64_t offset = 0;
|
||||||
Status status;
|
Status status;
|
||||||
BlockBuilder data_block;
|
BlockBuilder data_block;
|
||||||
BlockBuilder index_block;
|
std::unique_ptr<IndexBuilder> index_builder;
|
||||||
|
|
||||||
std::string last_key;
|
std::string last_key;
|
||||||
CompressionType compression_type;
|
CompressionType compression_type;
|
||||||
TableProperties props;
|
TableProperties props;
|
||||||
@ -75,28 +278,31 @@ struct BlockBasedTableBuilder::Rep {
|
|||||||
|
|
||||||
Rep(const Options& opt, const InternalKeyComparator& icomparator,
|
Rep(const Options& opt, const InternalKeyComparator& icomparator,
|
||||||
WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
|
WritableFile* f, FlushBlockPolicyFactory* flush_block_policy_factory,
|
||||||
CompressionType compression_type)
|
CompressionType compression_type, IndexType index_block_type)
|
||||||
: options(opt),
|
: options(opt),
|
||||||
internal_comparator(icomparator),
|
internal_comparator(icomparator),
|
||||||
file(f),
|
file(f),
|
||||||
data_block(options, &internal_comparator),
|
data_block(options, &internal_comparator),
|
||||||
// To avoid linear scan, we make the block_restart_interval to be `1`
|
index_builder(
|
||||||
// in index block builder
|
CreateIndexBuilder(index_block_type, &internal_comparator)),
|
||||||
index_block(1 /* block_restart_interval */, &internal_comparator),
|
|
||||||
compression_type(compression_type),
|
compression_type(compression_type),
|
||||||
filter_block(opt.filter_policy == nullptr
|
filter_block(opt.filter_policy == nullptr
|
||||||
? nullptr
|
? nullptr
|
||||||
: new FilterBlockBuilder(opt, &internal_comparator)),
|
: new FilterBlockBuilder(opt, &internal_comparator)),
|
||||||
flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy(
|
flush_block_policy(flush_block_policy_factory->NewFlushBlockPolicy(
|
||||||
options, data_block)) {}
|
options, data_block)) {
|
||||||
|
options.table_properties_collectors.push_back(
|
||||||
|
std::make_shared<BlockBasedTablePropertiesCollector>(index_block_type));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
BlockBasedTableBuilder::BlockBasedTableBuilder(
|
BlockBasedTableBuilder::BlockBasedTableBuilder(
|
||||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
const Options& options, const BlockBasedTableOptions& table_options,
|
||||||
WritableFile* file, FlushBlockPolicyFactory* flush_block_policy_factory,
|
const InternalKeyComparator& internal_comparator, WritableFile* file,
|
||||||
CompressionType compression_type)
|
CompressionType compression_type)
|
||||||
: rep_(new Rep(options, internal_comparator, file,
|
: rep_(new Rep(options, internal_comparator, file,
|
||||||
flush_block_policy_factory, compression_type)) {
|
table_options.flush_block_policy_factory.get(),
|
||||||
|
compression_type, table_options.index_type)) {
|
||||||
if (rep_->filter_block != nullptr) {
|
if (rep_->filter_block != nullptr) {
|
||||||
rep_->filter_block->StartBlock(0);
|
rep_->filter_block->StartBlock(0);
|
||||||
}
|
}
|
||||||
@ -136,10 +342,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
|||||||
// entries in the first block and < all entries in subsequent
|
// entries in the first block and < all entries in subsequent
|
||||||
// blocks.
|
// blocks.
|
||||||
if (ok()) {
|
if (ok()) {
|
||||||
r->internal_comparator.FindShortestSeparator(&r->last_key, key);
|
r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
|
||||||
std::string handle_encoding;
|
|
||||||
r->pending_handle.EncodeTo(&handle_encoding);
|
|
||||||
r->index_block.Add(r->last_key, Slice(handle_encoding));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -179,88 +382,25 @@ void BlockBasedTableBuilder::Flush() {
|
|||||||
|
|
||||||
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
|
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
|
||||||
BlockHandle* handle) {
|
BlockHandle* handle) {
|
||||||
|
WriteBlock(block->Finish(), handle);
|
||||||
|
block->Reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
|
||||||
|
BlockHandle* handle) {
|
||||||
// File format contains a sequence of blocks where each block has:
|
// File format contains a sequence of blocks where each block has:
|
||||||
// block_data: uint8[n]
|
// block_data: uint8[n]
|
||||||
// type: uint8
|
// type: uint8
|
||||||
// crc: uint32
|
// crc: uint32
|
||||||
assert(ok());
|
assert(ok());
|
||||||
Rep* r = rep_;
|
Rep* r = rep_;
|
||||||
Slice raw = block->Finish();
|
|
||||||
|
|
||||||
Slice block_contents;
|
auto type = r->compression_type;
|
||||||
std::string* compressed = &r->compressed_output;
|
auto block_contents =
|
||||||
CompressionType type = r->compression_type;
|
CompressBlock(raw_block_contents, r->options.compression_opts, &type,
|
||||||
switch (type) {
|
&r->compressed_output);
|
||||||
case kNoCompression:
|
|
||||||
block_contents = raw;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kSnappyCompression: {
|
|
||||||
std::string* compressed = &r->compressed_output;
|
|
||||||
if (port::Snappy_Compress(r->options.compression_opts, raw.data(),
|
|
||||||
raw.size(), compressed) &&
|
|
||||||
GoodCompressionRatio(compressed->size(), raw.size())) {
|
|
||||||
block_contents = *compressed;
|
|
||||||
} else {
|
|
||||||
// Snappy not supported, or not good compression ratio, so just
|
|
||||||
// store uncompressed form
|
|
||||||
block_contents = raw;
|
|
||||||
type = kNoCompression;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case kZlibCompression:
|
|
||||||
if (port::Zlib_Compress(r->options.compression_opts, raw.data(),
|
|
||||||
raw.size(), compressed) &&
|
|
||||||
GoodCompressionRatio(compressed->size(), raw.size())) {
|
|
||||||
block_contents = *compressed;
|
|
||||||
} else {
|
|
||||||
// Zlib not supported, or not good compression ratio, so just
|
|
||||||
// store uncompressed form
|
|
||||||
block_contents = raw;
|
|
||||||
type = kNoCompression;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case kBZip2Compression:
|
|
||||||
if (port::BZip2_Compress(r->options.compression_opts, raw.data(),
|
|
||||||
raw.size(), compressed) &&
|
|
||||||
GoodCompressionRatio(compressed->size(), raw.size())) {
|
|
||||||
block_contents = *compressed;
|
|
||||||
} else {
|
|
||||||
// BZip not supported, or not good compression ratio, so just
|
|
||||||
// store uncompressed form
|
|
||||||
block_contents = raw;
|
|
||||||
type = kNoCompression;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case kLZ4Compression:
|
|
||||||
if (port::LZ4_Compress(r->options.compression_opts, raw.data(),
|
|
||||||
raw.size(), compressed) &&
|
|
||||||
GoodCompressionRatio(compressed->size(), raw.size())) {
|
|
||||||
block_contents = *compressed;
|
|
||||||
} else {
|
|
||||||
// LZ4 not supported, or not good compression ratio, so just
|
|
||||||
// store uncompressed form
|
|
||||||
block_contents = raw;
|
|
||||||
type = kNoCompression;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case kLZ4HCCompression:
|
|
||||||
if (port::LZ4HC_Compress(r->options.compression_opts, raw.data(),
|
|
||||||
raw.size(), compressed) &&
|
|
||||||
GoodCompressionRatio(compressed->size(), raw.size())) {
|
|
||||||
block_contents = *compressed;
|
|
||||||
} else {
|
|
||||||
// LZ4 not supported, or not good compression ratio, so just
|
|
||||||
// store uncompressed form
|
|
||||||
block_contents = raw;
|
|
||||||
type = kNoCompression;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
WriteRawBlock(block_contents, type, handle);
|
WriteRawBlock(block_contents, type, handle);
|
||||||
r->compressed_output.clear();
|
r->compressed_output.clear();
|
||||||
block->Reset();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
|
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
|
||||||
@ -364,11 +504,8 @@ Status BlockBasedTableBuilder::Finish() {
|
|||||||
// block, we will finish writing all index entries here and flush them
|
// block, we will finish writing all index entries here and flush them
|
||||||
// to storage after metaindex block is written.
|
// to storage after metaindex block is written.
|
||||||
if (ok() && !empty_data_block) {
|
if (ok() && !empty_data_block) {
|
||||||
r->internal_comparator.FindShortSuccessor(&r->last_key);
|
r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
|
||||||
|
r->pending_handle);
|
||||||
std::string handle_encoding;
|
|
||||||
r->pending_handle.EncodeTo(&handle_encoding);
|
|
||||||
r->index_block.Add(r->last_key, handle_encoding);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write meta blocks and metaindex block with the following order.
|
// Write meta blocks and metaindex block with the following order.
|
||||||
@ -394,11 +531,12 @@ Status BlockBasedTableBuilder::Finish() {
|
|||||||
r->props.filter_policy_name = r->options.filter_policy != nullptr ?
|
r->props.filter_policy_name = r->options.filter_policy != nullptr ?
|
||||||
r->options.filter_policy->Name() : "";
|
r->options.filter_policy->Name() : "";
|
||||||
r->props.index_size =
|
r->props.index_size =
|
||||||
r->index_block.CurrentSizeEstimate() + kBlockTrailerSize;
|
r->index_builder->EstimatedSize() + kBlockTrailerSize;
|
||||||
|
|
||||||
// Add basic properties
|
// Add basic properties
|
||||||
property_block_builder.AddTableProperty(r->props);
|
property_block_builder.AddTableProperty(r->props);
|
||||||
|
|
||||||
|
// Add user collected properties
|
||||||
NotifyCollectTableCollectorsOnFinish(
|
NotifyCollectTableCollectorsOnFinish(
|
||||||
r->options.table_properties_collectors,
|
r->options.table_properties_collectors,
|
||||||
r->options.info_log.get(),
|
r->options.info_log.get(),
|
||||||
@ -425,7 +563,7 @@ Status BlockBasedTableBuilder::Finish() {
|
|||||||
|
|
||||||
// Write index block
|
// Write index block
|
||||||
if (ok()) {
|
if (ok()) {
|
||||||
WriteBlock(&r->index_block, &index_block_handle);
|
WriteBlock(r->index_builder->Finish(), &index_block_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write footer
|
// Write footer
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "rocksdb/flush_block_policy.h"
|
#include "rocksdb/flush_block_policy.h"
|
||||||
#include "rocksdb/options.h"
|
#include "rocksdb/options.h"
|
||||||
#include "rocksdb/status.h"
|
#include "rocksdb/status.h"
|
||||||
@ -19,6 +20,7 @@ namespace rocksdb {
|
|||||||
class BlockBuilder;
|
class BlockBuilder;
|
||||||
class BlockHandle;
|
class BlockHandle;
|
||||||
class WritableFile;
|
class WritableFile;
|
||||||
|
struct BlockBasedTableOptions;
|
||||||
|
|
||||||
class BlockBasedTableBuilder : public TableBuilder {
|
class BlockBasedTableBuilder : public TableBuilder {
|
||||||
public:
|
public:
|
||||||
@ -26,10 +28,9 @@ class BlockBasedTableBuilder : public TableBuilder {
|
|||||||
// building in *file. Does not close the file. It is up to the
|
// building in *file. Does not close the file. It is up to the
|
||||||
// caller to close the file after calling Finish().
|
// caller to close the file after calling Finish().
|
||||||
BlockBasedTableBuilder(const Options& options,
|
BlockBasedTableBuilder(const Options& options,
|
||||||
|
const BlockBasedTableOptions& table_options,
|
||||||
const InternalKeyComparator& internal_comparator,
|
const InternalKeyComparator& internal_comparator,
|
||||||
WritableFile* file,
|
WritableFile* file, CompressionType compression_type);
|
||||||
FlushBlockPolicyFactory* flush_block_policy_factory,
|
|
||||||
CompressionType compression_type);
|
|
||||||
|
|
||||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||||
~BlockBasedTableBuilder();
|
~BlockBasedTableBuilder();
|
||||||
@ -63,11 +64,17 @@ class BlockBasedTableBuilder : public TableBuilder {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
bool ok() const { return status().ok(); }
|
bool ok() const { return status().ok(); }
|
||||||
|
// Call block's Finish() method and then write the finalize block contents to
|
||||||
|
// file.
|
||||||
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
|
void WriteBlock(BlockBuilder* block, BlockHandle* handle);
|
||||||
|
// Directly write block content to the file.
|
||||||
|
void WriteBlock(const Slice& block_contents, BlockHandle* handle);
|
||||||
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
|
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
|
||||||
Status InsertBlockInCache(const Slice& block_contents,
|
Status InsertBlockInCache(const Slice& block_contents,
|
||||||
const CompressionType type, const BlockHandle* handle);
|
const CompressionType type,
|
||||||
|
const BlockHandle* handle);
|
||||||
struct Rep;
|
struct Rep;
|
||||||
|
class BlockBasedTablePropertiesCollector;
|
||||||
Rep* rep_;
|
Rep* rep_;
|
||||||
|
|
||||||
// Advanced operation: flush any buffered key/value pairs to file.
|
// Advanced operation: flush any buffered key/value pairs to file.
|
||||||
@ -82,4 +89,3 @@ class BlockBasedTableBuilder : public TableBuilder {
|
|||||||
};
|
};
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
@ -11,7 +11,10 @@
|
|||||||
#include "table/block_based_table_factory.h"
|
#include "table/block_based_table_factory.h"
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "rocksdb/flush_block_policy.h"
|
||||||
#include "table/block_based_table_builder.h"
|
#include "table/block_based_table_builder.h"
|
||||||
#include "table/block_based_table_reader.h"
|
#include "table/block_based_table_reader.h"
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
@ -40,12 +43,8 @@ Status BlockBasedTableFactory::NewTableReader(
|
|||||||
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
|
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
|
||||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
const Options& options, const InternalKeyComparator& internal_comparator,
|
||||||
WritableFile* file, CompressionType compression_type) const {
|
WritableFile* file, CompressionType compression_type) const {
|
||||||
auto flush_block_policy_factory =
|
auto table_builder = new BlockBasedTableBuilder(
|
||||||
table_options_.flush_block_policy_factory.get();
|
options, table_options_, internal_comparator, file, compression_type);
|
||||||
|
|
||||||
auto table_builder =
|
|
||||||
new BlockBasedTableBuilder(options, internal_comparator, file,
|
|
||||||
flush_block_policy_factory, compression_type);
|
|
||||||
|
|
||||||
return table_builder;
|
return table_builder;
|
||||||
}
|
}
|
||||||
@ -55,4 +54,7 @@ TableFactory* NewBlockBasedTableFactory(
|
|||||||
return new BlockBasedTableFactory(table_options);
|
return new BlockBasedTableFactory(table_options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const std::string BlockBasedTablePropertyNames::kIndexType =
|
||||||
|
"rocksdb.block.based.table.index.type";
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -8,12 +8,14 @@
|
|||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <memory>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include "rocksdb/cache.h"
|
#include <memory>
|
||||||
#include "rocksdb/env.h"
|
#include <utility>
|
||||||
#include "rocksdb/iterator.h"
|
|
||||||
#include "rocksdb/statistics.h"
|
#include "rocksdb/statistics.h"
|
||||||
|
#include "rocksdb/status.h"
|
||||||
|
#include "rocksdb/table.h"
|
||||||
#include "table/table_reader.h"
|
#include "table/table_reader.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
|
|
||||||
@ -21,14 +23,19 @@ namespace rocksdb {
|
|||||||
|
|
||||||
class Block;
|
class Block;
|
||||||
class BlockHandle;
|
class BlockHandle;
|
||||||
|
class Cache;
|
||||||
|
class FilterBlockReader;
|
||||||
class Footer;
|
class Footer;
|
||||||
struct Options;
|
class InternalKeyComparator;
|
||||||
|
class Iterator;
|
||||||
class RandomAccessFile;
|
class RandomAccessFile;
|
||||||
struct ReadOptions;
|
|
||||||
class TableCache;
|
class TableCache;
|
||||||
class TableReader;
|
class TableReader;
|
||||||
class FilterBlockReader;
|
class WritableFile;
|
||||||
struct BlockBasedTableOptions;
|
struct BlockBasedTableOptions;
|
||||||
|
struct EnvOptions;
|
||||||
|
struct Options;
|
||||||
|
struct ReadOptions;
|
||||||
|
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
|
|
||||||
@ -91,7 +98,9 @@ class BlockBasedTable : public TableReader {
|
|||||||
~BlockBasedTable();
|
~BlockBasedTable();
|
||||||
|
|
||||||
bool TEST_filter_block_preloaded() const;
|
bool TEST_filter_block_preloaded() const;
|
||||||
bool TEST_index_block_preloaded() const;
|
bool TEST_index_reader_preloaded() const;
|
||||||
|
// Implementation of IndexReader will be exposed to internal cc file only.
|
||||||
|
class IndexReader;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <class TValue>
|
template <class TValue>
|
||||||
@ -101,40 +110,51 @@ class BlockBasedTable : public TableReader {
|
|||||||
Rep* rep_;
|
Rep* rep_;
|
||||||
bool compaction_optimized_;
|
bool compaction_optimized_;
|
||||||
|
|
||||||
static Iterator* BlockReader(void*, const ReadOptions&,
|
static Iterator* DataBlockReader(void*, const ReadOptions&,
|
||||||
const EnvOptions& soptions,
|
const EnvOptions& soptions,
|
||||||
const InternalKeyComparator& icomparator,
|
const InternalKeyComparator& icomparator,
|
||||||
const Slice&, bool for_compaction);
|
const Slice&, bool for_compaction);
|
||||||
|
|
||||||
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
|
static Iterator* DataBlockReader(void*, const ReadOptions&, const Slice&,
|
||||||
bool* didIO, bool for_compaction = false);
|
bool* didIO, bool for_compaction = false);
|
||||||
|
|
||||||
// if `no_io == true`, we will not try to read filter from sst file
|
// For the following two functions:
|
||||||
// if it is not cached yet.
|
// if `no_io == true`, we will not try to read filter/index from sst file
|
||||||
|
// were they not present in cache yet.
|
||||||
CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
|
CachableEntry<FilterBlockReader> GetFilter(bool no_io = false) const;
|
||||||
|
|
||||||
Iterator* IndexBlockReader(const ReadOptions& options) const;
|
// Get the iterator from the index reader.
|
||||||
|
//
|
||||||
|
// Note: ErrorIterator with Status::Incomplete shall be returned if all the
|
||||||
|
// following conditions are met:
|
||||||
|
// 1. We enabled table_options.cache_index_and_filter_blocks.
|
||||||
|
// 2. index is not present in block cache.
|
||||||
|
// 3. We disallowed any io to be performed, that is, read_options ==
|
||||||
|
// kBlockCacheTier
|
||||||
|
Iterator* NewIndexIterator(const ReadOptions& read_options) const;
|
||||||
|
|
||||||
// Read the block, either from sst file or from cache. This method will try
|
// Read the block from the block caches (if set): block_cache and
|
||||||
// to read from cache only when block_cache is set or ReadOption doesn't
|
// block_cache_compressed.
|
||||||
// explicitly prohibit storage IO.
|
// On success, Status::OK will be returned and @block will be populated with
|
||||||
|
// pointer to the block as well as its block handle.
|
||||||
|
static Status GetDataBlockFromCache(
|
||||||
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
|
Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
|
||||||
|
const ReadOptions& read_options,
|
||||||
|
BlockBasedTable::CachableEntry<Block>* block);
|
||||||
|
// Put a raw block (maybe compressed) to the corresponding block caches.
|
||||||
|
// This method will perform decompression against raw_block if needed and then
|
||||||
|
// populate the block caches.
|
||||||
|
// On success, Status::OK will be returned; also @block will be populated with
|
||||||
|
// uncompressed block and its cache handle.
|
||||||
//
|
//
|
||||||
// If the block is read from cache, the statistics for cache miss/hit of the
|
// REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be
|
||||||
// the given type of block will be updated. User can specify
|
// responsible for releasing its memory if an error occurs.
|
||||||
// `block_cache_miss_ticker` and `block_cache_hit_ticker` for the statistics
|
static Status PutDataBlockToCache(
|
||||||
// update.
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
//
|
Cache* block_cache, Cache* block_cache_compressed,
|
||||||
// On success, the `result` parameter will be populated, which contains a
|
const ReadOptions& read_options, Statistics* statistics,
|
||||||
// pointer to the block and its cache handle, which will be nullptr if it's
|
CachableEntry<Block>* block, Block* raw_block);
|
||||||
// not read from the cache.
|
|
||||||
static Status GetBlock(const BlockBasedTable* table,
|
|
||||||
const BlockHandle& handle,
|
|
||||||
const ReadOptions& options,
|
|
||||||
bool for_compaction,
|
|
||||||
Tickers block_cache_miss_ticker,
|
|
||||||
Tickers block_cache_hit_ticker,
|
|
||||||
bool* didIO,
|
|
||||||
CachableEntry<Block>* result);
|
|
||||||
|
|
||||||
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
|
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
|
||||||
// after a call to Seek(key), until handle_result returns false.
|
// after a call to Seek(key), until handle_result returns false.
|
||||||
@ -144,6 +164,7 @@ class BlockBasedTable : public TableReader {
|
|||||||
|
|
||||||
void ReadMeta(const Footer& footer);
|
void ReadMeta(const Footer& footer);
|
||||||
void ReadFilter(const Slice& filter_handle_value);
|
void ReadFilter(const Slice& filter_handle_value);
|
||||||
|
std::pair<Status, IndexReader*> CreateIndexReader() const;
|
||||||
|
|
||||||
// Read the meta block from sst.
|
// Read the meta block from sst.
|
||||||
static Status ReadMetaBlock(
|
static Status ReadMetaBlock(
|
||||||
@ -159,10 +180,9 @@ class BlockBasedTable : public TableReader {
|
|||||||
|
|
||||||
static void SetupCacheKeyPrefix(Rep* rep);
|
static void SetupCacheKeyPrefix(Rep* rep);
|
||||||
|
|
||||||
explicit BlockBasedTable(Rep* rep) :
|
explicit BlockBasedTable(Rep* rep)
|
||||||
compaction_optimized_(false) {
|
: rep_(rep), compaction_optimized_(false) {}
|
||||||
rep_ = rep;
|
|
||||||
}
|
|
||||||
// Generate a cache key prefix from the file
|
// Generate a cache key prefix from the file
|
||||||
static void GenerateCachePrefix(Cache* cc,
|
static void GenerateCachePrefix(Cache* cc,
|
||||||
RandomAccessFile* file, char* buffer, size_t* size);
|
RandomAccessFile* file, char* buffer, size_t* size);
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
@ -16,8 +17,6 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
#include "rocksdb/statistics.h"
|
|
||||||
#include "util/statistics.h"
|
|
||||||
#include "db/memtable.h"
|
#include "db/memtable.h"
|
||||||
#include "db/write_batch_internal.h"
|
#include "db/write_batch_internal.h"
|
||||||
|
|
||||||
@ -25,11 +24,11 @@
|
|||||||
#include "rocksdb/db.h"
|
#include "rocksdb/db.h"
|
||||||
#include "rocksdb/env.h"
|
#include "rocksdb/env.h"
|
||||||
#include "rocksdb/iterator.h"
|
#include "rocksdb/iterator.h"
|
||||||
#include "rocksdb/slice_transform.h"
|
|
||||||
#include "rocksdb/memtablerep.h"
|
#include "rocksdb/memtablerep.h"
|
||||||
|
#include "rocksdb/slice_transform.h"
|
||||||
|
#include "rocksdb/statistics.h"
|
||||||
|
|
||||||
#include "table/block.h"
|
#include "table/block.h"
|
||||||
#include "table/meta_blocks.h"
|
|
||||||
#include "table/block_based_table_reader.h"
|
|
||||||
#include "table/block_based_table_builder.h"
|
#include "table/block_based_table_builder.h"
|
||||||
#include "table/block_based_table_factory.h"
|
#include "table/block_based_table_factory.h"
|
||||||
#include "table/block_based_table_reader.h"
|
#include "table/block_based_table_reader.h"
|
||||||
@ -39,6 +38,7 @@
|
|||||||
#include "table/plain_table_factory.h"
|
#include "table/plain_table_factory.h"
|
||||||
|
|
||||||
#include "util/random.h"
|
#include "util/random.h"
|
||||||
|
#include "util/statistics.h"
|
||||||
#include "util/testharness.h"
|
#include "util/testharness.h"
|
||||||
#include "util/testutil.h"
|
#include "util/testutil.h"
|
||||||
|
|
||||||
@ -1201,7 +1201,7 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) {
|
|||||||
// preloading filter/index blocks is enabled.
|
// preloading filter/index blocks is enabled.
|
||||||
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
|
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
|
||||||
ASSERT_TRUE(reader->TEST_filter_block_preloaded());
|
ASSERT_TRUE(reader->TEST_filter_block_preloaded());
|
||||||
ASSERT_TRUE(reader->TEST_index_block_preloaded());
|
ASSERT_TRUE(reader->TEST_index_reader_preloaded());
|
||||||
|
|
||||||
{
|
{
|
||||||
// nothing happens in the beginning
|
// nothing happens in the beginning
|
||||||
@ -1242,7 +1242,7 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) {
|
|||||||
// preloading filter/index blocks is prohibited.
|
// preloading filter/index blocks is prohibited.
|
||||||
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
|
auto reader = dynamic_cast<BlockBasedTable*>(c.table_reader());
|
||||||
ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
|
ASSERT_TRUE(!reader->TEST_filter_block_preloaded());
|
||||||
ASSERT_TRUE(!reader->TEST_index_block_preloaded());
|
ASSERT_TRUE(!reader->TEST_index_reader_preloaded());
|
||||||
|
|
||||||
// -- PART 1: Open with regular block cache.
|
// -- PART 1: Open with regular block cache.
|
||||||
// Since block_cache is disabled, no cache activities will be involved.
|
// Since block_cache is disabled, no cache activities will be involved.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user