Adding option to save PlainTable index and bloom filter in SST file.
Summary: Adding option to save PlainTable index and bloom filter in SST file. If there is no bloom block and/or index block, PlainTableReader builds new ones. Otherwise PlainTableReader just use these blocks. Test Plan: make all check Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19527
This commit is contained in:
parent
92d73cbe78
commit
9d70cce047
@ -23,6 +23,7 @@
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/meta_blocks.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/plain_table_reader.h"
|
||||
#include "util/hash.h"
|
||||
@ -70,10 +71,11 @@ class PlainTableDBTest {
|
||||
plain_table_options.huge_page_tlb_size = 0;
|
||||
plain_table_options.encoding_type = kPrefix;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = false;
|
||||
|
||||
options.table_factory.reset(NewPlainTableFactory(plain_table_options));
|
||||
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
|
||||
|
||||
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
|
||||
options.allow_mmap_reads = true;
|
||||
return options;
|
||||
@ -186,6 +188,8 @@ TEST(PlainTableDBTest, Empty) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
|
||||
}
|
||||
|
||||
extern const uint64_t kPlainTableMagicNumber;
|
||||
|
||||
class TestPlainTableReader : public PlainTableReader {
|
||||
public:
|
||||
TestPlainTableReader(const EnvOptions& storage_options,
|
||||
@ -195,7 +199,8 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
size_t index_sparseness,
|
||||
const TableProperties* table_properties,
|
||||
unique_ptr<RandomAccessFile>&& file,
|
||||
const Options& options, bool* expect_bloom_not_match)
|
||||
const Options& options, bool* expect_bloom_not_match,
|
||||
bool store_index_in_file)
|
||||
: PlainTableReader(options, std::move(file), storage_options, icomparator,
|
||||
encoding_type, file_size, table_properties),
|
||||
expect_bloom_not_match_(expect_bloom_not_match) {
|
||||
@ -206,6 +211,19 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
bloom_bits_per_key, hash_table_ratio, index_sparseness,
|
||||
2 * 1024 * 1024);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
TableProperties* props = const_cast<TableProperties*>(table_properties);
|
||||
if (store_index_in_file) {
|
||||
auto bloom_version_ptr = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kBloomVersion);
|
||||
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
|
||||
ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
|
||||
if (options.bloom_locality > 0) {
|
||||
auto num_blocks_ptr = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kNumBloomBlocks);
|
||||
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~TestPlainTableReader() {}
|
||||
@ -213,7 +231,11 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
private:
|
||||
virtual bool MatchBloom(uint32_t hash) const override {
|
||||
bool ret = PlainTableReader::MatchBloom(hash);
|
||||
ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
|
||||
if (*expect_bloom_not_match_) {
|
||||
ASSERT_TRUE(!ret);
|
||||
} else {
|
||||
ASSERT_TRUE(ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
bool* expect_bloom_not_match_;
|
||||
@ -228,6 +250,7 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
bloom_bits_per_key_(options.bloom_bits_per_key),
|
||||
hash_table_ratio_(options.hash_table_ratio),
|
||||
index_sparseness_(options.index_sparseness),
|
||||
store_index_in_file_(options.store_index_in_file),
|
||||
expect_bloom_not_match_(expect_bloom_not_match) {}
|
||||
|
||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||
@ -239,6 +262,20 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
options.env, options.info_log.get(), &props);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
if (store_index_in_file_) {
|
||||
BlockHandle bloom_block_handle;
|
||||
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
|
||||
options.env, BloomBlockBuilder::kBloomBlock,
|
||||
&bloom_block_handle);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
BlockHandle index_block_handle;
|
||||
s = FindMetaBlock(
|
||||
file.get(), file_size, kPlainTableMagicNumber, options.env,
|
||||
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
|
||||
ASSERT_TRUE(s.ok());
|
||||
}
|
||||
|
||||
auto& user_props = props->user_collected_properties;
|
||||
auto encoding_type_prop =
|
||||
user_props.find(PlainTablePropertyNames::kEncodingType);
|
||||
@ -249,7 +286,8 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
|
||||
soptions, internal_comparator, encoding_type, file_size,
|
||||
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
|
||||
std::move(file), options, expect_bloom_not_match_));
|
||||
std::move(file), options, expect_bloom_not_match_,
|
||||
store_index_in_file_));
|
||||
|
||||
*table = std::move(new_reader);
|
||||
return s;
|
||||
@ -259,6 +297,7 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
int bloom_bits_per_key_;
|
||||
double hash_table_ratio_;
|
||||
size_t index_sparseness_;
|
||||
bool store_index_in_file_;
|
||||
bool* expect_bloom_not_match_;
|
||||
};
|
||||
|
||||
@ -268,6 +307,12 @@ TEST(PlainTableDBTest, Flush) {
|
||||
for (EncodingType encoding_type : {kPlain, kPrefix}) {
|
||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||
for (int store_index_in_file = 0; store_index_in_file <= 1;
|
||||
++store_index_in_file) {
|
||||
if (!bloom_bits && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Options options = CurrentOptions();
|
||||
options.create_if_missing = true;
|
||||
// Set only one bucket to force bucket conflict.
|
||||
@ -283,6 +328,7 @@ TEST(PlainTableDBTest, Flush) {
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
|
||||
options.table_factory.reset(
|
||||
NewPlainTableFactory(plain_table_options));
|
||||
@ -295,12 +341,12 @@ TEST(PlainTableDBTest, Flush) {
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
|
||||
options.table_factory.reset(
|
||||
NewPlainTableFactory(plain_table_options));
|
||||
}
|
||||
DestroyAndReopen(&options);
|
||||
|
||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||
ASSERT_OK(Put("0000000000000bar", "v2"));
|
||||
ASSERT_OK(Put("1000000000000foo", "v3"));
|
||||
@ -311,17 +357,26 @@ TEST(PlainTableDBTest, Flush) {
|
||||
ASSERT_EQ(1U, ptc.size());
|
||||
auto row = ptc.begin();
|
||||
auto tp = row->second;
|
||||
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
|
||||
"plain_table_hash_table_size"));
|
||||
|
||||
if (!store_index_in_file) {
|
||||
ASSERT_EQ(total_order ? "4" : "12",
|
||||
(tp->user_collected_properties)
|
||||
.at("plain_table_hash_table_size"));
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_sub_index_size"));
|
||||
|
||||
} else {
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_hash_table_size"));
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_sub_index_size"));
|
||||
}
|
||||
ASSERT_EQ("v3", Get("1000000000000foo"));
|
||||
ASSERT_EQ("v2", Get("0000000000000bar"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(PlainTableDBTest, Flush2) {
|
||||
@ -330,7 +385,15 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
for (EncodingType encoding_type : {kPlain, kPrefix}) {
|
||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||
if (encoding_type == kPrefix && total_order == 1) {
|
||||
for (int store_index_in_file = 0; store_index_in_file <= 1;
|
||||
++store_index_in_file) {
|
||||
if (encoding_type == kPrefix && total_order) {
|
||||
continue;
|
||||
}
|
||||
if (!bloom_bits && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
if (total_order && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
bool expect_bloom_not_match = false;
|
||||
@ -338,30 +401,23 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
options.create_if_missing = true;
|
||||
// Set only one bucket to force bucket conflict.
|
||||
// Test index interval for the same prefix to be 1, 2 and 4
|
||||
PlainTableOptions plain_table_options;
|
||||
if (total_order) {
|
||||
options.prefix_extractor = nullptr;
|
||||
PlainTableOptions plain_table_options;
|
||||
plain_table_options.user_key_len = 0;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.hash_table_ratio = 0;
|
||||
plain_table_options.index_sparseness = 2;
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
|
||||
options.table_factory.reset(new TestPlainTableFactory(
|
||||
&expect_bloom_not_match, plain_table_options));
|
||||
} else {
|
||||
PlainTableOptions plain_table_options;
|
||||
plain_table_options.user_key_len = 0;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.hash_table_ratio = 0.75;
|
||||
plain_table_options.index_sparseness = 16;
|
||||
}
|
||||
plain_table_options.user_key_len = kPlainTableVariableLength;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
options.table_factory.reset(new TestPlainTableFactory(
|
||||
&expect_bloom_not_match, plain_table_options));
|
||||
}
|
||||
|
||||
DestroyAndReopen(&options);
|
||||
ASSERT_OK(Put("0000000000000bar", "b"));
|
||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||
@ -389,7 +445,6 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
// Neither key nor value should exist.
|
||||
expect_bloom_not_match = true;
|
||||
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
|
||||
|
||||
// Key doesn't exist any more but prefix exists.
|
||||
if (total_order) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
|
||||
@ -401,6 +456,7 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(PlainTableDBTest, Iterator) {
|
||||
|
@ -119,6 +119,8 @@ enum EncodingType : char {
|
||||
struct PlainTablePropertyNames {
|
||||
static const std::string kPrefixExtractorName;
|
||||
static const std::string kEncodingType;
|
||||
static const std::string kBloomVersion;
|
||||
static const std::string kNumBloomBlocks;
|
||||
};
|
||||
|
||||
const uint32_t kPlainTableVariableLength = 0;
|
||||
@ -166,6 +168,11 @@ EncodingType encoding_type = kPlain;
|
||||
// @full_scan_mode: mode for reading the whole file one record by one without
|
||||
// using the index.
|
||||
bool full_scan_mode = false;
|
||||
|
||||
// @store_index_in_file: compute plain table index and bloom filter during
|
||||
// file building and store it in file. When reading
|
||||
// file, index will be mmaped instead of recomputation.
|
||||
bool store_index_in_file = false;
|
||||
};
|
||||
|
||||
// -- Plain Table with prefix-only seek
|
||||
|
23
table/bloom_block.cc
Normal file
23
table/bloom_block.cc
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/bloom_block.h"
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/slice.h"
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) {
|
||||
for (auto hash : keys_hashes) {
|
||||
bloom_.AddHash(hash);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the raw bloom filter bits to be written out as the bloom meta block.
// NOTE(review): the returned Slice presumably points into bloom_'s internal
// buffer and is only valid while this builder (and its arena) is alive —
// confirm against DynamicBloom::GetRawData.
Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
|
||||
|
||||
const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
|
||||
} // namespace rocksdb
|
37
table/bloom_block.h
Normal file
37
table/bloom_block.h
Normal file
@ -0,0 +1,37 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
class Logger;
|
||||
|
||||
// Builds the bloom filter meta block that PlainTable can persist in the SST
// file (registered in the metaindex under kBloomBlock) so readers can use the
// stored filter instead of rebuilding it.
class BloomBlockBuilder {
 public:
  // Name under which the bloom block is registered in the metaindex.
  static const std::string kBloomBlock;

  // @num_probes: number of hash probes per key used by the underlying
  // DynamicBloom. The second constructor argument is presumably a logger —
  // confirm against DynamicBloom's constructor.
  explicit BloomBlockBuilder(uint32_t num_probes = 6)
      : bloom_(num_probes, nullptr) {}

  // Size the underlying bloom filter; allocation is served from `arena`.
  // `huge_page_tlb_size` and `locality` are forwarded unchanged to
  // DynamicBloom::SetTotalBits.
  void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
                    size_t huge_page_tlb_size, Logger* logger) {
    bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
                        logger);
  }

  // Number of blocks the filter is split into (forwarded from DynamicBloom).
  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }

  // Fold a batch of precomputed key (or prefix) hashes into the filter.
  // NOTE(review): takes the vector by value — callers pay a copy.
  void AddKeysHashes(const std::vector<uint32_t> keys_hashes);

  // Returns the serialized filter bits, ready to be written as a block.
  Slice Finish();

 private:
  DynamicBloom bloom_;
};
|
||||
|
||||
}; // namespace rocksdb
|
@ -273,4 +273,72 @@ Status FindMetaBlock(Iterator* meta_index_iter,
|
||||
}
|
||||
}
|
||||
|
||||
// Locate the meta block named `meta_block_name` in the table stored in
// `file` (of `file_size` bytes): read the footer, load the metaindex block,
// and look the name up there. On success *block_handle is set to the block's
// location; otherwise the first failing Status is returned.
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockHandle* block_handle) {
  // The footer tells us where the metaindex block lives.
  Footer footer(table_magic_number);
  Status s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  // Load the metaindex block; checksum verification is disabled here.
  ReadOptions read_options;
  read_options.verify_checksums = false;
  BlockContents metaindex_contents;
  s = ReadBlockContents(file, footer, read_options, footer.metaindex_handle(),
                        &metaindex_contents, env, false);
  if (!s.ok()) {
    return s;
  }

  // Scan the metaindex entries for the requested block name.
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> meta_iter(
      metaindex_block.NewIterator(BytewiseComparator()));
  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
}
|
||||
|
||||
// Read the meta block named `meta_block_name` from `file` into `contents`.
// Pipeline: read footer -> load metaindex block -> look up the block handle
// by name -> read the block itself. The first failing step's Status is
// returned unchanged.
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockContents* contents) {
  Footer footer(table_magic_number);
  auto s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  // Reading metaindex block
  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  ReadOptions read_options;
  // Checksum verification is deliberately disabled for both reads below.
  read_options.verify_checksums = false;
  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
                        &metaindex_contents, env, false);
  if (!s.ok()) {
    return s;
  }

  // Finding metablock
  Block metaindex_block(metaindex_contents);

  std::unique_ptr<Iterator> meta_iter;
  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));

  BlockHandle block_handle;
  s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);

  if (!s.ok()) {
    return s;
  }

  // Reading metablock (reuses read_options, so checksums stay disabled)
  s = ReadBlockContents(file, footer, read_options, block_handle, contents, env,
                        false);

  return s;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/format.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -128,4 +129,18 @@ Status FindMetaBlock(Iterator* meta_index_iter,
|
||||
const std::string& meta_block_name,
|
||||
BlockHandle* block_handle);
|
||||
|
||||
// Find the meta block
|
||||
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
|
||||
uint64_t table_magic_number, Env* env,
|
||||
const std::string& meta_block_name,
|
||||
BlockHandle* block_handle);
|
||||
|
||||
// Read the specified meta block with name meta_block_name
|
||||
// from `file` and initialize `contents` with contents of this block.
|
||||
// Return Status::OK in case of success.
|
||||
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
|
||||
uint64_t table_magic_number, Env* env,
|
||||
const std::string& meta_block_name,
|
||||
BlockContents* contents);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -6,6 +6,7 @@
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "table/plain_table_builder.h"
|
||||
|
||||
#include <string>
|
||||
#include <assert.h>
|
||||
#include <map>
|
||||
|
||||
@ -17,6 +18,8 @@
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_index.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/meta_blocks.h"
|
||||
@ -54,20 +57,36 @@ Status WriteBlock(
|
||||
extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
|
||||
extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
|
||||
|
||||
PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file,
|
||||
uint32_t user_key_len,
|
||||
EncodingType encoding_type,
|
||||
size_t index_sparseness)
|
||||
PlainTableBuilder::PlainTableBuilder(
|
||||
const Options& options, WritableFile* file, uint32_t user_key_len,
|
||||
EncodingType encoding_type, size_t index_sparseness,
|
||||
uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
|
||||
double hash_table_ratio, bool store_index_in_file)
|
||||
: options_(options),
|
||||
bloom_block_(num_probes),
|
||||
file_(file),
|
||||
bloom_bits_per_key_(bloom_bits_per_key),
|
||||
huge_page_tlb_size_(huge_page_tlb_size),
|
||||
encoder_(encoding_type, user_key_len, options.prefix_extractor.get(),
|
||||
index_sparseness) {
|
||||
index_sparseness),
|
||||
store_index_in_file_(store_index_in_file),
|
||||
prefix_extractor_(options.prefix_extractor.get()) {
|
||||
// Build index block and save it in the file if hash_table_ratio > 0
|
||||
if (store_index_in_file_) {
|
||||
assert(hash_table_ratio > 0 || IsTotalOrderMode());
|
||||
index_builder_.reset(
|
||||
new PlainTableIndexBuilder(&arena_, options, index_sparseness,
|
||||
hash_table_ratio, huge_page_tlb_size_));
|
||||
assert(bloom_bits_per_key_ > 0);
|
||||
properties_.user_collected_properties
|
||||
[PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
|
||||
}
|
||||
|
||||
properties_.fixed_key_len = user_key_len;
|
||||
|
||||
// for plain table, we put all the data in a big chuck.
|
||||
properties_.num_data_blocks = 1;
|
||||
// emphasize that currently plain table doesn't have persistent index or
|
||||
// filter block.
|
||||
// Fill it later if store_index_in_file_ == true
|
||||
properties_.index_size = 0;
|
||||
properties_.filter_size = 0;
|
||||
// To support roll-back to previous version, now still use version 0 for
|
||||
@ -100,9 +119,28 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
char meta_bytes_buf[6];
|
||||
size_t meta_bytes_buf_size = 0;
|
||||
|
||||
ParsedInternalKey internal_key;
|
||||
ParseInternalKey(key, &internal_key);
|
||||
|
||||
// Store key hash
|
||||
if (store_index_in_file_) {
|
||||
if (options_.prefix_extractor.get() == nullptr) {
|
||||
keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
|
||||
} else {
|
||||
Slice prefix =
|
||||
options_.prefix_extractor->Transform(internal_key.user_key);
|
||||
keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
|
||||
}
|
||||
}
|
||||
|
||||
// Write value
|
||||
auto prev_offset = offset_;
|
||||
// Write out the key
|
||||
encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
|
||||
&meta_bytes_buf_size);
|
||||
if (SaveIndexInFile()) {
|
||||
index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
|
||||
}
|
||||
|
||||
// Write value length
|
||||
int value_size = value.size();
|
||||
@ -134,11 +172,50 @@ Status PlainTableBuilder::Finish() {
|
||||
properties_.data_size = offset_;
|
||||
|
||||
// Write the following blocks
|
||||
// 1. [meta block: properties]
|
||||
// 2. [metaindex block]
|
||||
// 3. [footer]
|
||||
// 1. [meta block: bloom] - optional
|
||||
// 2. [meta block: index] - optional
|
||||
// 3. [meta block: properties]
|
||||
// 4. [metaindex block]
|
||||
// 5. [footer]
|
||||
|
||||
MetaIndexBuilder meta_index_builer;
|
||||
|
||||
if (store_index_in_file_ && (properties_.num_entries > 0)) {
|
||||
bloom_block_.SetTotalBits(
|
||||
&arena_, properties_.num_entries * bloom_bits_per_key_,
|
||||
options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get());
|
||||
|
||||
PutVarint32(&properties_.user_collected_properties
|
||||
[PlainTablePropertyNames::kNumBloomBlocks],
|
||||
bloom_block_.GetNumBlocks());
|
||||
|
||||
bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
|
||||
BlockHandle bloom_block_handle;
|
||||
auto finish_result = bloom_block_.Finish();
|
||||
|
||||
properties_.filter_size = finish_result.size();
|
||||
auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
BlockHandle index_block_handle;
|
||||
finish_result = index_builder_->Finish();
|
||||
|
||||
properties_.index_size = finish_result.size();
|
||||
s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
|
||||
meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
|
||||
index_block_handle);
|
||||
}
|
||||
|
||||
// Calculate bloom block size and index block size
|
||||
PropertyBlockBuilder property_block_builder;
|
||||
// -- Add basic properties
|
||||
property_block_builder.AddTableProperty(properties_);
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include "table/plain_table_key_coding.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_index.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -30,7 +32,10 @@ class PlainTableBuilder: public TableBuilder {
|
||||
// that the caller does not know which level the output file will reside.
|
||||
PlainTableBuilder(const Options& options, WritableFile* file,
|
||||
uint32_t user_key_size, EncodingType encoding_type,
|
||||
size_t index_sparseness);
|
||||
size_t index_sparseness, uint32_t bloom_bits_per_key,
|
||||
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
|
||||
double hash_table_ratio = 0,
|
||||
bool store_index_in_file = false);
|
||||
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
~PlainTableBuilder();
|
||||
@ -62,18 +67,59 @@ class PlainTableBuilder: public TableBuilder {
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
uint64_t FileSize() const override;
|
||||
|
||||
bool SaveIndexInFile() const { return store_index_in_file_; }
|
||||
|
||||
private:
|
||||
Arena arena_;
|
||||
Options options_;
|
||||
std::vector<std::unique_ptr<TablePropertiesCollector>>
|
||||
table_properties_collectors_;
|
||||
|
||||
BloomBlockBuilder bloom_block_;
|
||||
std::unique_ptr<PlainTableIndexBuilder> index_builder_;
|
||||
|
||||
WritableFile* file_;
|
||||
uint64_t offset_ = 0;
|
||||
uint32_t bloom_bits_per_key_;
|
||||
uint32_t huge_page_tlb_size_;
|
||||
Status status_;
|
||||
TableProperties properties_;
|
||||
PlainTableKeyEncoder encoder_;
|
||||
|
||||
bool store_index_in_file_;
|
||||
|
||||
std::vector<uint32_t> keys_or_prefixes_hashes_;
|
||||
bool closed_ = false; // Either Finish() or Abandon() has been called.
|
||||
|
||||
const SliceTransform* prefix_extractor_;
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
Slice GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set.
|
||||
// In that case,
|
||||
// it falls back to pure binary search and
|
||||
// total iterator seek is supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
||||
|
||||
// No copying allowed
|
||||
PlainTableBuilder(const PlainTableBuilder&) = delete;
|
||||
void operator=(const PlainTableBuilder&) = delete;
|
||||
|
@ -30,7 +30,9 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
|
||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
||||
WritableFile* file, CompressionType compression_type) const {
|
||||
return new PlainTableBuilder(options, file, user_key_len_, encoding_type_,
|
||||
index_sparseness_);
|
||||
index_sparseness_, bloom_bits_per_key_, 6,
|
||||
huge_page_tlb_size_, hash_table_ratio_,
|
||||
store_index_in_file_);
|
||||
}
|
||||
|
||||
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
|
||||
@ -43,5 +45,11 @@ const std::string PlainTablePropertyNames::kPrefixExtractorName =
|
||||
const std::string PlainTablePropertyNames::kEncodingType =
|
||||
"rocksdb.plain.table.encoding.type";
|
||||
|
||||
const std::string PlainTablePropertyNames::kBloomVersion =
|
||||
"rocksdb.plain.table.bloom.version";
|
||||
|
||||
const std::string PlainTablePropertyNames::kNumBloomBlocks =
|
||||
"rocksdb.plain.table.bloom.numblocks";
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
|
@ -151,7 +151,8 @@ class PlainTableFactory : public TableFactory {
|
||||
index_sparseness_(options.index_sparseness),
|
||||
huge_page_tlb_size_(options.huge_page_tlb_size),
|
||||
encoding_type_(options.encoding_type),
|
||||
full_scan_mode_(options.full_scan_mode) {}
|
||||
full_scan_mode_(options.full_scan_mode),
|
||||
store_index_in_file_(options.store_index_in_file) {}
|
||||
const char* Name() const override { return "PlainTable"; }
|
||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
@ -173,6 +174,7 @@ class PlainTableFactory : public TableFactory {
|
||||
size_t huge_page_tlb_size_;
|
||||
EncodingType encoding_type_;
|
||||
bool full_scan_mode_;
|
||||
bool store_index_in_file_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
196
table/plain_table_index.cc
Normal file
196
table/plain_table_index.cc
Normal file
@ -0,0 +1,196 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/plain_table_index.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/hash.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
// Map a 32-bit hash onto one of `num_buckets` hash-table buckets.
// Requires num_buckets > 0.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}  // namespace
|
||||
|
||||
// Initialize the index from its serialized form:
//   [varint32 index_size][varint32 num_prefixes][index array][sub-index]
// No bytes are copied: index_ and sub_index_ point directly into `data`, so
// the caller must keep the backing memory alive for this object's lifetime.
void PlainTableIndex::InitFromRawData(Slice data) {
  // Bug fix: GetVarint32 both parses a value AND advances `data`. The
  // original code called it inside assert(), so under NDEBUG the parse was
  // compiled out entirely and the index was never initialized. Perform the
  // calls unconditionally and assert only on the results.
  bool ok = GetVarint32(&data, &index_size_);
  assert(ok);
  assert(index_size_ > 0);
  ok = GetVarint32(&data, &num_prefixes_);
  assert(ok);
  (void)ok;  // silence unused-variable warning when asserts are disabled
  sub_index_size_ = data.size() - index_size_ * kOffsetLen;

  char* index_data_begin = const_cast<char*>(data.data());
  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
}
|
||||
|
||||
// Look up the bucket for `prefix_hash` and classify its contents.
// *bucket_value receives the stored entry (with the sub-index tag bit
// stripped when present). Returns:
//   kSubindex         - entry points into the sub-index,
//   kNoPrefixForBucket - bucket is empty (sentinel >= kMaxFileSize),
//   kDirectToFile     - entry is a direct file offset.
PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
    uint32_t prefix_hash, uint32_t* bucket_value) const {
  const int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
  *bucket_value = index_[bucket];

  const bool tagged_as_sub_index =
      (*bucket_value & kSubIndexMask) == kSubIndexMask;
  if (tagged_as_sub_index) {
    // Strip the tag bit so the caller gets a plain sub-index offset.
    *bucket_value ^= kSubIndexMask;
    return kSubindex;
  }
  // Values at or beyond kMaxFileSize mark an empty bucket; anything smaller
  // points directly into the data file.
  return (*bucket_value >= kMaxFileSize) ? kNoPrefixForBucket : kDirectToFile;
}
|
||||
|
||||
// Append one (hash, offset) record, allocating a new fixed-size group
// whenever the current group fills up.
void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash,
                                                        uint32_t offset) {
  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
    // Current group is full; start a fresh one.
    current_group_ = AllocateNewGroup();
    num_records_in_current_group_ = 0;
  }
  IndexRecord& rec = current_group_[num_records_in_current_group_++];
  rec.hash = hash;
  rec.offset = offset;
  rec.next = nullptr;
}
|
||||
|
||||
// Register one key (located at file offset `key_offset`) under its prefix.
// An index record is emitted for the first key of each new prefix, and then
// again every `index_sparseness_` keys within the same prefix (tracked via
// the due_index_ flag). Also maintains the keys-per-prefix histogram; note
// the final prefix's count is flushed later, in Finish().
void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
                                          uint64_t key_offset) {
  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
    // New prefix encountered: close out the previous prefix's statistics
    // and reset per-prefix state.
    ++num_prefixes_;
    if (!is_first_record_) {
      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
    }
    num_keys_per_prefix_ = 0;
    prev_key_prefix_ = key_prefix_slice.ToString();
    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
    due_index_ = true;
  }

  if (due_index_) {
    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
    due_index_ = false;
  }

  num_keys_per_prefix_++;
  // index_sparseness_ == 0 means "index every key"; otherwise schedule the
  // next index record after each sparseness interval.
  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
    due_index_ = true;
  }
  is_first_record_ = false;
}
|
||||
|
||||
// Serialize the accumulated index into a single block and return it.
// Sizes the bucket array (AllocateIndex), distributes records into buckets
// (BucketizeIndexes), logs the keys-per-prefix histogram, then encodes
// everything (FillIndexes).
Slice PlainTableIndexBuilder::Finish() {
  AllocateIndex();
  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);

  // Flush the last prefix's key count, which AddKeyPrefix never closes out.
  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      keys_per_prefix_hist_.ToString().c_str());

  // From the temp data structure, populate indexes.
  return FillIndexes(hash_to_offsets, entries_per_bucket);
}
|
||||
|
||||
void PlainTableIndexBuilder::AllocateIndex() {
|
||||
if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
|
||||
// Fall back to pure binary search if the user fails to specify a prefix
|
||||
// extractor.
|
||||
index_size_ = 1;
|
||||
} else {
|
||||
double hash_table_size_multipier = 1.0 / hash_table_ratio_;
|
||||
index_size_ = num_prefixes_ * hash_table_size_multipier + 1;
|
||||
assert(index_size_ > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void PlainTableIndexBuilder::BucketizeIndexes(
|
||||
std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket) {
|
||||
bool first = true;
|
||||
uint32_t prev_hash = 0;
|
||||
size_t num_records = record_list_.GetNumRecords();
|
||||
for (size_t i = 0; i < num_records; i++) {
|
||||
IndexRecord* index_record = record_list_.At(i);
|
||||
uint32_t cur_hash = index_record->hash;
|
||||
if (first || prev_hash != cur_hash) {
|
||||
prev_hash = cur_hash;
|
||||
first = false;
|
||||
}
|
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
|
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
|
||||
index_record->next = prev_bucket_head;
|
||||
(*hash_to_offsets)[bucket] = index_record;
|
||||
(*entries_per_bucket)[bucket]++;
|
||||
}
|
||||
|
||||
sub_index_size_ = 0;
|
||||
for (auto entry_count : *entries_per_bucket) {
|
||||
if (entry_count <= 1) {
|
||||
continue;
|
||||
}
|
||||
// Only buckets with more than 1 entry will have subindex.
|
||||
sub_index_size_ += VarintLength(entry_count);
|
||||
// total bytes needed to store these entries' in-file offsets.
|
||||
sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
|
||||
}
|
||||
}
|
||||
|
||||
Slice PlainTableIndexBuilder::FillIndexes(
|
||||
const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket) {
|
||||
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
||||
sub_index_size_);
|
||||
auto total_allocate_size = GetTotalSize();
|
||||
char* allocated = arena_->AllocateAligned(
|
||||
total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
|
||||
|
||||
auto temp_ptr = EncodeVarint32(allocated, index_size_);
|
||||
uint32_t* index =
|
||||
reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
|
||||
char* sub_index = reinterpret_cast<char*>(index + index_size_);
|
||||
|
||||
size_t sub_index_offset = 0;
|
||||
for (uint32_t i = 0; i < index_size_; i++) {
|
||||
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
||||
switch (num_keys_for_bucket) {
|
||||
case 0:
|
||||
// No key for bucket
|
||||
index[i] = PlainTableIndex::kMaxFileSize;
|
||||
break;
|
||||
case 1:
|
||||
// point directly to the file offset
|
||||
index[i] = hash_to_offsets[i]->offset;
|
||||
break;
|
||||
default:
|
||||
// point to second level indexes.
|
||||
index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
|
||||
char* prev_ptr = &sub_index[sub_index_offset];
|
||||
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
|
||||
sub_index_offset += (cur_ptr - prev_ptr);
|
||||
char* sub_index_pos = &sub_index[sub_index_offset];
|
||||
IndexRecord* record = hash_to_offsets[i];
|
||||
int j;
|
||||
for (j = num_keys_for_bucket - 1; j >= 0 && record;
|
||||
j--, record = record->next) {
|
||||
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
|
||||
}
|
||||
assert(j == -1 && record == nullptr);
|
||||
sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
|
||||
assert(sub_index_offset <= sub_index_size_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(sub_index_offset == sub_index_size_);
|
||||
|
||||
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
|
||||
index_size_, sub_index_size_);
|
||||
return Slice(allocated, GetTotalSize());
|
||||
}
|
||||
|
||||
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
|
||||
"PlainTableIndexBlock";
|
||||
}; // namespace rocksdb
|
221
table/plain_table_index.h
Normal file
221
table/plain_table_index.h
Normal file
@ -0,0 +1,221 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/murmurhash.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/histogram.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// PlainTableIndex contains buckets size of index_size_, each is a
|
||||
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
||||
// and the first bit of the integer indicates type of the offset.
|
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
|
||||
// hashing this prefix), whose first row starts from this offset of the
|
||||
// file.
|
||||
// 1 indicates that the bucket contains more than one prefixes, or there
|
||||
// are too many rows for one prefix so we need a binary search for it. In
|
||||
// this case, the offset indicates the offset of sub_index_ holding the
|
||||
// binary search indexes of keys for those rows. Those binary search indexes
|
||||
// are organized in this way:
|
||||
//
|
||||
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
||||
// it, there are N 32-bit integers, each points of an offset of the file,
|
||||
// which
|
||||
// points to starting of a row. Those offsets need to be guaranteed to be in
|
||||
// ascending order so the keys they are pointing to are also in ascending
|
||||
// order
|
||||
// to make sure we can use them to do binary searches. Below is visual
|
||||
// presentation of a bucket.
|
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
class PlainTableIndex {
|
||||
public:
|
||||
enum IndexSearchResult {
|
||||
kNoPrefixForBucket = 0,
|
||||
kDirectToFile = 1,
|
||||
kSubindex = 2
|
||||
};
|
||||
|
||||
explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
|
||||
|
||||
PlainTableIndex()
|
||||
: index_size_(0),
|
||||
sub_index_size_(0),
|
||||
num_prefixes_(0),
|
||||
index_(nullptr),
|
||||
sub_index_(nullptr) {}
|
||||
|
||||
IndexSearchResult GetOffset(uint32_t prefix_hash,
|
||||
uint32_t* bucket_value) const;
|
||||
|
||||
void InitFromRawData(Slice data);
|
||||
|
||||
const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
|
||||
uint32_t* upper_bound) const {
|
||||
const char* index_ptr = &sub_index_[offset];
|
||||
return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
|
||||
}
|
||||
|
||||
uint32_t GetIndexSize() const { return index_size_; }
|
||||
|
||||
uint32_t GetSubIndexSize() const { return sub_index_size_; }
|
||||
|
||||
uint32_t GetNumPrefixes() const { return num_prefixes_; }
|
||||
|
||||
static const uint64_t kMaxFileSize = (1u << 31) - 1;
|
||||
static const uint32_t kSubIndexMask = 0x80000000;
|
||||
static const size_t kOffsetLen = sizeof(uint32_t);
|
||||
|
||||
private:
|
||||
uint32_t index_size_;
|
||||
size_t sub_index_size_;
|
||||
uint32_t num_prefixes_;
|
||||
|
||||
uint32_t* index_;
|
||||
char* sub_index_;
|
||||
};
|
||||
|
||||
// PlainTableIndexBuilder is used to create plain table index.
|
||||
// After calling Finish(), it returns Slice, which is usually
|
||||
// used either to initialize PlainTableIndex or
|
||||
// to save index to sst file.
|
||||
// For more details about the index, please refer to:
|
||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||
// #wiki-in-memory-index-format
|
||||
class PlainTableIndexBuilder {
|
||||
public:
|
||||
PlainTableIndexBuilder(Arena* arena, const Options& options,
|
||||
uint32_t index_sparseness, double hash_table_ratio,
|
||||
double huge_page_tlb_size)
|
||||
: arena_(arena),
|
||||
options_(options),
|
||||
record_list_(kRecordsPerGroup),
|
||||
is_first_record_(true),
|
||||
due_index_(false),
|
||||
num_prefixes_(0),
|
||||
num_keys_per_prefix_(0),
|
||||
prev_key_prefix_hash_(0),
|
||||
index_sparseness_(index_sparseness),
|
||||
prefix_extractor_(options.prefix_extractor.get()),
|
||||
hash_table_ratio_(hash_table_ratio),
|
||||
huge_page_tlb_size_(huge_page_tlb_size) {}
|
||||
|
||||
void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset);
|
||||
|
||||
Slice Finish();
|
||||
|
||||
uint32_t GetTotalSize() const {
|
||||
return VarintLength(index_size_) + VarintLength(num_prefixes_) +
|
||||
PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
|
||||
}
|
||||
|
||||
static const std::string kPlainTableIndexBlock;
|
||||
|
||||
private:
|
||||
struct IndexRecord {
|
||||
uint32_t hash; // hash of the prefix
|
||||
uint32_t offset; // offset of a row
|
||||
IndexRecord* next;
|
||||
};
|
||||
|
||||
// Helper class to track all the index records
|
||||
class IndexRecordList {
|
||||
public:
|
||||
explicit IndexRecordList(size_t num_records_per_group)
|
||||
: kNumRecordsPerGroup(num_records_per_group),
|
||||
current_group_(nullptr),
|
||||
num_records_in_current_group_(num_records_per_group) {}
|
||||
|
||||
~IndexRecordList() {
|
||||
for (size_t i = 0; i < groups_.size(); i++) {
|
||||
delete[] groups_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void AddRecord(murmur_t hash, uint32_t offset);
|
||||
|
||||
size_t GetNumRecords() const {
|
||||
return (groups_.size() - 1) * kNumRecordsPerGroup +
|
||||
num_records_in_current_group_;
|
||||
}
|
||||
IndexRecord* At(size_t index) {
|
||||
return &(groups_[index / kNumRecordsPerGroup]
|
||||
[index % kNumRecordsPerGroup]);
|
||||
}
|
||||
|
||||
private:
|
||||
IndexRecord* AllocateNewGroup() {
|
||||
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
|
||||
groups_.push_back(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Each group in `groups_` contains fix-sized records (determined by
|
||||
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
|
||||
// occurs.
|
||||
const size_t kNumRecordsPerGroup;
|
||||
IndexRecord* current_group_;
|
||||
// List of arrays allocated
|
||||
std::vector<IndexRecord*> groups_;
|
||||
size_t num_records_in_current_group_;
|
||||
};
|
||||
|
||||
void AllocateIndex();
|
||||
|
||||
// Internal helper function to bucket index record list to hash buckets.
|
||||
void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket);
|
||||
|
||||
// Internal helper class to fill the indexes and bloom filters to internal
|
||||
// data structures.
|
||||
Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket);
|
||||
|
||||
Arena* arena_;
|
||||
Options options_;
|
||||
HistogramImpl keys_per_prefix_hist_;
|
||||
IndexRecordList record_list_;
|
||||
bool is_first_record_;
|
||||
bool due_index_;
|
||||
uint32_t num_prefixes_;
|
||||
uint32_t num_keys_per_prefix_;
|
||||
|
||||
uint32_t prev_key_prefix_hash_;
|
||||
uint32_t index_sparseness_;
|
||||
uint32_t index_size_;
|
||||
size_t sub_index_size_;
|
||||
|
||||
const SliceTransform* prefix_extractor_;
|
||||
double hash_table_ratio_;
|
||||
double huge_page_tlb_size_;
|
||||
|
||||
std::string prev_key_prefix_;
|
||||
|
||||
static const size_t kRecordsPerGroup = 256;
|
||||
};
|
||||
|
||||
}; // namespace rocksdb
|
@ -3,6 +3,7 @@
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "table/plain_table_reader.h"
|
||||
|
||||
#include <string>
|
||||
@ -18,6 +19,7 @@
|
||||
#include "rocksdb/statistics.h"
|
||||
|
||||
#include "table/block.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/meta_blocks.h"
|
||||
@ -39,15 +41,6 @@ namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
inline uint32_t GetSliceHash(const Slice& s) {
|
||||
return Hash(s.data(), s.size(), 397) ;
|
||||
}
|
||||
|
||||
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
|
||||
assert(num_buckets >= 0);
|
||||
return hash % num_buckets;
|
||||
}
|
||||
|
||||
// Safely getting a uint32_t element from a char array, where, starting from
|
||||
// `base`, every 4 bytes are considered as an fixed 32 bit integer.
|
||||
inline uint32_t GetFixed32Element(const char* base, size_t offset) {
|
||||
@ -103,6 +96,7 @@ PlainTableReader::PlainTableReader(const Options& options,
|
||||
const TableProperties* table_properties)
|
||||
: internal_comparator_(icomparator),
|
||||
encoding_type_(encoding_type),
|
||||
full_scan_mode_(false),
|
||||
data_end_offset_(table_properties->data_size),
|
||||
user_key_len_(table_properties->fixed_key_len),
|
||||
prefix_extractor_(options.prefix_extractor.get()),
|
||||
@ -126,8 +120,7 @@ Status PlainTableReader::Open(const Options& options,
|
||||
double hash_table_ratio, size_t index_sparseness,
|
||||
size_t huge_page_tlb_size, bool full_scan_mode) {
|
||||
assert(options.allow_mmap_reads);
|
||||
|
||||
if (file_size > kMaxFileSize) {
|
||||
if (file_size > PlainTableIndex::kMaxFileSize) {
|
||||
return Status::NotSupported("File is too large for PlainTableReader!");
|
||||
}
|
||||
|
||||
@ -173,7 +166,6 @@ Status PlainTableReader::Open(const Options& options,
|
||||
return s;
|
||||
}
|
||||
|
||||
// -- Populate Index
|
||||
if (!full_scan_mode) {
|
||||
s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
|
||||
index_sparseness, huge_page_tlb_size);
|
||||
@ -183,7 +175,7 @@ Status PlainTableReader::Open(const Options& options,
|
||||
} else {
|
||||
// Flag to indicate it is a full scan mode so that none of the indexes
|
||||
// can be used.
|
||||
new_reader->index_size_ = kFullScanModeFlag;
|
||||
new_reader->full_scan_mode_ = true;
|
||||
}
|
||||
|
||||
*table_reader = std::move(new_reader);
|
||||
@ -203,79 +195,15 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
|
||||
}
|
||||
}
|
||||
|
||||
struct PlainTableReader::IndexRecord {
|
||||
uint32_t hash; // hash of the prefix
|
||||
uint32_t offset; // offset of a row
|
||||
IndexRecord* next;
|
||||
};
|
||||
|
||||
// Helper class to track all the index records
|
||||
class PlainTableReader::IndexRecordList {
|
||||
public:
|
||||
explicit IndexRecordList(size_t num_records_per_group)
|
||||
: kNumRecordsPerGroup(num_records_per_group),
|
||||
current_group_(nullptr),
|
||||
num_records_in_current_group_(num_records_per_group) {}
|
||||
|
||||
~IndexRecordList() {
|
||||
for (size_t i = 0; i < groups_.size(); i++) {
|
||||
delete[] groups_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void AddRecord(murmur_t hash, uint32_t offset) {
|
||||
if (num_records_in_current_group_ == kNumRecordsPerGroup) {
|
||||
current_group_ = AllocateNewGroup();
|
||||
num_records_in_current_group_ = 0;
|
||||
}
|
||||
auto& new_record = current_group_[num_records_in_current_group_++];
|
||||
new_record.hash = hash;
|
||||
new_record.offset = offset;
|
||||
new_record.next = nullptr;
|
||||
}
|
||||
|
||||
size_t GetNumRecords() const {
|
||||
return (groups_.size() - 1) * kNumRecordsPerGroup +
|
||||
num_records_in_current_group_;
|
||||
}
|
||||
IndexRecord* At(size_t index) {
|
||||
return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
|
||||
}
|
||||
|
||||
private:
|
||||
IndexRecord* AllocateNewGroup() {
|
||||
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
|
||||
groups_.push_back(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Each group in `groups_` contains fix-sized records (determined by
|
||||
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
|
||||
// occurs.
|
||||
const size_t kNumRecordsPerGroup;
|
||||
IndexRecord* current_group_;
|
||||
// List of arrays allocated
|
||||
std::vector<IndexRecord*> groups_;
|
||||
size_t num_records_in_current_group_;
|
||||
};
|
||||
|
||||
Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
int* num_prefixes,
|
||||
int bloom_bits_per_key,
|
||||
size_t index_sparseness) {
|
||||
Status PlainTableReader::PopulateIndexRecordList(
|
||||
PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
|
||||
Slice prev_key_prefix_slice;
|
||||
uint32_t prev_key_prefix_hash = 0;
|
||||
uint32_t pos = data_start_offset_;
|
||||
int num_keys_per_prefix = 0;
|
||||
bool is_first_record = true;
|
||||
HistogramImpl keys_per_prefix_hist;
|
||||
// Need map to be ordered to make sure sub indexes generated
|
||||
// are in order.
|
||||
|
||||
*num_prefixes = 0;
|
||||
bool is_first_record = true;
|
||||
Slice key_prefix_slice;
|
||||
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
|
||||
options_.prefix_extractor.get());
|
||||
bool due_index = false;
|
||||
while (pos < data_end_offset_) {
|
||||
uint32_t key_offset = pos;
|
||||
ParsedInternalKey key;
|
||||
@ -285,152 +213,53 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
key_prefix_slice = GetPrefix(key);
|
||||
if (enable_bloom_) {
|
||||
// total order mode and bloom filter is enabled.
|
||||
bloom_.AddHash(GetSliceHash(key.user_key));
|
||||
}
|
||||
Slice key_prefix_slice = GetPrefix(key);
|
||||
|
||||
} else {
|
||||
if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
|
||||
++(*num_prefixes);
|
||||
if (!is_first_record) {
|
||||
keys_per_prefix_hist.Add(num_keys_per_prefix);
|
||||
prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
|
||||
}
|
||||
num_keys_per_prefix = 0;
|
||||
prev_key_prefix_slice = key_prefix_slice;
|
||||
prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
|
||||
due_index = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (due_index) {
|
||||
if (!seekable) {
|
||||
index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
|
||||
|
||||
if (!seekable && is_first_record) {
|
||||
return Status::Corruption("Key for a prefix is not seekable");
|
||||
}
|
||||
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
|
||||
record_list->AddRecord(prev_key_prefix_hash, key_offset);
|
||||
due_index = false;
|
||||
}
|
||||
|
||||
num_keys_per_prefix++;
|
||||
if (index_sparseness == 0 || num_keys_per_prefix % index_sparseness == 0) {
|
||||
due_index = true;
|
||||
}
|
||||
is_first_record = false;
|
||||
}
|
||||
|
||||
keys_per_prefix_hist.Add(num_keys_per_prefix);
|
||||
Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
|
||||
keys_per_prefix_hist.ToString().c_str());
|
||||
|
||||
prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
|
||||
index_.InitFromRawData(index_builder->Finish());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes,
|
||||
int bloom_bits_per_key,
|
||||
double hash_table_ratio,
|
||||
size_t huge_page_tlb_size) {
|
||||
if (prefix_extractor_ != nullptr) {
|
||||
void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
|
||||
int num_prefixes,
|
||||
size_t huge_page_tlb_size,
|
||||
vector<uint32_t>* prefix_hashes) {
|
||||
if (!IsTotalOrderMode()) {
|
||||
uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
|
||||
if (bloom_total_bits > 0) {
|
||||
enable_bloom_ = true;
|
||||
bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality,
|
||||
huge_page_tlb_size, options_.info_log.get());
|
||||
FillBloom(prefix_hashes);
|
||||
}
|
||||
}
|
||||
|
||||
if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) {
|
||||
// Fall back to pure binary search if the user fails to specify a prefix
|
||||
// extractor.
|
||||
index_size_ = 1;
|
||||
} else {
|
||||
double hash_table_size_multipier = 1.0 / hash_table_ratio;
|
||||
index_size_ = num_prefixes * hash_table_size_multipier + 1;
|
||||
}
|
||||
}
|
||||
|
||||
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
|
||||
IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket) {
|
||||
bool first = true;
|
||||
uint32_t prev_hash = 0;
|
||||
size_t num_records = record_list->GetNumRecords();
|
||||
for (size_t i = 0; i < num_records; i++) {
|
||||
IndexRecord* index_record = record_list->At(i);
|
||||
uint32_t cur_hash = index_record->hash;
|
||||
if (first || prev_hash != cur_hash) {
|
||||
prev_hash = cur_hash;
|
||||
first = false;
|
||||
if (enable_bloom_ && !IsTotalOrderMode()) {
|
||||
bloom_.AddHash(cur_hash);
|
||||
void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
|
||||
assert(bloom_.IsInitialized());
|
||||
for (auto prefix_hash : *prefix_hashes) {
|
||||
bloom_.AddHash(prefix_hash);
|
||||
}
|
||||
}
|
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
|
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
|
||||
index_record->next = prev_bucket_head;
|
||||
(*hash_to_offsets)[bucket] = index_record;
|
||||
(*entries_per_bucket)[bucket]++;
|
||||
}
|
||||
size_t sub_index_size = 0;
|
||||
for (auto entry_count : *entries_per_bucket) {
|
||||
if (entry_count <= 1) {
|
||||
continue;
|
||||
}
|
||||
// Only buckets with more than 1 entry will have subindex.
|
||||
sub_index_size += VarintLength(entry_count);
|
||||
// total bytes needed to store these entries' in-file offsets.
|
||||
sub_index_size += entry_count * kOffsetLen;
|
||||
}
|
||||
return sub_index_size;
|
||||
}
|
||||
|
||||
void PlainTableReader::FillIndexes(
|
||||
const size_t kSubIndexSize,
|
||||
const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket,
|
||||
size_t huge_page_tlb_size) {
|
||||
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
||||
kSubIndexSize);
|
||||
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
|
||||
char* allocated = arena_.AllocateAligned(
|
||||
total_allocate_size, huge_page_tlb_size, options_.info_log.get());
|
||||
index_ = reinterpret_cast<uint32_t*>(allocated);
|
||||
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
|
||||
|
||||
size_t sub_index_offset = 0;
|
||||
for (int i = 0; i < index_size_; i++) {
|
||||
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
||||
switch (num_keys_for_bucket) {
|
||||
case 0:
|
||||
// No key for bucket
|
||||
index_[i] = data_end_offset_;
|
||||
break;
|
||||
case 1:
|
||||
// point directly to the file offset
|
||||
index_[i] = hash_to_offsets[i]->offset;
|
||||
break;
|
||||
default:
|
||||
// point to second level indexes.
|
||||
index_[i] = sub_index_offset | kSubIndexMask;
|
||||
char* prev_ptr = &sub_index_[sub_index_offset];
|
||||
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
|
||||
sub_index_offset += (cur_ptr - prev_ptr);
|
||||
char* sub_index_pos = &sub_index_[sub_index_offset];
|
||||
IndexRecord* record = hash_to_offsets[i];
|
||||
int j;
|
||||
for (j = num_keys_for_bucket - 1; j >= 0 && record;
|
||||
j--, record = record->next) {
|
||||
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
|
||||
}
|
||||
assert(j == -1 && record == nullptr);
|
||||
sub_index_offset += kOffsetLen * num_keys_for_bucket;
|
||||
assert(sub_index_offset <= kSubIndexSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(sub_index_offset == kSubIndexSize);
|
||||
|
||||
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
|
||||
index_size_, kSubIndexSize);
|
||||
}
|
||||
|
||||
Status PlainTableReader::MmapDataFile() {
|
||||
@ -445,21 +274,50 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
size_t huge_page_tlb_size) {
|
||||
assert(props != nullptr);
|
||||
table_properties_.reset(props);
|
||||
// options.prefix_extractor is requried for a hash-based look-up.
|
||||
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
|
||||
|
||||
BlockContents bloom_block_contents;
|
||||
auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
|
||||
options_.env, BloomBlockBuilder::kBloomBlock,
|
||||
&bloom_block_contents);
|
||||
bool index_in_file = s.ok();
|
||||
|
||||
BlockContents index_block_contents;
|
||||
s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
|
||||
options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
|
||||
&index_block_contents);
|
||||
|
||||
index_in_file &= s.ok();
|
||||
|
||||
Slice* bloom_block;
|
||||
if (index_in_file) {
|
||||
bloom_block = &bloom_block_contents.data;
|
||||
} else {
|
||||
bloom_block = nullptr;
|
||||
}
|
||||
|
||||
// index_in_file == true only if there are kBloomBlock and
|
||||
// kPlainTableIndexBlock
|
||||
// in file
|
||||
|
||||
Slice* index_block;
|
||||
if (index_in_file) {
|
||||
index_block = &index_block_contents.data;
|
||||
} else {
|
||||
index_block = nullptr;
|
||||
}
|
||||
|
||||
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
|
||||
// options.prefix_extractor is requried for a hash-based look-up.
|
||||
return Status::NotSupported(
|
||||
"PlainTable requires a prefix extractor enable prefix hash mode.");
|
||||
}
|
||||
|
||||
IndexRecordList record_list(kRecordsPerGroup);
|
||||
// First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
|
||||
// for a prefix (starting from the first one), generate a record of (hash,
|
||||
// offset) and append it to IndexRecordList, which is a data structure created
|
||||
// to store them.
|
||||
int num_prefixes;
|
||||
|
||||
if (!index_in_file) {
|
||||
// Allocate bloom filter here for total order mode.
|
||||
if (IsTotalOrderMode()) {
|
||||
uint32_t num_bloom_bits =
|
||||
@ -470,34 +328,57 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
huge_page_tlb_size, options_.info_log.get());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
enable_bloom_ = true;
|
||||
auto num_blocks_property = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kNumBloomBlocks);
|
||||
|
||||
Status s = PopulateIndexRecordList(&record_list, &num_prefixes,
|
||||
bloom_bits_per_key, index_sparseness);
|
||||
uint32_t num_blocks = 0;
|
||||
if (num_blocks_property != props->user_collected_properties.end()) {
|
||||
Slice temp_slice(num_blocks_property->second);
|
||||
if (!GetVarint32(&temp_slice, &num_blocks)) {
|
||||
num_blocks = 0;
|
||||
}
|
||||
}
|
||||
// cast away const qualifier, because bloom_ won't be changed
|
||||
bloom_.SetRawData(
|
||||
const_cast<unsigned char*>(
|
||||
reinterpret_cast<const unsigned char*>(bloom_block->data())),
|
||||
bloom_block->size() * 8, num_blocks);
|
||||
}
|
||||
|
||||
PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness,
|
||||
hash_table_ratio, huge_page_tlb_size);
|
||||
|
||||
std::vector<uint32_t> prefix_hashes;
|
||||
if (!index_in_file) {
|
||||
Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
// Calculated hash table and bloom filter size and allocate memory for indexes
|
||||
// and bloom filter based on the number of prefixes.
|
||||
AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio,
|
||||
huge_page_tlb_size);
|
||||
} else {
|
||||
index_.InitFromRawData(*index_block);
|
||||
}
|
||||
|
||||
// Bucketize all the index records to a temp data structure, in which for
|
||||
// each bucket, we generate a linked list of IndexRecord, in reversed order.
|
||||
std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
|
||||
std::vector<uint32_t> entries_per_bucket(index_size_, 0);
|
||||
size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
|
||||
&record_list, &hash_to_offsets, &entries_per_bucket);
|
||||
// From the temp data structure, populate indexes.
|
||||
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
|
||||
huge_page_tlb_size);
|
||||
if (!index_in_file) {
|
||||
// Calculated bloom filter size and allocate memory for
|
||||
// bloom filter based on the number of prefixes, then fill it.
|
||||
AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
|
||||
huge_page_tlb_size, &prefix_hashes);
|
||||
}
|
||||
|
||||
// Fill two table properties.
|
||||
// TODO(sdong): after we have the feature of storing index in file, this
|
||||
// properties need to be populated to index_size instead.
|
||||
if (!index_in_file) {
|
||||
props->user_collected_properties["plain_table_hash_table_size"] =
|
||||
std::to_string(index_size_ * 4U);
|
||||
std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
|
||||
props->user_collected_properties["plain_table_sub_index_size"] =
|
||||
std::to_string(sub_index_size_needed);
|
||||
std::to_string(index_.GetSubIndexSize());
|
||||
} else {
|
||||
props->user_collected_properties["plain_table_hash_table_size"] =
|
||||
std::to_string(0);
|
||||
props->user_collected_properties["plain_table_sub_index_size"] =
|
||||
std::to_string(0);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
@ -506,24 +387,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
|
||||
uint32_t prefix_hash, bool& prefix_matched,
|
||||
uint32_t* offset) const {
|
||||
prefix_matched = false;
|
||||
int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
|
||||
uint32_t bucket_value = index_[bucket];
|
||||
if (bucket_value == data_end_offset_) {
|
||||
uint32_t prefix_index_offset;
|
||||
auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
|
||||
if (res == PlainTableIndex::kNoPrefixForBucket) {
|
||||
*offset = data_end_offset_;
|
||||
return Status::OK();
|
||||
} else if ((bucket_value & kSubIndexMask) == 0) {
|
||||
// point directly to the file
|
||||
*offset = bucket_value;
|
||||
} else if (res == PlainTableIndex::kDirectToFile) {
|
||||
*offset = prefix_index_offset;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// point to sub-index, need to do a binary search
|
||||
uint32_t upper_bound;
|
||||
const char* base_ptr =
|
||||
index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
|
||||
uint32_t low = 0;
|
||||
uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
|
||||
|
||||
const char* index_ptr = &sub_index_[prefix_index_offset];
|
||||
uint32_t upper_bound = 0;
|
||||
const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
|
||||
uint32_t high = upper_bound;
|
||||
ParsedInternalKey mid_key;
|
||||
ParsedInternalKey parsed_target;
|
||||
@ -593,9 +471,6 @@ bool PlainTableReader::MatchBloom(uint32_t hash) const {
|
||||
return !enable_bloom_ || bloom_.MayContainHash(hash);
|
||||
}
|
||||
|
||||
Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
|
||||
ParsedInternalKey* parsed_key,
|
||||
@ -650,8 +525,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
|
||||
Slice prefix_slice;
|
||||
uint32_t prefix_hash;
|
||||
if (IsTotalOrderMode()) {
|
||||
if (index_size_ == kFullScanModeFlag) {
|
||||
// Full Scan Mode
|
||||
if (full_scan_mode_) {
|
||||
status_ =
|
||||
Status::InvalidArgument("Get() is not allowed in full scan mode.");
|
||||
}
|
||||
@ -682,7 +556,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
|
||||
if (!ParseInternalKey(target, &parsed_target)) {
|
||||
return Status::Corruption(Slice());
|
||||
}
|
||||
|
||||
Slice found_value;
|
||||
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
|
||||
options_.prefix_extractor.get());
|
||||
@ -747,13 +620,12 @@ void PlainTableIterator::Seek(const Slice& target) {
|
||||
// If the user doesn't set prefix seek option and we are not able to do a
|
||||
// total Seek(). assert failure.
|
||||
if (!use_prefix_seek_) {
|
||||
if (table_->index_size_ == PlainTableReader::kFullScanModeFlag) {
|
||||
// Full Scan Mode.
|
||||
if (table_->full_scan_mode_) {
|
||||
status_ =
|
||||
Status::InvalidArgument("Seek() is not allowed in full scan mode.");
|
||||
offset_ = next_offset_ = table_->data_end_offset_;
|
||||
return;
|
||||
} else if (table_->index_size_ > 1) {
|
||||
} else if (table_->GetIndexSize() > 1) {
|
||||
assert(false);
|
||||
status_ = Status::NotSupported(
|
||||
"PlainTable cannot issue non-prefix seek unless in total order "
|
||||
|
@ -19,12 +19,14 @@
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/table_reader.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/plain_table_index.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Block;
|
||||
class BlockContents;
|
||||
class BlockHandle;
|
||||
class Footer;
|
||||
struct Options;
|
||||
@ -37,6 +39,7 @@ class PlainTableKeyDecoder;
|
||||
|
||||
using std::unique_ptr;
|
||||
using std::unordered_map;
|
||||
using std::vector;
|
||||
extern const uint32_t kPlainTableVariableLength;
|
||||
|
||||
// Based on following output file format shown in plain_table_factory.h
|
||||
@ -68,6 +71,7 @@ class PlainTableReader: public TableReader {
|
||||
|
||||
uint64_t ApproximateOffsetOf(const Slice& key);
|
||||
|
||||
uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
|
||||
void SetupForCompaction();
|
||||
|
||||
std::shared_ptr<const TableProperties> GetTableProperties() const {
|
||||
@ -93,65 +97,23 @@ class PlainTableReader: public TableReader {
|
||||
// props: the table properties object that need to be stored. Ownership of
|
||||
// the object will be passed.
|
||||
//
|
||||
// index_ contains buckets size of index_size_, each is a
|
||||
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
||||
// and the first bit of the integer indicates type of the offset.
|
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
|
||||
// hashing this prefix), whose first row starts from this offset of the
|
||||
// file.
|
||||
// 1 indicates that the bucket contains more than one prefixes, or there
|
||||
// are too many rows for one prefix so we need a binary search for it. In
|
||||
// this case, the offset indicates the offset of sub_index_ holding the
|
||||
// binary search indexes of keys for those rows. Those binary search indexes
|
||||
// are organized in this way:
|
||||
//
|
||||
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
||||
// it, there are N 32-bit integers, each points of an offset of the file,
|
||||
// which
|
||||
// points to starting of a row. Those offsets need to be guaranteed to be in
|
||||
// ascending order so the keys they are pointing to are also in ascending
|
||||
// order
|
||||
// to make sure we can use them to do binary searches. Below is visual
|
||||
// presentation of a bucket.
|
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
|
||||
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
|
||||
double hash_table_ratio, size_t index_sparseness,
|
||||
size_t huge_page_tlb_size);
|
||||
|
||||
Status MmapDataFile();
|
||||
|
||||
private:
|
||||
struct IndexRecord;
|
||||
class IndexRecordList;
|
||||
|
||||
// Plain table maintains an index and a sub index.
|
||||
// index is implemented by a hash table.
|
||||
// subindex is a big of memory array.
|
||||
// For more details about the in-memory index, please refer to:
|
||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||
// #wiki-in-memory-index-format
|
||||
uint32_t* index_;
|
||||
int index_size_ = 0;
|
||||
char* sub_index_;
|
||||
const InternalKeyComparator internal_comparator_;
|
||||
EncodingType encoding_type_;
|
||||
// represents plain table's current status.
|
||||
Status status_;
|
||||
Slice file_data_;
|
||||
|
||||
PlainTableIndex index_;
|
||||
bool full_scan_mode_;
|
||||
|
||||
// data_start_offset_ and data_end_offset_ defines the range of the
|
||||
// sst file that stores data.
|
||||
const uint32_t data_start_offset_ = 0;
|
||||
@ -160,11 +122,6 @@ class PlainTableReader: public TableReader {
|
||||
const SliceTransform* prefix_extractor_;
|
||||
|
||||
static const size_t kNumInternalBytes = 8;
|
||||
static const uint32_t kSubIndexMask = 0x80000000;
|
||||
static const size_t kOffsetLen = sizeof(uint32_t);
|
||||
static const uint64_t kMaxFileSize = 1u << 31;
|
||||
static const size_t kRecordsPerGroup = 256;
|
||||
static const int kFullScanModeFlag = -1;
|
||||
|
||||
// Bloom filter is used to rule out non-existent key
|
||||
bool enable_bloom_;
|
||||
@ -184,6 +141,31 @@ class PlainTableReader: public TableReader {
|
||||
return user_key_len_ + kNumInternalBytes;
|
||||
}
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
Slice GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set.
|
||||
// In that case,
|
||||
// it falls back to pure binary search and
|
||||
// total iterator seek is supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
friend class TableCache;
|
||||
friend class PlainTableIterator;
|
||||
|
||||
@ -191,33 +173,15 @@ class PlainTableReader: public TableReader {
|
||||
// the rows, which contains index records as a list.
|
||||
// If bloom_ is not null, all the keys' full-key hash will be added to the
|
||||
// bloom filter.
|
||||
Status PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
int* num_prefixes, int bloom_bits_per_key,
|
||||
size_t index_sparseness);
|
||||
Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
|
||||
vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Internal helper function to allocate memory for indexes and bloom filters
|
||||
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
|
||||
double hash_table_ratio,
|
||||
size_t huge_page_tlb_size);
|
||||
// Internal helper function to allocate memory for bloom filter and fill it
|
||||
void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
|
||||
size_t huge_page_tlb_size,
|
||||
vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Internal helper function to bucket index record list to hash buckets.
|
||||
// bucket_header is a vector of size hash_table_size_, with each entry
|
||||
// containing a linklist of IndexRecord hashed to the same bucket, in reverse
|
||||
// order.
|
||||
// of offsets for the hash, in reversed order.
|
||||
// entries_per_bucket is sized of index_size_. The value is how many index
|
||||
// records are there in bucket_headers for the same bucket.
|
||||
size_t BucketizeIndexesAndFillBloom(
|
||||
IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
|
||||
std::vector<uint32_t>* entries_per_bucket);
|
||||
|
||||
// Internal helper class to fill the indexes and bloom filters to internal
|
||||
// data structures. bucket_headers and entries_per_bucket are bucketized
|
||||
// indexes and counts generated by BucketizeIndexesAndFillBloom().
|
||||
void FillIndexes(const size_t kSubIndexSize,
|
||||
const std::vector<IndexRecord*>& bucket_headers,
|
||||
const std::vector<uint32_t>& entries_per_bucket,
|
||||
size_t huge_page_tlb_size);
|
||||
void FillBloom(vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Read the key and value at `offset` to parameters for keys, the and
|
||||
// `seekable`.
|
||||
@ -237,28 +201,6 @@ class PlainTableReader: public TableReader {
|
||||
uint32_t prefix_hash, bool& prefix_matched,
|
||||
uint32_t* offset) const;
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
inline Slice GetPrefix(const ParsedInternalKey& target) const;
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set. In that case,
|
||||
// it falls back to pure binary search and total iterator seek is
|
||||
// supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
||||
|
||||
// No copying allowed
|
||||
|
@ -48,6 +48,13 @@ DynamicBloom::DynamicBloom(uint32_t num_probes,
|
||||
kNumProbes(num_probes),
|
||||
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
|
||||
|
||||
void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
|
||||
uint32_t num_blocks) {
|
||||
data_ = raw_data;
|
||||
kTotalBits = total_bits;
|
||||
kNumBlocks = num_blocks;
|
||||
}
|
||||
|
||||
void DynamicBloom::SetTotalBits(Arena* arena,
|
||||
uint32_t total_bits, uint32_t locality,
|
||||
size_t huge_page_tlb_size,
|
||||
|
@ -5,6 +5,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
#include <util/arena.h>
|
||||
#include <port/port_posix.h>
|
||||
|
||||
@ -57,6 +61,19 @@ class DynamicBloom {
|
||||
|
||||
void Prefetch(uint32_t h);
|
||||
|
||||
uint32_t GetNumBlocks() const { return kNumBlocks; }
|
||||
|
||||
Slice GetRawData() const {
|
||||
return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
|
||||
}
|
||||
|
||||
void SetRawData(unsigned char* raw_data, uint32_t total_bits,
|
||||
uint32_t num_blocks = 0);
|
||||
|
||||
uint32_t GetTotalBits() const { return kTotalBits; }
|
||||
|
||||
bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
|
||||
|
||||
private:
|
||||
uint32_t kTotalBits;
|
||||
uint32_t kNumBlocks;
|
||||
@ -81,7 +98,7 @@ inline void DynamicBloom::Prefetch(uint32_t h) {
|
||||
}
|
||||
|
||||
inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
assert(kNumBlocks > 0 || kTotalBits > 0);
|
||||
assert(IsInitialized());
|
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
if (kNumBlocks != 0) {
|
||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||
@ -98,10 +115,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
h += delta;
|
||||
}
|
||||
} else {
|
||||
if (kTotalBits == 0) {
|
||||
// Not initialized.
|
||||
return true;
|
||||
}
|
||||
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
||||
const uint32_t bitpos = h % kTotalBits;
|
||||
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
|
||||
@ -114,7 +127,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
}
|
||||
|
||||
inline void DynamicBloom::AddHash(uint32_t h) {
|
||||
assert(kNumBlocks > 0 || kTotalBits > 0);
|
||||
assert(IsInitialized());
|
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
if (kNumBlocks != 0) {
|
||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||
|
@ -17,4 +17,7 @@ namespace rocksdb {
|
||||
|
||||
extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
|
||||
|
||||
inline uint32_t GetSliceHash(const Slice& s) {
|
||||
return Hash(s.data(), s.size(), 397);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user