Clean PlainTableReader's variables for better data locality

Summary:
Clean PlainTableReader's data structures:
(1) inline bloom_ (in order to do this, change DynamicBloom to allow lazy initialization)
(2) remove from the class some variables that are only used during initialization
(3) move variables not used in the normal read code paths to the end of the class, and reference prefix_extractor directly
(4) make Options a reference.

Test Plan: make all check

Reviewers: haobo, ljin

Reviewed By: ljin

Subscribers: igor, yhchiang, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D18891
This commit is contained in:
sdong 2014-06-09 12:30:19 -07:00
parent f43c8262c2
commit 80f409ea37
5 changed files with 130 additions and 99 deletions

View File

@ -184,10 +184,11 @@ class TestPlainTableReader : public PlainTableReader {
unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match)
: PlainTableReader(options, std::move(file), storage_options, icomparator,
file_size, bloom_bits_per_key, hash_table_ratio,
index_sparseness, table_properties, 2 * 1024 * 1024),
file_size, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) {
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
Status s = PopulateIndex(const_cast<TableProperties*>(table_properties),
bloom_bits_per_key, hash_table_ratio,
index_sparseness, 2 * 1024 * 1024);
ASSERT_TRUE(s.ok());
}

View File

@ -92,26 +92,22 @@ class PlainTableIterator : public Iterator {
};
extern const uint64_t kPlainTableMagicNumber;
PlainTableReader::PlainTableReader(
const Options& options, unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness, const TableProperties* table_properties,
size_t huge_page_tlb_size)
: options_(options),
soptions_(storage_options),
file_(std::move(file)),
internal_comparator_(icomparator),
file_size_(file_size),
kHashTableRatio(hash_table_ratio),
kBloomBitsPerKey(bloom_bits_per_key),
kIndexIntervalForSamePrefixKeys(index_sparseness),
table_properties_(nullptr),
PlainTableReader::PlainTableReader(const Options& options,
unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options,
const InternalKeyComparator& icomparator,
uint64_t file_size,
const TableProperties* table_properties)
: internal_comparator_(icomparator),
data_end_offset_(table_properties->data_size),
user_key_len_(table_properties->fixed_key_len),
huge_page_tlb_size_(huge_page_tlb_size) {
assert(kHashTableRatio >= 0.0);
}
prefix_extractor_(options.prefix_extractor.get()),
enable_bloom_(false),
bloom_(6, nullptr),
options_(options),
file_(std::move(file)),
file_size_(file_size),
table_properties_(nullptr) {}
PlainTableReader::~PlainTableReader() {
}
@ -138,13 +134,14 @@ Status PlainTableReader::Open(const Options& options,
return s;
}
std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
options, std::move(file), soptions, internal_comparator, file_size,
bloom_bits_per_key, hash_table_ratio, index_sparseness, props,
huge_page_tlb_size));
assert(hash_table_ratio >= 0.0);
std::unique_ptr<PlainTableReader> new_reader(
new PlainTableReader(options, std::move(file), soptions,
internal_comparator, file_size, props));
// -- Populate Index
s = new_reader->PopulateIndex(props);
s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
index_sparseness, huge_page_tlb_size);
if (!s.ok()) {
return s;
}
@ -224,7 +221,9 @@ class PlainTableReader::IndexRecordList {
};
Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
int* num_prefixes) const {
int* num_prefixes,
int bloom_bits_per_key,
size_t index_sparseness) {
Slice prev_key_prefix_slice;
uint32_t prev_key_prefix_hash = 0;
uint32_t pos = data_start_offset_;
@ -243,9 +242,9 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
if (!s.ok()) {
return s;
}
if (bloom_) {
if (enable_bloom_) {
// total order mode and bloom filter is enabled.
bloom_->AddHash(GetSliceHash(key.user_key));
bloom_.AddHash(GetSliceHash(key.user_key));
}
Slice key_prefix_slice = GetPrefix(key);
@ -259,8 +258,8 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
}
if (kIndexIntervalForSamePrefixKeys == 0 ||
num_keys_per_prefix++ % kIndexIntervalForSamePrefixKeys == 0) {
if (index_sparseness == 0 ||
num_keys_per_prefix++ % index_sparseness == 0) {
// Add an index key for every index_sparseness keys of the same prefix
record_list->AddRecord(prev_key_prefix_hash, key_offset);
}
@ -274,22 +273,25 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
return Status::OK();
}
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes) {
if (options_.prefix_extractor.get() != nullptr) {
uint32_t bloom_total_bits = num_prefixes * kBloomBitsPerKey;
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes,
int bloom_bits_per_key,
double hash_table_ratio,
size_t huge_page_tlb_size) {
if (prefix_extractor_ != nullptr) {
uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
if (bloom_total_bits > 0) {
bloom_.reset(new DynamicBloom(bloom_total_bits, options_.bloom_locality,
6, nullptr, huge_page_tlb_size_,
options_.info_log.get()));
enable_bloom_ = true;
bloom_.SetTotalBits(bloom_total_bits, options_.bloom_locality,
huge_page_tlb_size, options_.info_log.get());
}
}
if (options_.prefix_extractor.get() == nullptr || kHashTableRatio <= 0) {
if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) {
// Fall back to pure binary search if the user fails to specify a prefix
// extractor or the hash table ratio is not positive.
index_size_ = 1;
} else {
double hash_table_size_multipier = 1.0 / kHashTableRatio;
double hash_table_size_multipier = 1.0 / hash_table_ratio;
index_size_ = num_prefixes * hash_table_size_multipier + 1;
}
}
@ -306,8 +308,8 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom(
if (first || prev_hash != cur_hash) {
prev_hash = cur_hash;
first = false;
if (bloom_ && !IsTotalOrderMode()) {
bloom_->AddHash(cur_hash);
if (enable_bloom_ && !IsTotalOrderMode()) {
bloom_.AddHash(cur_hash);
}
}
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
@ -332,12 +334,13 @@ size_t PlainTableReader::BucketizeIndexesAndFillBloom(
void PlainTableReader::FillIndexes(
const size_t kSubIndexSize,
const std::vector<IndexRecord*>& hash_to_offsets,
const std::vector<uint32_t>& entries_per_bucket) {
const std::vector<uint32_t>& entries_per_bucket,
size_t huge_page_tlb_size) {
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
kSubIndexSize);
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
char* allocated = arena_.AllocateAligned(
total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
total_allocate_size, huge_page_tlb_size, options_.info_log.get());
index_ = reinterpret_cast<uint32_t*>(allocated);
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
@ -378,12 +381,16 @@ void PlainTableReader::FillIndexes(
index_size_, kSubIndexSize);
}
Status PlainTableReader::PopulateIndex(TableProperties* props) {
Status PlainTableReader::PopulateIndex(TableProperties* props,
int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness,
size_t huge_page_tlb_size) {
assert(props != nullptr);
table_properties_.reset(props);
// options.prefix_extractor is required for a hash-based look-up.
if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) {
if (options_.prefix_extractor.get() == nullptr && hash_table_ratio != 0) {
return Status::NotSupported(
"PlainTable requires a prefix extractor enable prefix hash mode.");
}
@ -403,21 +410,24 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
// Allocate bloom filter here for total order mode.
if (IsTotalOrderMode()) {
uint32_t num_bloom_bits = table_properties_->num_entries * kBloomBitsPerKey;
uint32_t num_bloom_bits =
table_properties_->num_entries * bloom_bits_per_key;
if (num_bloom_bits > 0) {
bloom_.reset(new DynamicBloom(num_bloom_bits, options_.bloom_locality, 6,
nullptr, huge_page_tlb_size_,
options_.info_log.get()));
enable_bloom_ = true;
bloom_.SetTotalBits(num_bloom_bits, options_.bloom_locality,
huge_page_tlb_size, options_.info_log.get());
}
}
s = PopulateIndexRecordList(&record_list, &num_prefixes);
s = PopulateIndexRecordList(&record_list, &num_prefixes, bloom_bits_per_key,
index_sparseness);
if (!s.ok()) {
return s;
}
// Calculated hash table and bloom filter size and allocate memory for indexes
// and bloom filter based on the number of prefixes.
AllocateIndexAndBloom(num_prefixes);
AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio,
huge_page_tlb_size);
// Bucketize all the index records to a temp data structure, in which for
// each bucket, we generate a linked list of IndexRecord, in reversed order.
@ -426,7 +436,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props) {
size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
&record_list, &hash_to_offsets, &entries_per_bucket);
// From the temp data structure, populate indexes.
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket);
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
huge_page_tlb_size);
// Fill two table properties.
// TODO(sdong): after we have the feature of storing index in file, this
@ -515,7 +526,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
}
bool PlainTableReader::MatchBloom(uint32_t hash) const {
return bloom_.get() == nullptr || bloom_->MayContainHash(hash);
return !enable_bloom_ || bloom_.MayContainHash(hash);
}
Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {

View File

@ -20,6 +20,7 @@
#include "table/table_reader.h"
#include "table/plain_table_factory.h"
#include "util/arena.h"
#include "util/dynamic_bloom.h"
namespace rocksdb {
@ -31,7 +32,6 @@ class RandomAccessFile;
struct ReadOptions;
class TableCache;
class TableReader;
class DynamicBloom;
class InternalKeyComparator;
using std::unique_ptr;
@ -73,10 +73,7 @@ class PlainTableReader: public TableReader {
PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
const EnvOptions& storage_options,
const InternalKeyComparator& internal_comparator,
uint64_t file_size, int bloom_num_bits,
double hash_table_ratio, size_t index_sparseness,
const TableProperties* table_properties,
size_t huge_page_tlb_size);
uint64_t file_size, const TableProperties* table_properties);
virtual ~PlainTableReader();
protected:
@ -126,7 +123,9 @@ class PlainTableReader: public TableReader {
// ....
// record N file offset: fixedint32
// <end>
Status PopulateIndex(TableProperties* props);
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness,
size_t huge_page_tlb_size);
private:
struct IndexRecord;
@ -141,35 +140,17 @@ class PlainTableReader: public TableReader {
uint32_t* index_;
int index_size_ = 0;
char* sub_index_;
Options options_;
const EnvOptions& soptions_;
unique_ptr<RandomAccessFile> file_;
const InternalKeyComparator internal_comparator_;
// represents plain table's current status.
Status status_;
Slice file_data_;
uint32_t file_size_;
const double kHashTableRatio;
const int kBloomBitsPerKey;
// To speed up the search for keys with same prefix, we'll add index key for
// every N keys, where the "N" is determined by
// kIndexIntervalForSamePrefixKeys
const size_t kIndexIntervalForSamePrefixKeys = 16;
// Bloom filter is used to rule out non-existent key
unique_ptr<DynamicBloom> bloom_;
Arena arena_;
std::shared_ptr<const TableProperties> table_properties_;
// data_start_offset_ and data_end_offset_ defines the range of the
// sst file that stores data.
const uint32_t data_start_offset_ = 0;
const uint32_t data_end_offset_;
const size_t user_key_len_;
const size_t huge_page_tlb_size_;
const SliceTransform* prefix_extractor_;
static const size_t kNumInternalBytes = 8;
static const uint32_t kSubIndexMask = 0x80000000;
@ -177,6 +158,16 @@ class PlainTableReader: public TableReader {
static const uint64_t kMaxFileSize = 1u << 31;
static const size_t kRecordsPerGroup = 256;
// Bloom filter is used to rule out non-existent key
bool enable_bloom_;
DynamicBloom bloom_;
Arena arena_;
const Options& options_;
unique_ptr<RandomAccessFile> file_;
uint32_t file_size_;
std::shared_ptr<const TableProperties> table_properties_;
bool IsFixedLength() const {
return user_key_len_ != kPlainTableVariableLength;
}
@ -193,10 +184,13 @@ class PlainTableReader: public TableReader {
// If the bloom filter is enabled (enable_bloom_ is true), every key's
// full-key hash will be added to the bloom filter.
Status PopulateIndexRecordList(IndexRecordList* record_list,
int* num_prefixes) const;
int* num_prefixes, int bloom_bits_per_key,
size_t index_sparseness);
// Internal helper function to allocate memory for indexes and bloom filters
void AllocateIndexAndBloom(int num_prefixes);
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
double hash_table_ratio,
size_t huge_page_tlb_size);
// Internal helper function to bucket index record list to hash buckets.
// bucket_header is a vector of size hash_table_size_, with each entry
@ -214,7 +208,8 @@ class PlainTableReader: public TableReader {
// indexes and counts generated by BucketizeIndexesAndFillBloom().
void FillIndexes(const size_t kSubIndexSize,
const std::vector<IndexRecord*>& bucket_headers,
const std::vector<uint32_t>& entries_per_bucket);
const std::vector<uint32_t>& entries_per_bucket,
size_t huge_page_tlb_size);
// Read a plain table key from the position `start`. The read content
// will be written to `key` and the size of read bytes will be populated
@ -244,7 +239,7 @@ class PlainTableReader: public TableReader {
Slice GetPrefixFromUserKey(const Slice& user_key) const {
if (!IsTotalOrderMode()) {
return options_.prefix_extractor->Transform(user_key);
return prefix_extractor_->Transform(user_key);
} else {
// Use empty slice as prefix if prefix_extractor is not set. In that case,
// it falls back to pure binary search and total iterator seek is
@ -253,9 +248,7 @@ class PlainTableReader: public TableReader {
}
}
bool IsTotalOrderMode() const {
return (options_.prefix_extractor.get() == nullptr);
}
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
// No copying allowed
explicit PlainTableReader(const TableReader&) = delete;

View File

@ -18,15 +18,17 @@ static uint32_t BloomHash(const Slice& key) {
return Hash(key.data(), key.size(), 0xbc9f1d34);
}
uint32_t GetNumBlocks(uint32_t total_bits) {
uint32_t num_blocks = (total_bits + CACHE_LINE_SIZE * 8 - 1) /
(CACHE_LINE_SIZE * 8) * (CACHE_LINE_SIZE * 8);
uint32_t GetTotalBitsForLocality(uint32_t total_bits) {
uint32_t num_blocks =
(total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8);
// Make num_blocks an odd number to make sure more bits are involved
// when determining which block.
if (num_blocks % 2 == 0) {
num_blocks++;
}
return num_blocks;
return num_blocks * (CACHE_LINE_SIZE * 8);
}
}
@ -34,11 +36,23 @@ DynamicBloom::DynamicBloom(uint32_t total_bits, uint32_t locality,
uint32_t num_probes,
uint32_t (*hash_func)(const Slice& key),
size_t huge_page_tlb_size, Logger* logger)
: kTotalBits(((locality > 0) ? GetNumBlocks(total_bits) : total_bits + 7) /
8 * 8),
kNumBlocks((locality > 0) ? kTotalBits / (CACHE_LINE_SIZE * 8) : 0),
: DynamicBloom(num_probes, hash_func) {
SetTotalBits(total_bits, locality, huge_page_tlb_size, logger);
}
DynamicBloom::DynamicBloom(uint32_t num_probes,
uint32_t (*hash_func)(const Slice& key))
: kTotalBits(0),
kNumBlocks(0),
kNumProbes(num_probes),
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
void DynamicBloom::SetTotalBits(uint32_t total_bits, uint32_t locality,
size_t huge_page_tlb_size, Logger* logger) {
kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits)
: (total_bits + 7) / 8 * 8;
kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0;
assert(kNumBlocks > 0 || kTotalBits > 0);
assert(kNumProbes > 0);

View File

@ -33,6 +33,12 @@ class DynamicBloom {
size_t huge_page_tlb_size = 0,
Logger* logger = nullptr);
explicit DynamicBloom(uint32_t num_probes = 6,
uint32_t (*hash_func)(const Slice& key) = nullptr);
void SetTotalBits(uint32_t total_bits, uint32_t locality,
size_t huge_page_tlb_size, Logger* logger);
~DynamicBloom() {}
// Assuming single threaded access to this function.
@ -42,14 +48,14 @@ class DynamicBloom {
void AddHash(uint32_t hash);
// Multithreaded access to this function is OK
bool MayContain(const Slice& key);
bool MayContain(const Slice& key) const;
// Multithreaded access to this function is OK
bool MayContainHash(uint32_t hash);
bool MayContainHash(uint32_t hash) const;
private:
const uint32_t kTotalBits;
const uint32_t kNumBlocks;
uint32_t kTotalBits;
uint32_t kNumBlocks;
const uint32_t kNumProbes;
uint32_t (*hash_func_)(const Slice& key);
@ -61,11 +67,12 @@ class DynamicBloom {
inline void DynamicBloom::Add(const Slice& key) { AddHash(hash_func_(key)); }
inline bool DynamicBloom::MayContain(const Slice& key) {
inline bool DynamicBloom::MayContain(const Slice& key) const {
return (MayContainHash(hash_func_(key)));
}
inline bool DynamicBloom::MayContainHash(uint32_t h) {
inline bool DynamicBloom::MayContainHash(uint32_t h) const {
assert(kNumBlocks > 0 || kTotalBits > 0);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (kNumBlocks != 0) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
@ -82,6 +89,10 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) {
h += delta;
}
} else {
if (kTotalBits == 0) {
// Not initialized.
return true;
}
for (uint32_t i = 0; i < kNumProbes; ++i) {
const uint32_t bitpos = h % kTotalBits;
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
@ -94,6 +105,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) {
}
inline void DynamicBloom::AddHash(uint32_t h) {
assert(kNumBlocks > 0 || kTotalBits > 0);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (kNumBlocks != 0) {
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);