Introduce Read amplification bitmap (read amp statistics)
Summary: Add the BlockBasedTableOptions::read_amp_bytes_per_bit option, which allows us to create a bitmap for every data block we read; the bitmap will contain (block_size / read_amp_bytes_per_bit) bits. We will use this bitmap to mark which bytes of the block have been used so we can calculate the read amplification. Test Plan: added new tests. Reviewers: andrewkr, yhchiang, sdong Reviewed By: sdong Subscribers: yiwu, leveldb, march, andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D58707
This commit is contained in:
parent
c7004840d2
commit
b49b92cf28
161
db/db_test2.cc
161
db/db_test2.cc
@ -1859,6 +1859,167 @@ TEST_F(DBTest2, MaxSuccessiveMergesInRecovery) {
|
|||||||
options.max_successive_merges = 3;
|
options.max_successive_merges = 3;
|
||||||
Reopen(options);
|
Reopen(options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the number of bytes a single key/value entry takes up when it is
// encoded as three varint32 header fields (0, key_size, value_size) followed
// by the raw key and value bytes.
// NOTE(review): the leading 0 is presumably the shared-prefix length, which
// stays zero because the tests using this helper disable delta encoding.
size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
  const uint32_t header_fields[] = {static_cast<uint32_t>(0),
                                    static_cast<uint32_t>(key_size),
                                    static_cast<uint32_t>(value_size)};

  // Encode the header only to learn how many bytes the varints occupy.
  std::string header;
  for (uint32_t field : header_fields) {
    PutVarint32(&header, field);
  }

  const size_t payload_size = key_size + value_size;
  return header.size() + payload_size;
}
|
||||||
|
|
||||||
|
// Checks the read-amplification tickers in two phases:
//  1. random point lookups, where the ratio reported by the tickers must stay
//     within 2% of the ratio computed from the entries actually read;
//  2. a full scan, after which every loaded byte must be counted as useful.
TEST_F(DBTest2, ReadAmpBitmap) {
  Options options = CurrentOptions();
  BlockBasedTableOptions bbto;
  // Disable delta encoding to make it easier to calculate read amplification
  bbto.use_delta_encoding = false;
  // Huge block cache to make it easier to calculate read amplification
  bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
  bbto.read_amp_bytes_per_bit = 16;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  options.statistics = rocksdb::CreateDBStatistics();
  DestroyAndReopen(options);

  const size_t kNumEntries = 10000;

  Random rnd(301);
  for (size_t i = 0; i < kNumEntries; i++) {
    ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
  }
  ASSERT_OK(Flush());

  Close();
  Reopen(options);

  // Read keys/values randomly and verify that the reported read amp error
  // is less than 2%
  uint64_t total_useful_bytes = 0;
  std::set<int> read_keys;
  std::string value;
  for (size_t i = 0; i < kNumEntries * 5; i++) {
    // The modulo result is a size_t below kNumEntries, so the explicit
    // narrowing to int is lossless.
    const int key_idx = static_cast<int>(rnd.Next() % kNumEntries);
    std::string k = Key(key_idx);
    ASSERT_OK(db_->Get(ReadOptions(), k, &value));

    // Count the entry's bytes only the first time the key is read.
    if (read_keys.insert(key_idx).second) {
      auto ik = InternalKey(k, 0, ValueType::kTypeValue);
      total_useful_bytes += GetEncodedEntrySize(ik.size(), value.size());
    }

    const double total_read_bytes = static_cast<double>(
        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));

    // Both values are useful/loaded ratios: the first from the entries this
    // test knows it has read, the second from the bitmap's estimate.
    const double expected_read_amp =
        static_cast<double>(total_useful_bytes) / total_read_bytes;
    const double read_amp =
        static_cast<double>(options.statistics->getTickerCount(
            READ_AMP_ESTIMATE_USEFUL_BYTES)) /
        total_read_bytes;

    double error_pct = fabs(expected_read_amp - read_amp) * 100;
    // Error between reported read amp and real read amp should be less than 2%
    EXPECT_LE(error_pct, 2);
  }

  // Make sure we read everything in the DB (which is smaller than our cache)
  Iterator* iter = db_->NewIterator(ReadOptions());
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
  }
  delete iter;

  // Read amp is 100% since we read everything that we loaded into memory
  ASSERT_EQ(options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
            options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
}
|
||||||
|
|
||||||
|
// Verifies that blocks which stay in a shared block cache across a DB
// close/reopen keep feeding the read-amp tickers after the DB has switched to
// a different Statistics object.
TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
  if (dbname_.find("dev/shm") != std::string::npos) {
    // /dev/shm doesn't support getting a unique file id, which means that
    // running this test on /dev/shm will fail because lru_cache will load
    // the blocks again regardless of them being already in the cache
    return;
  }

  std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);

  Options options = CurrentOptions();
  BlockBasedTableOptions bbto;
  // Disable delta encoding to make it easier to calculate read amplification
  bbto.use_delta_encoding = false;
  // Huge block cache to make it easier to calculate read amplification
  bbto.block_cache = lru_cache;
  bbto.read_amp_bytes_per_bit = 16;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  // Do not keep a second local handle to this Statistics object: the test
  // needs to be able to drop its reference after the first Close() below.
  options.statistics = rocksdb::CreateDBStatistics();
  DestroyAndReopen(options);

  const int kNumEntries = 10000;

  Random rnd(301);
  for (int i = 0; i < kNumEntries; i++) {
    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
  }
  ASSERT_OK(Flush());

  Close();
  Reopen(options);

  std::string value;
  // Iter1: Read half the DB, Read even keys
  // Key(0), Key(2), Key(4), Key(6), Key(8), ...
  for (int i = 0; i < kNumEntries; i += 2) {
    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
  }

  const uint64_t total_useful_bytes_iter1 =
      options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
  const uint64_t total_loaded_bytes_iter1 =
      options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

  Close();
  // Create the replacement before releasing the old object so that the two
  // Statistics objects are guaranteed to have different addresses.
  std::shared_ptr<Statistics> new_statistics = rocksdb::CreateDBStatistics();
  // Drop this test's reference to the old statistics object that the blocks
  // in lru_cache are still pointing to.
  // NOTE(review): the test fixture may keep its own copy of the previous
  // options until the Reopen() below - confirm against DBTestBase.
  options.statistics.reset();
  // Use the statistics object that we just created
  options.statistics = new_statistics;
  Reopen(options);

  // Iter2: Read half the DB, Read odd keys
  // Key(1), Key(3), Key(5), Key(7), Key(9), ...
  for (int i = 1; i < kNumEntries; i += 2) {
    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
  }

  const uint64_t total_useful_bytes_iter2 =
      options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
  const uint64_t total_loaded_bytes_iter2 =
      options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

  // We reached read_amp of 100% because we read all the keys in the DB
  ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
            total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
@ -197,6 +197,14 @@ enum Tickers : uint32_t {
|
|||||||
ROW_CACHE_HIT,
|
ROW_CACHE_HIT,
|
||||||
ROW_CACHE_MISS,
|
ROW_CACHE_MISS,
|
||||||
|
|
||||||
|
// Read amplification statistics.
|
||||||
|
// Read amplification can be calculated using this formula
|
||||||
|
// (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
|
||||||
|
//
|
||||||
|
// REQUIRES: BlockBasedTableOptions::read_amp_bytes_per_bit to be non-zero
|
||||||
|
READ_AMP_ESTIMATE_USEFUL_BYTES, // Estimate of total bytes actually used.
|
||||||
|
READ_AMP_TOTAL_READ_BYTES, // Total size of loaded data blocks.
|
||||||
|
|
||||||
TICKER_ENUM_MAX
|
TICKER_ENUM_MAX
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -291,6 +299,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
|||||||
{FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
|
{FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
|
||||||
{ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
|
{ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
|
||||||
{ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
|
{ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
|
||||||
|
{READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"},
|
||||||
|
{READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"},
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -173,6 +173,29 @@ struct BlockBasedTableOptions {
|
|||||||
// algorithms.
|
// algorithms.
|
||||||
bool verify_compression = false;
|
bool verify_compression = false;
|
||||||
|
|
||||||
|
// If non-zero, for every data block we load into memory we will create a bitmap
|
||||||
|
// of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap
|
||||||
|
// will be used to figure out the percentage we actually read of the blocks.
|
||||||
|
//
|
||||||
|
// When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and
|
||||||
|
// Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the
|
||||||
|
// read amplification using this formula
|
||||||
|
// (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES)
|
||||||
|
//
|
||||||
|
// value => memory usage (percentage of loaded blocks memory)
|
||||||
|
// 1 => 12.50 %
|
||||||
|
// 2 => 06.25 %
|
||||||
|
// 4 => 03.12 %
|
||||||
|
// 8 => 01.56 %
|
||||||
|
// 16 => 00.78 %
|
||||||
|
//
|
||||||
|
// Note: This number must be a power of 2, if not it will be sanitized
|
||||||
|
// down to the nearest lower power of 2, for example a value of 7 will be
|
||||||
|
// treated as 4, a value of 19 will be treated as 16.
|
||||||
|
//
|
||||||
|
// Default: 0 (disabled)
|
||||||
|
uint32_t read_amp_bytes_per_bit = 0;
|
||||||
|
|
||||||
// We currently have three versions:
|
// We currently have three versions:
|
||||||
// 0 -- This version is currently written out by all RocksDB's versions by
|
// 0 -- This version is currently written out by all RocksDB's versions by
|
||||||
// default. Can be read by really old RocksDB's. Doesn't support changing
|
// default. Can be read by really old RocksDB's. Doesn't support changing
|
||||||
|
@ -16,9 +16,11 @@
|
|||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "port/port.h"
|
||||||
|
#include "port/stack_trace.h"
|
||||||
#include "rocksdb/comparator.h"
|
#include "rocksdb/comparator.h"
|
||||||
#include "table/format.h"
|
|
||||||
#include "table/block_prefix_index.h"
|
#include "table/block_prefix_index.h"
|
||||||
|
#include "table/format.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/logging.h"
|
#include "util/logging.h"
|
||||||
#include "util/perf_context_imp.h"
|
#include "util/perf_context_imp.h"
|
||||||
@ -344,7 +346,8 @@ uint32_t Block::NumRestarts() const {
|
|||||||
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
|
return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
Block::Block(BlockContents&& contents)
|
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
|
||||||
|
Statistics* statistics)
|
||||||
: contents_(std::move(contents)),
|
: contents_(std::move(contents)),
|
||||||
data_(contents_.data.data()),
|
data_(contents_.data.data()),
|
||||||
size_(contents_.data.size()) {
|
size_(contents_.data.size()) {
|
||||||
@ -359,10 +362,14 @@ Block::Block(BlockContents&& contents)
|
|||||||
size_ = 0;
|
size_ = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
|
||||||
|
read_amp_bitmap_.reset(new BlockReadAmpBitmap(
|
||||||
|
restart_offset_, read_amp_bytes_per_bit, statistics));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
|
InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
|
||||||
bool total_order_seek) {
|
bool total_order_seek, Statistics* stats) {
|
||||||
if (size_ < 2*sizeof(uint32_t)) {
|
if (size_ < 2*sizeof(uint32_t)) {
|
||||||
if (iter != nullptr) {
|
if (iter != nullptr) {
|
||||||
iter->SetStatus(Status::Corruption("bad block contents"));
|
iter->SetStatus(Status::Corruption("bad block contents"));
|
||||||
@ -385,10 +392,17 @@ InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
|
|||||||
|
|
||||||
if (iter != nullptr) {
|
if (iter != nullptr) {
|
||||||
iter->Initialize(cmp, data_, restart_offset_, num_restarts,
|
iter->Initialize(cmp, data_, restart_offset_, num_restarts,
|
||||||
prefix_index_ptr);
|
prefix_index_ptr, read_amp_bitmap_.get());
|
||||||
} else {
|
} else {
|
||||||
iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
|
iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
|
||||||
prefix_index_ptr);
|
prefix_index_ptr, read_amp_bitmap_.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (read_amp_bitmap_) {
|
||||||
|
if (read_amp_bitmap_->GetStatistics() != stats) {
|
||||||
|
// DB changed the Statistics pointer, we need to notify read_amp_bitmap_
|
||||||
|
read_amp_bitmap_->SetStatistics(stats);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
144
table/block.h
144
table/block.h
@ -20,6 +20,7 @@
|
|||||||
#include "db/pinned_iterators_manager.h"
|
#include "db/pinned_iterators_manager.h"
|
||||||
#include "rocksdb/iterator.h"
|
#include "rocksdb/iterator.h"
|
||||||
#include "rocksdb/options.h"
|
#include "rocksdb/options.h"
|
||||||
|
#include "rocksdb/statistics.h"
|
||||||
#include "table/block_prefix_index.h"
|
#include "table/block_prefix_index.h"
|
||||||
#include "table/internal_iterator.h"
|
#include "table/internal_iterator.h"
|
||||||
|
|
||||||
@ -32,10 +33,119 @@ class Comparator;
|
|||||||
class BlockIter;
|
class BlockIter;
|
||||||
class BlockPrefixIndex;
|
class BlockPrefixIndex;
|
||||||
|
|
||||||
|
// BlockReadAmpBitmap is a bitmap that map the rocksdb::Block data bytes to
|
||||||
|
// a bitmap with ratio bytes_per_bit. Whenever we access a range of bytes in
|
||||||
|
// the Block we update the bitmap and increment READ_AMP_ESTIMATE_USEFUL_BYTES.
|
||||||
|
class BlockReadAmpBitmap {
|
||||||
|
public:
|
||||||
|
explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
|
||||||
|
Statistics* statistics)
|
||||||
|
: bitmap_(nullptr), bytes_per_bit_pow_(0), statistics_(statistics) {
|
||||||
|
assert(block_size > 0 && bytes_per_bit > 0);
|
||||||
|
|
||||||
|
// convert bytes_per_bit to be a power of 2
|
||||||
|
while (bytes_per_bit >>= 1) {
|
||||||
|
bytes_per_bit_pow_++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// num_bits_needed = ceil(block_size / bytes_per_bit)
|
||||||
|
size_t num_bits_needed = (block_size >> bytes_per_bit_pow_) +
|
||||||
|
(block_size % (1 << bytes_per_bit_pow_) != 0);
|
||||||
|
|
||||||
|
// bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
|
||||||
|
size_t bitmap_size = (num_bits_needed / kBitsPerEntry) +
|
||||||
|
(num_bits_needed % kBitsPerEntry != 0);
|
||||||
|
|
||||||
|
// Create bitmap and set all the bits to 0
|
||||||
|
bitmap_ = new std::atomic<uint32_t>[bitmap_size];
|
||||||
|
memset(bitmap_, 0, bitmap_size * kBytesPersEntry);
|
||||||
|
|
||||||
|
RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES,
|
||||||
|
num_bits_needed << bytes_per_bit_pow_);
|
||||||
|
}
|
||||||
|
|
||||||
|
~BlockReadAmpBitmap() { delete[] bitmap_; }
|
||||||
|
|
||||||
|
void Mark(uint32_t start_offset, uint32_t end_offset) {
|
||||||
|
assert(end_offset >= start_offset);
|
||||||
|
|
||||||
|
// Every new bit we set will bump this counter
|
||||||
|
uint32_t new_useful_bytes = 0;
|
||||||
|
// Index of first bit in mask (start_offset / bytes_per_bit)
|
||||||
|
uint32_t start_bit = start_offset >> bytes_per_bit_pow_;
|
||||||
|
// Index of last bit in mask (end_offset / bytes_per_bit)
|
||||||
|
uint32_t end_bit = end_offset >> bytes_per_bit_pow_;
|
||||||
|
// Index of middle bit (unique to this range)
|
||||||
|
uint32_t mid_bit = start_bit + 1;
|
||||||
|
|
||||||
|
// It's guaranteed that ranges sent to Mark() wont overlap, this mean that
|
||||||
|
// we dont need to set the middle bits, we can simply set only one bit of
|
||||||
|
// the middle bits, and check this bit if we want to know if the whole
|
||||||
|
// range is set or not.
|
||||||
|
if (mid_bit < end_bit) {
|
||||||
|
if (GetAndSet(mid_bit) == 0) {
|
||||||
|
new_useful_bytes += (end_bit - mid_bit) << bytes_per_bit_pow_;
|
||||||
|
} else {
|
||||||
|
// If the middle bit is set, it's guaranteed that start and end bits
|
||||||
|
// are also set
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// This range dont have a middle bit, the whole range fall in 1 or 2 bits
|
||||||
|
}
|
||||||
|
|
||||||
|
if (GetAndSet(start_bit) == 0) {
|
||||||
|
new_useful_bytes += (1 << bytes_per_bit_pow_);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (GetAndSet(end_bit) == 0) {
|
||||||
|
new_useful_bytes += (1 << bytes_per_bit_pow_);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (new_useful_bytes > 0) {
|
||||||
|
RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
|
||||||
|
new_useful_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Statistics* GetStatistics() {
|
||||||
|
return statistics_.load(std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
|
||||||
|
void SetStatistics(Statistics* stats) { statistics_.store(stats); }
|
||||||
|
|
||||||
|
uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Get the current value of bit at `bit_idx` and set it to 1
|
||||||
|
inline bool GetAndSet(uint32_t bit_idx) {
|
||||||
|
const uint32_t byte_idx = bit_idx / kBitsPerEntry;
|
||||||
|
const uint32_t bit_mask = 1 << (bit_idx % kBitsPerEntry);
|
||||||
|
|
||||||
|
return bitmap_[byte_idx].fetch_or(bit_mask, std::memory_order_relaxed) &
|
||||||
|
bit_mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint32_t kBytesPersEntry = sizeof(uint32_t); // 4 bytes
|
||||||
|
const uint32_t kBitsPerEntry = kBytesPersEntry * 8; // 32 bits
|
||||||
|
|
||||||
|
// Bitmap used to record the bytes that we read, use atomic to protect
|
||||||
|
// against multiple threads updating the same bit
|
||||||
|
std::atomic<uint32_t>* bitmap_;
|
||||||
|
// (1 << bytes_per_bit_pow_) is bytes_per_bit. Use power of 2 to optimize
|
||||||
|
// muliplication and division
|
||||||
|
uint8_t bytes_per_bit_pow_;
|
||||||
|
// Pointer to DB Statistics object, Since this bitmap may outlive the DB
|
||||||
|
// this pointer maybe invalid, but the DB will update it to a valid pointer
|
||||||
|
// by using SetStatistics() before calling Mark()
|
||||||
|
std::atomic<Statistics*> statistics_;
|
||||||
|
};
|
||||||
|
|
||||||
class Block {
|
class Block {
|
||||||
public:
|
public:
|
||||||
// Initialize the block with the specified contents.
|
// Initialize the block with the specified contents.
|
||||||
explicit Block(BlockContents&& contents);
|
explicit Block(BlockContents&& contents, size_t read_amp_bytes_per_bit = 0,
|
||||||
|
Statistics* statistics = nullptr);
|
||||||
|
|
||||||
~Block() = default;
|
~Block() = default;
|
||||||
|
|
||||||
@ -70,7 +180,8 @@ class Block {
|
|||||||
// and prefix_index_ are null, so this option does not matter.
|
// and prefix_index_ are null, so this option does not matter.
|
||||||
InternalIterator* NewIterator(const Comparator* comparator,
|
InternalIterator* NewIterator(const Comparator* comparator,
|
||||||
BlockIter* iter = nullptr,
|
BlockIter* iter = nullptr,
|
||||||
bool total_order_seek = true);
|
bool total_order_seek = true,
|
||||||
|
Statistics* stats = nullptr);
|
||||||
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
|
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
|
||||||
|
|
||||||
// Report an approximation of how much memory has been used.
|
// Report an approximation of how much memory has been used.
|
||||||
@ -82,6 +193,7 @@ class Block {
|
|||||||
size_t size_; // contents_.data.size()
|
size_t size_; // contents_.data.size()
|
||||||
uint32_t restart_offset_; // Offset in data_ of restart array
|
uint32_t restart_offset_; // Offset in data_ of restart array
|
||||||
std::unique_ptr<BlockPrefixIndex> prefix_index_;
|
std::unique_ptr<BlockPrefixIndex> prefix_index_;
|
||||||
|
std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
|
||||||
|
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Block(const Block&);
|
Block(const Block&);
|
||||||
@ -99,17 +211,22 @@ class BlockIter : public InternalIterator {
|
|||||||
restart_index_(0),
|
restart_index_(0),
|
||||||
status_(Status::OK()),
|
status_(Status::OK()),
|
||||||
prefix_index_(nullptr),
|
prefix_index_(nullptr),
|
||||||
key_pinned_(false) {}
|
key_pinned_(false),
|
||||||
|
read_amp_bitmap_(nullptr),
|
||||||
|
last_bitmap_offset_(0) {}
|
||||||
|
|
||||||
BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
|
BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
|
||||||
uint32_t num_restarts, BlockPrefixIndex* prefix_index)
|
uint32_t num_restarts, BlockPrefixIndex* prefix_index,
|
||||||
|
BlockReadAmpBitmap* read_amp_bitmap)
|
||||||
: BlockIter() {
|
: BlockIter() {
|
||||||
Initialize(comparator, data, restarts, num_restarts, prefix_index);
|
Initialize(comparator, data, restarts, num_restarts, prefix_index,
|
||||||
|
read_amp_bitmap);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Initialize(const Comparator* comparator, const char* data,
|
void Initialize(const Comparator* comparator, const char* data,
|
||||||
uint32_t restarts, uint32_t num_restarts,
|
uint32_t restarts, uint32_t num_restarts,
|
||||||
BlockPrefixIndex* prefix_index) {
|
BlockPrefixIndex* prefix_index,
|
||||||
|
BlockReadAmpBitmap* read_amp_bitmap) {
|
||||||
assert(data_ == nullptr); // Ensure it is called only once
|
assert(data_ == nullptr); // Ensure it is called only once
|
||||||
assert(num_restarts > 0); // Ensure the param is valid
|
assert(num_restarts > 0); // Ensure the param is valid
|
||||||
|
|
||||||
@ -120,6 +237,8 @@ class BlockIter : public InternalIterator {
|
|||||||
current_ = restarts_;
|
current_ = restarts_;
|
||||||
restart_index_ = num_restarts_;
|
restart_index_ = num_restarts_;
|
||||||
prefix_index_ = prefix_index;
|
prefix_index_ = prefix_index;
|
||||||
|
read_amp_bitmap_ = read_amp_bitmap;
|
||||||
|
last_bitmap_offset_ = current_ + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SetStatus(Status s) {
|
void SetStatus(Status s) {
|
||||||
@ -134,6 +253,12 @@ class BlockIter : public InternalIterator {
|
|||||||
}
|
}
|
||||||
virtual Slice value() const override {
|
virtual Slice value() const override {
|
||||||
assert(Valid());
|
assert(Valid());
|
||||||
|
if (read_amp_bitmap_ && current_ < restarts_ &&
|
||||||
|
current_ != last_bitmap_offset_) {
|
||||||
|
read_amp_bitmap_->Mark(current_ /* current entry offset */,
|
||||||
|
NextEntryOffset() - 1);
|
||||||
|
last_bitmap_offset_ = current_;
|
||||||
|
}
|
||||||
return value_;
|
return value_;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -164,6 +289,8 @@ class BlockIter : public InternalIterator {
|
|||||||
|
|
||||||
virtual bool IsValuePinned() const override { return true; }
|
virtual bool IsValuePinned() const override { return true; }
|
||||||
|
|
||||||
|
size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const Comparator* comparator_;
|
const Comparator* comparator_;
|
||||||
const char* data_; // underlying block contents
|
const char* data_; // underlying block contents
|
||||||
@ -179,6 +306,11 @@ class BlockIter : public InternalIterator {
|
|||||||
BlockPrefixIndex* prefix_index_;
|
BlockPrefixIndex* prefix_index_;
|
||||||
bool key_pinned_;
|
bool key_pinned_;
|
||||||
|
|
||||||
|
// read-amp bitmap
|
||||||
|
BlockReadAmpBitmap* read_amp_bitmap_;
|
||||||
|
// last `current_` value we reported to the read-amp bitmap
|
||||||
|
mutable uint32_t last_bitmap_offset_;
|
||||||
|
|
||||||
struct CachedPrevEntry {
|
struct CachedPrevEntry {
|
||||||
explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
|
explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr,
|
||||||
size_t _key_offset, size_t _key_size, Slice _value)
|
size_t _key_offset, size_t _key_size, Slice _value)
|
||||||
|
@ -66,14 +66,16 @@ namespace {
|
|||||||
Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer,
|
Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer,
|
||||||
const ReadOptions& options, const BlockHandle& handle,
|
const ReadOptions& options, const BlockHandle& handle,
|
||||||
std::unique_ptr<Block>* result,
|
std::unique_ptr<Block>* result,
|
||||||
const ImmutableCFOptions &ioptions,
|
const ImmutableCFOptions& ioptions, bool do_uncompress,
|
||||||
bool do_uncompress, const Slice& compression_dict,
|
const Slice& compression_dict,
|
||||||
const PersistentCacheOptions& cache_options) {
|
const PersistentCacheOptions& cache_options,
|
||||||
|
size_t read_amp_bytes_per_bit) {
|
||||||
BlockContents contents;
|
BlockContents contents;
|
||||||
Status s = ReadBlockContents(file, footer, options, handle, &contents, ioptions,
|
Status s = ReadBlockContents(file, footer, options, handle, &contents, ioptions,
|
||||||
do_uncompress, compression_dict, cache_options);
|
do_uncompress, compression_dict, cache_options);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
result->reset(new Block(std::move(contents)));
|
result->reset(new Block(std::move(contents), read_amp_bytes_per_bit,
|
||||||
|
ioptions.statistics));
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
@ -188,7 +190,8 @@ class BinarySearchIndexReader : public IndexReader {
|
|||||||
std::unique_ptr<Block> index_block;
|
std::unique_ptr<Block> index_block;
|
||||||
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
|
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
|
||||||
&index_block, ioptions, true /* decompress */,
|
&index_block, ioptions, true /* decompress */,
|
||||||
Slice() /*compression dict*/, cache_options);
|
Slice() /*compression dict*/, cache_options,
|
||||||
|
0 /* read_amp_bytes_per_bit */);
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
*index_reader = new BinarySearchIndexReader(
|
*index_reader = new BinarySearchIndexReader(
|
||||||
@ -227,17 +230,20 @@ class BinarySearchIndexReader : public IndexReader {
|
|||||||
// key.
|
// key.
|
||||||
class HashIndexReader : public IndexReader {
|
class HashIndexReader : public IndexReader {
|
||||||
public:
|
public:
|
||||||
static Status Create(
|
static Status Create(const SliceTransform* hash_key_extractor,
|
||||||
const SliceTransform* hash_key_extractor, const Footer& footer,
|
const Footer& footer, RandomAccessFileReader* file,
|
||||||
RandomAccessFileReader* file, const ImmutableCFOptions &ioptions,
|
const ImmutableCFOptions& ioptions,
|
||||||
const Comparator* comparator, const BlockHandle& index_handle,
|
const Comparator* comparator,
|
||||||
InternalIterator* meta_index_iter, IndexReader** index_reader,
|
const BlockHandle& index_handle,
|
||||||
bool hash_index_allow_collision,
|
InternalIterator* meta_index_iter,
|
||||||
const PersistentCacheOptions& cache_options) {
|
IndexReader** index_reader,
|
||||||
|
bool hash_index_allow_collision,
|
||||||
|
const PersistentCacheOptions& cache_options) {
|
||||||
std::unique_ptr<Block> index_block;
|
std::unique_ptr<Block> index_block;
|
||||||
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
|
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
|
||||||
&index_block, ioptions, true /* decompress */,
|
&index_block, ioptions, true /* decompress */,
|
||||||
Slice() /*compression dict*/, cache_options);
|
Slice() /*compression dict*/, cache_options,
|
||||||
|
0 /* read_amp_bytes_per_bit */);
|
||||||
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
@ -791,7 +797,7 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
|
|||||||
rep->file.get(), rep->footer, ReadOptions(),
|
rep->file.get(), rep->footer, ReadOptions(),
|
||||||
rep->footer.metaindex_handle(), &meta, rep->ioptions,
|
rep->footer.metaindex_handle(), &meta, rep->ioptions,
|
||||||
true /* decompress */, Slice() /*compression dict*/,
|
true /* decompress */, Slice() /*compression dict*/,
|
||||||
rep->persistent_cache_options);
|
rep->persistent_cache_options, 0 /* read_amp_bytes_per_bit */);
|
||||||
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
|
Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
|
||||||
@ -809,9 +815,9 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
|
|||||||
Status BlockBasedTable::GetDataBlockFromCache(
|
Status BlockBasedTable::GetDataBlockFromCache(
|
||||||
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
Cache* block_cache, Cache* block_cache_compressed,
|
Cache* block_cache, Cache* block_cache_compressed,
|
||||||
const ImmutableCFOptions &ioptions, const ReadOptions& read_options,
|
const ImmutableCFOptions& ioptions, const ReadOptions& read_options,
|
||||||
BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
|
BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
|
||||||
const Slice& compression_dict) {
|
const Slice& compression_dict, size_t read_amp_bytes_per_bit) {
|
||||||
Status s;
|
Status s;
|
||||||
Block* compressed_block = nullptr;
|
Block* compressed_block = nullptr;
|
||||||
Cache::Handle* block_cache_compressed_handle = nullptr;
|
Cache::Handle* block_cache_compressed_handle = nullptr;
|
||||||
@ -861,7 +867,8 @@ Status BlockBasedTable::GetDataBlockFromCache(
|
|||||||
|
|
||||||
// Insert uncompressed block into block cache
|
// Insert uncompressed block into block cache
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
block->value = new Block(std::move(contents)); // uncompressed block
|
block->value = new Block(std::move(contents), read_amp_bytes_per_bit,
|
||||||
|
statistics); // uncompressed block
|
||||||
assert(block->value->compression_type() == kNoCompression);
|
assert(block->value->compression_type() == kNoCompression);
|
||||||
if (block_cache != nullptr && block->value->cachable() &&
|
if (block_cache != nullptr && block->value->cachable() &&
|
||||||
read_options.fill_cache) {
|
read_options.fill_cache) {
|
||||||
@ -886,9 +893,9 @@ Status BlockBasedTable::GetDataBlockFromCache(
|
|||||||
Status BlockBasedTable::PutDataBlockToCache(
|
Status BlockBasedTable::PutDataBlockToCache(
|
||||||
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
Cache* block_cache, Cache* block_cache_compressed,
|
Cache* block_cache, Cache* block_cache_compressed,
|
||||||
const ReadOptions& read_options, const ImmutableCFOptions &ioptions,
|
const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
|
||||||
CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
|
CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
|
||||||
const Slice& compression_dict) {
|
const Slice& compression_dict, size_t read_amp_bytes_per_bit) {
|
||||||
assert(raw_block->compression_type() == kNoCompression ||
|
assert(raw_block->compression_type() == kNoCompression ||
|
||||||
block_cache_compressed != nullptr);
|
block_cache_compressed != nullptr);
|
||||||
|
|
||||||
@ -906,7 +913,8 @@ Status BlockBasedTable::PutDataBlockToCache(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (raw_block->compression_type() != kNoCompression) {
|
if (raw_block->compression_type() != kNoCompression) {
|
||||||
block->value = new Block(std::move(contents)); // uncompressed block
|
block->value = new Block(std::move(contents), read_amp_bytes_per_bit,
|
||||||
|
statistics); // compressed block
|
||||||
} else {
|
} else {
|
||||||
block->value = raw_block;
|
block->value = raw_block;
|
||||||
raw_block = nullptr;
|
raw_block = nullptr;
|
||||||
@ -1206,8 +1214,9 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
|
|||||||
}
|
}
|
||||||
|
|
||||||
s = GetDataBlockFromCache(
|
s = GetDataBlockFromCache(
|
||||||
key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro, &block,
|
key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro,
|
||||||
rep->table_options.format_version, compression_dict);
|
&block, rep->table_options.format_version, compression_dict,
|
||||||
|
rep->table_options.read_amp_bytes_per_bit);
|
||||||
|
|
||||||
if (block.value == nullptr && !no_io && ro.fill_cache) {
|
if (block.value == nullptr && !no_io && ro.fill_cache) {
|
||||||
std::unique_ptr<Block> raw_block;
|
std::unique_ptr<Block> raw_block;
|
||||||
@ -1216,14 +1225,15 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
|
|||||||
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
|
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
|
||||||
&raw_block, rep->ioptions,
|
&raw_block, rep->ioptions,
|
||||||
block_cache_compressed == nullptr,
|
block_cache_compressed == nullptr,
|
||||||
compression_dict, rep->persistent_cache_options);
|
compression_dict, rep->persistent_cache_options,
|
||||||
|
rep->table_options.read_amp_bytes_per_bit);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
|
s = PutDataBlockToCache(
|
||||||
ro, rep->ioptions, &block, raw_block.release(),
|
key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions,
|
||||||
rep->table_options.format_version,
|
&block, raw_block.release(), rep->table_options.format_version,
|
||||||
compression_dict);
|
compression_dict, rep->table_options.read_amp_bytes_per_bit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1242,7 +1252,8 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
|
|||||||
std::unique_ptr<Block> block_value;
|
std::unique_ptr<Block> block_value;
|
||||||
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
|
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
|
||||||
&block_value, rep->ioptions, true /* compress */,
|
&block_value, rep->ioptions, true /* compress */,
|
||||||
compression_dict, rep->persistent_cache_options);
|
compression_dict, rep->persistent_cache_options,
|
||||||
|
rep->table_options.read_amp_bytes_per_bit);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
block.value = block_value.release();
|
block.value = block_value.release();
|
||||||
}
|
}
|
||||||
@ -1251,7 +1262,8 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator(
|
|||||||
InternalIterator* iter;
|
InternalIterator* iter;
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
assert(block.value != nullptr);
|
assert(block.value != nullptr);
|
||||||
iter = block.value->NewIterator(&rep->internal_comparator, input_iter);
|
iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true,
|
||||||
|
rep->ioptions.statistics);
|
||||||
if (block.cache_handle != nullptr) {
|
if (block.cache_handle != nullptr) {
|
||||||
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
|
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
|
||||||
block.cache_handle);
|
block.cache_handle);
|
||||||
@ -1607,12 +1619,12 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
|
|||||||
handle, cache_key_storage);
|
handle, cache_key_storage);
|
||||||
Slice ckey;
|
Slice ckey;
|
||||||
|
|
||||||
s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr,
|
s = GetDataBlockFromCache(
|
||||||
rep_->ioptions, options, &block,
|
cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block,
|
||||||
rep_->table_options.format_version,
|
rep_->table_options.format_version,
|
||||||
rep_->compression_dict_block
|
rep_->compression_dict_block ? rep_->compression_dict_block->data
|
||||||
? rep_->compression_dict_block->data
|
: Slice(),
|
||||||
: Slice());
|
0 /* read_amp_bytes_per_bit */);
|
||||||
assert(s.ok());
|
assert(s.ok());
|
||||||
bool in_cache = block.value != nullptr;
|
bool in_cache = block.value != nullptr;
|
||||||
if (in_cache) {
|
if (in_cache) {
|
||||||
|
@ -190,9 +190,9 @@ class BlockBasedTable : public TableReader {
|
|||||||
static Status GetDataBlockFromCache(
|
static Status GetDataBlockFromCache(
|
||||||
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
Cache* block_cache, Cache* block_cache_compressed,
|
Cache* block_cache, Cache* block_cache_compressed,
|
||||||
const ImmutableCFOptions &ioptions, const ReadOptions& read_options,
|
const ImmutableCFOptions& ioptions, const ReadOptions& read_options,
|
||||||
BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
|
BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version,
|
||||||
const Slice& compression_dict);
|
const Slice& compression_dict, size_t read_amp_bytes_per_bit);
|
||||||
|
|
||||||
// Put a raw block (maybe compressed) to the corresponding block caches.
|
// Put a raw block (maybe compressed) to the corresponding block caches.
|
||||||
// This method will perform decompression against raw_block if needed and then
|
// This method will perform decompression against raw_block if needed and then
|
||||||
@ -207,9 +207,9 @@ class BlockBasedTable : public TableReader {
|
|||||||
static Status PutDataBlockToCache(
|
static Status PutDataBlockToCache(
|
||||||
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
|
||||||
Cache* block_cache, Cache* block_cache_compressed,
|
Cache* block_cache, Cache* block_cache_compressed,
|
||||||
const ReadOptions& read_options, const ImmutableCFOptions &ioptions,
|
const ReadOptions& read_options, const ImmutableCFOptions& ioptions,
|
||||||
CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
|
CachableEntry<Block>* block, Block* raw_block, uint32_t format_version,
|
||||||
const Slice& compression_dict);
|
const Slice& compression_dict, size_t read_amp_bytes_per_bit);
|
||||||
|
|
||||||
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
|
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
|
||||||
// after a call to Seek(key), until handle_result returns false.
|
// after a call to Seek(key), until handle_result returns false.
|
||||||
|
@ -4,7 +4,11 @@
|
|||||||
// of patent rights can be found in the PATENTS file in the same directory.
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
//
|
//
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "db/dbformat.h"
|
#include "db/dbformat.h"
|
||||||
@ -216,6 +220,259 @@ TEST_F(BlockTest, IndexHashWithSharedPrefix) {
|
|||||||
CheckBlockContents(std::move(contents), kMaxKey, keys, values);
|
CheckBlockContents(std::move(contents), kMaxKey, keys, values);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A slow but exact reference model of BlockReadAmpBitmap: instead of a
// bitmap it remembers every marked byte range in an ordered set, so tests can
// compare the bitmap's estimate against the exact answer.
class BlockReadAmpBitmapSlowAndAccurate {
 public:
  // Records the inclusive byte range [start_offset, end_offset] as marked.
  // Ranges may overlap previously marked ranges (or repeat them).
  void Mark(size_t start_offset, size_t end_offset) {
    assert(end_offset >= start_offset);

    // Keep the stored ranges pairwise disjoint by absorbing every existing
    // range that overlaps the new one. Entries are (end, start) pairs, so
    // lower_bound() yields the first range ending at or after start_offset;
    // because stored ranges are disjoint they are also ordered by start, and
    // the overlapping ones form a contiguous run from that position.
    auto it =
        marked_ranges_.lower_bound(std::make_pair(start_offset, size_t{0}));
    while (it != marked_ranges_.end() && it->second <= end_offset) {
      if (it->second < start_offset) {
        start_offset = it->second;
      }
      if (it->first > end_offset) {
        end_offset = it->first;
      }
      it = marked_ranges_.erase(it);
    }
    marked_ranges_.emplace(end_offset, start_offset);
  }

  // Return true if any byte in the inclusive range
  // [start_offset, end_offset] was Marked
  bool IsAnyInRangeMarked(size_t start_offset, size_t end_offset) const {
    // First stored range whose end is >= start_offset. Since stored ranges
    // are disjoint, it is the only candidate that can overlap the query.
    auto it =
        marked_ranges_.lower_bound(std::make_pair(start_offset, size_t{0}));
    if (it == marked_ranges_.end()) {
      return false;
    }
    // The range already ends at or after start_offset; it overlaps the query
    // exactly when it also starts at or before end_offset.
    return it->second <= end_offset;
  }

 private:
  // Disjoint marked ranges stored as (end_offset, start_offset), both
  // inclusive, ordered by end_offset.
  std::set<std::pair<size_t, size_t>> marked_ranges_;
};
|
||||||
|
|
||||||
|
// Checks BlockReadAmpBitmap against the exact reference model
// (BlockReadAmpBitmapSlowAndAccurate) for a variety of block sizes: after
// every Mark() the READ_AMP_ESTIMATE_USEFUL_BYTES ticker must equal the
// number of bitmap bits touched by any marked range times kBytesPerBit.
TEST_F(BlockTest, BlockReadAmpBitmap) {
  std::vector<size_t> block_sizes = {
      1,                 // 1 byte
      32,                // 32 bytes
      61,                // 61 bytes
      64,                // 64 bytes
      512,               // 0.5 KB
      1024,              // 1 KB
      1024 * 4,          // 4 KB
      1024 * 10,         // 10 KB
      1024 * 50,         // 50 KB
      1024 * 1024,       // 1 MB
      1024 * 1024 * 4,   // 4 MB
      1024 * 1024 * 50,  // 50 MB
      777,
      124653,
  };
  const size_t kBytesPerBit = 64;

  Random rnd(301);
  for (size_t block_size : block_sizes) {
    std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();
    BlockReadAmpBitmap read_amp_bitmap(block_size, kBytesPerBit, stats.get());
    BlockReadAmpBitmapSlowAndAccurate read_amp_slow_and_accurate;

    // Number of bits needed to cover the block (rounded up), and the number
    // of bits in that many 32-bit words (rounded up again).
    size_t needed_bits = (block_size / kBytesPerBit);
    if (block_size % kBytesPerBit != 0) {
      needed_bits++;
    }
    size_t bitmap_size = needed_bits / 32;
    if (needed_bits % 32 != 0) {
      bitmap_size++;
    }
    size_t bits_in_bitmap = bitmap_size * 32;

    // Constructing the bitmap must already account for the whole block
    // (rounded up to full bits) as read bytes.
    ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
              needed_bits * kBytesPerBit);

    // Generate some random entries: pick random start offsets, sort and
    // de-duplicate them, then let each entry run up to the byte before the
    // next entry (the last one runs to the end of the block).
    std::vector<size_t> random_entry_offsets;
    for (int i = 0; i < 1000; i++) {
      random_entry_offsets.push_back(rnd.Next() % block_size);
    }
    std::sort(random_entry_offsets.begin(), random_entry_offsets.end());
    random_entry_offsets.erase(
        std::unique(random_entry_offsets.begin(), random_entry_offsets.end()),
        random_entry_offsets.end());

    std::vector<std::pair<uint32_t, uint32_t>> random_entries;
    for (size_t i = 0; i < random_entry_offsets.size(); i++) {
      size_t entry_start = random_entry_offsets[i];
      size_t entry_end;
      if (i + 1 < random_entry_offsets.size()) {
        entry_end = random_entry_offsets[i + 1] - 1;
      } else {
        entry_end = block_size - 1;
      }
      random_entries.emplace_back(entry_start, entry_end);
    }

    // Mark entries in random order (entries may repeat) and verify the
    // ticker after every single Mark().
    for (size_t i = 0; i < random_entries.size(); i++) {
      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];

      read_amp_bitmap.Mark(current_entry.first, current_entry.second);
      read_amp_slow_and_accurate.Mark(current_entry.first,
                                      current_entry.second);

      // Count the bits whose byte range contains at least one marked byte.
      size_t total_bits = 0;
      for (size_t bit_idx = 0; bit_idx < bits_in_bitmap; bit_idx++) {
        size_t start_rng = bit_idx * kBytesPerBit;
        size_t end_rng = (start_rng + kBytesPerBit) - 1;

        total_bits +=
            read_amp_slow_and_accurate.IsAnyInRangeMarked(start_rng, end_rng);
      }
      size_t expected_estimate_useful = total_bits * kBytesPerBit;
      size_t got_estimate_useful =
          stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);

      ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
    }
  }
}
|
||||||
|
|
||||||
|
// Builds one block of random key/values and reads it back three ways
// (sequential Next(), Seek() in key order, Seek() in random order) through a
// Block created with a read-amp bitmap. After every read, the ratio reported
// by the READ_AMP_* tickers is compared with the ratio computed from the
// entry sizes actually visited.
TEST_F(BlockTest, BlockWithReadAmpBitmap) {
  Random rnd(301);
  Options options = Options();

  std::vector<std::string> keys;
  std::vector<std::string> values;
  BlockBuilder builder(16);
  int num_records = 10000;

  GenerateRandomKVs(&keys, &values, 0, num_records, 1);
  // add a bunch of records to a block
  for (int i = 0; i < num_records; i++) {
    builder.Add(keys[i], values[i]);
  }

  Slice rawblock = builder.Finish();
  const size_t kBytesPerBit = 8;

  // Absolute difference, in percentage points, between the useful-bytes
  // ratio derived from the visited entry sizes (read_bytes / block size) and
  // the ratio reported by the statistics tickers.
  auto read_amp_error_pct = [&rawblock](Statistics *stats,
                                        size_t read_bytes) -> double {
    double semi_acc_read_amp =
        static_cast<double>(read_bytes) / rawblock.size();
    double read_amp = static_cast<double>(stats->getTickerCount(
                          READ_AMP_ESTIMATE_USEFUL_BYTES)) /
                      stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
    return fabs(semi_acc_read_amp - read_amp) * 100;
  };

  // Read the block sequentially using Next()
  {
    std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();

    // create block reader
    BlockContents contents;
    contents.data = rawblock;
    contents.cachable = true;
    Block reader(std::move(contents), kBytesPerBit, stats.get());

    // read contents of block sequentially
    size_t read_bytes = 0;
    // Owned by unique_ptr so the iterator is released before `reader`.
    std::unique_ptr<BlockIter> iter(static_cast<BlockIter *>(
        reader.NewIterator(options.comparator, nullptr, true, stats.get())));
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      iter->value();
      read_bytes += iter->TEST_CurrentEntrySize();

      // Error in read amplification will be less than 1% if we are reading
      // sequentially
      EXPECT_LT(read_amp_error_pct(stats.get(), read_bytes), 1);
    }
  }

  // Read the block in key order using Seek()
  {
    std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();

    // create block reader
    BlockContents contents;
    contents.data = rawblock;
    contents.cachable = true;
    Block reader(std::move(contents), kBytesPerBit, stats.get());

    size_t read_bytes = 0;
    std::unique_ptr<BlockIter> iter(static_cast<BlockIter *>(
        reader.NewIterator(options.comparator, nullptr, true, stats.get())));
    for (int i = 0; i < num_records; i++) {
      Slice k(keys[i]);

      // search in block for this key
      iter->Seek(k);
      iter->value();
      read_bytes += iter->TEST_CurrentEntrySize();

      // Error in read amplification will be less than 1% if we are seeking
      // the keys in order
      EXPECT_LT(read_amp_error_pct(stats.get(), read_bytes), 1);
    }
  }

  // Read the block randomly
  {
    std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();

    // create block reader
    BlockContents contents;
    contents.data = rawblock;
    contents.cachable = true;
    Block reader(std::move(contents), kBytesPerBit, stats.get());

    size_t read_bytes = 0;
    std::unique_ptr<BlockIter> iter(static_cast<BlockIter *>(
        reader.NewIterator(options.comparator, nullptr, true, stats.get())));
    // Indexes already visited; an entry's size is counted only on its first
    // visit so repeated reads do not inflate read_bytes.
    std::unordered_set<int> read_keys;
    for (int i = 0; i < num_records; i++) {
      int index = rnd.Uniform(num_records);
      Slice k(keys[index]);

      iter->Seek(k);
      iter->value();
      if (read_keys.find(index) == read_keys.end()) {
        read_keys.insert(index);
        read_bytes += iter->TEST_CurrentEntrySize();
      }

      // Error in read amplification will be less than 2% if we are reading
      // randomly
      EXPECT_LT(read_amp_error_pct(stats.get(), read_bytes), 2);
    }
  }
}
|
||||||
|
|
||||||
|
// Verifies the bytes-per-bit value BlockReadAmpBitmap reports for a range of
// requested values: powers of two are reported unchanged, and the other
// values listed below are reported as the next lower power of two.
TEST_F(BlockTest, ReadAmpBitmapPow2) {
  std::shared_ptr<Statistics> stats = rocksdb::CreateDBStatistics();

  // {requested bytes_per_bit, expected GetBytesPerBit()}
  const std::vector<std::pair<size_t, size_t>> kExpectations = {
      // exact powers of two
      {1, 1},
      {2, 2},
      {4, 4},
      {8, 8},
      {16, 16},
      {32, 32},
      // non powers of two
      {3, 2},
      {7, 4},
      {11, 8},
      {17, 16},
      {33, 32},
      {35, 32},
  };

  for (const auto &expectation : kExpectations) {
    const size_t requested = expectation.first;
    const size_t expected = expectation.second;
    ASSERT_EQ(BlockReadAmpBitmap(100, requested, stats.get()).GetBytesPerBit(),
              expected);
  }
}
|
||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
|
@ -384,6 +384,10 @@ DEFINE_int32(index_block_restart_interval,
|
|||||||
"Number of keys between restart points "
|
"Number of keys between restart points "
|
||||||
"for delta encoding of keys in index block.");
|
"for delta encoding of keys in index block.");
|
||||||
|
|
||||||
|
DEFINE_int32(read_amp_bytes_per_bit,
|
||||||
|
rocksdb::BlockBasedTableOptions().read_amp_bytes_per_bit,
|
||||||
|
"Number of bytes per bit to be used in block read-amp bitmap");
|
||||||
|
|
||||||
DEFINE_int64(compressed_cache_size, -1,
|
DEFINE_int64(compressed_cache_size, -1,
|
||||||
"Number of bytes to use as a cache of compressed data.");
|
"Number of bytes to use as a cache of compressed data.");
|
||||||
|
|
||||||
@ -2805,6 +2809,7 @@ class Benchmark {
|
|||||||
block_based_options.skip_table_builder_flush =
|
block_based_options.skip_table_builder_flush =
|
||||||
FLAGS_skip_table_builder_flush;
|
FLAGS_skip_table_builder_flush;
|
||||||
block_based_options.format_version = 2;
|
block_based_options.format_version = 2;
|
||||||
|
block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
|
||||||
options.table_factory.reset(
|
options.table_factory.reset(
|
||||||
NewBlockBasedTableFactory(block_based_options));
|
NewBlockBasedTableFactory(block_based_options));
|
||||||
}
|
}
|
||||||
|
@ -558,7 +558,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
|||||||
OptionType::kUInt32T, OptionVerificationType::kNormal}},
|
OptionType::kUInt32T, OptionVerificationType::kNormal}},
|
||||||
{"verify_compression",
|
{"verify_compression",
|
||||||
{offsetof(struct BlockBasedTableOptions, verify_compression),
|
{offsetof(struct BlockBasedTableOptions, verify_compression),
|
||||||
OptionType::kBoolean, OptionVerificationType::kNormal}}};
|
OptionType::kBoolean, OptionVerificationType::kNormal}},
|
||||||
|
{"read_amp_bytes_per_bit",
|
||||||
|
{offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit),
|
||||||
|
OptionType::kSizeT, OptionVerificationType::kNormal}}};
|
||||||
|
|
||||||
static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
|
static std::unordered_map<std::string, OptionTypeInfo> plain_table_type_info = {
|
||||||
{"user_key_len",
|
{"user_key_len",
|
||||||
|
@ -159,7 +159,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
|
|||||||
"filter_policy=bloomfilter:4:true;whole_key_filtering=1;"
|
"filter_policy=bloomfilter:4:true;whole_key_filtering=1;"
|
||||||
"skip_table_builder_flush=1;format_version=1;"
|
"skip_table_builder_flush=1;format_version=1;"
|
||||||
"hash_index_allow_collision=false;"
|
"hash_index_allow_collision=false;"
|
||||||
"verify_compression=true;",
|
"verify_compression=true;read_amp_bytes_per_bit=0",
|
||||||
new_bbto));
|
new_bbto));
|
||||||
|
|
||||||
ASSERT_EQ(unset_bytes_base,
|
ASSERT_EQ(unset_bytes_base,
|
||||||
|
Loading…
Reference in New Issue
Block a user