Unbias read-amp bitmap
Summary: Consider BlockReadAmpBitmap with bytes_per_bit = 32. Suppose bytes [a, b) were used, while bytes [a-32, a) and [b+1, b+33) weren't used; more formally, the union of ranges passed to BlockReadAmpBitmap::Mark() contains [a, b) and doesn't intersect with [a-32, a) and [b+1, b+33). Then bits [floor(a/32), ceil(b/32)] will be set, and so the number of useful bytes will be estimated as (ceil(b/32) - floor(a/32)) * 32, which is on average equal to b-a+31. An extreme example: if we use 1 byte from each block, it'll be counted as 32 bytes from each block. It's easy to remove this bias by slightly changing the semantics of the bitmap. Currently each bit represents a byte range [i*32, (i+1)*32). This diff makes each bit represent a single byte: i*32 + X, where X is a random number in [0, 31] generated when the bitmap is created. So, e.g., if you read a single byte at random, with probability 31/32 it won't be counted at all, and with probability 1/32 it will be counted as 32 bytes; so, on average it's counted as 1 byte. *But there is one exception: the last bit will always be set the old way.* (*) - assuming read_amp_bytes_per_bit = 32. Closes https://github.com/facebook/rocksdb/pull/2259 Differential Revision: D5035652 Pulled By: lightmark fbshipit-source-id: bd98b1b9b49fbe61f9e3781d07f624e3cbd92356
This commit is contained in:
parent
2014cdf2d0
commit
7e62c5d67a
116
db/db_test2.cc
116
db/db_test2.cc
@ -1640,66 +1640,80 @@ size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
|
|||||||
TEST_F(DBTest2, ReadAmpBitmap) {
|
TEST_F(DBTest2, ReadAmpBitmap) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
BlockBasedTableOptions bbto;
|
BlockBasedTableOptions bbto;
|
||||||
// Disable delta encoding to make it easier to calculate read amplification
|
size_t bytes_per_bit[2] = {1, 16};
|
||||||
bbto.use_delta_encoding = false;
|
for (size_t k = 0; k < 2; k++) {
|
||||||
// Huge block cache to make it easier to calculate read amplification
|
// Disable delta encoding to make it easier to calculate read amplification
|
||||||
bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
|
bbto.use_delta_encoding = false;
|
||||||
bbto.read_amp_bytes_per_bit = 16;
|
// Huge block cache to make it easier to calculate read amplification
|
||||||
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
|
||||||
options.statistics = rocksdb::CreateDBStatistics();
|
bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
|
||||||
DestroyAndReopen(options);
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
||||||
|
options.statistics = rocksdb::CreateDBStatistics();
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
|
||||||
const size_t kNumEntries = 10000;
|
const size_t kNumEntries = 10000;
|
||||||
|
|
||||||
Random rnd(301);
|
Random rnd(301);
|
||||||
for (size_t i = 0; i < kNumEntries; i++) {
|
for (size_t i = 0; i < kNumEntries; i++) {
|
||||||
ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
|
ASSERT_OK(Put(Key(static_cast<int>(i)), RandomString(&rnd, 100)));
|
||||||
}
|
}
|
||||||
ASSERT_OK(Flush());
|
ASSERT_OK(Flush());
|
||||||
|
|
||||||
Close();
|
Close();
|
||||||
Reopen(options);
|
Reopen(options);
|
||||||
|
|
||||||
// Read keys/values randomly and verify that reported read amp error
|
// Read keys/values randomly and verify that reported read amp error
|
||||||
// is less than 2%
|
// is less than 2%
|
||||||
uint64_t total_useful_bytes = 0;
|
uint64_t total_useful_bytes = 0;
|
||||||
std::set<int> read_keys;
|
std::set<int> read_keys;
|
||||||
std::string value;
|
std::string value;
|
||||||
for (size_t i = 0; i < kNumEntries * 5; i++) {
|
for (size_t i = 0; i < kNumEntries * 5; i++) {
|
||||||
int key_idx = rnd.Next() % kNumEntries;
|
int key_idx = rnd.Next() % kNumEntries;
|
||||||
std::string k = Key(key_idx);
|
std::string key = Key(key_idx);
|
||||||
ASSERT_OK(db_->Get(ReadOptions(), k, &value));
|
ASSERT_OK(db_->Get(ReadOptions(), key, &value));
|
||||||
|
|
||||||
if (read_keys.find(key_idx) == read_keys.end()) {
|
if (read_keys.find(key_idx) == read_keys.end()) {
|
||||||
auto ik = InternalKey(k, 0, ValueType::kTypeValue);
|
auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
|
||||||
total_useful_bytes += GetEncodedEntrySize(ik.size(), value.size());
|
total_useful_bytes +=
|
||||||
read_keys.insert(key_idx);
|
GetEncodedEntrySize(internal_key.size(), value.size());
|
||||||
|
read_keys.insert(key_idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
double expected_read_amp =
|
||||||
|
static_cast<double>(total_useful_bytes) /
|
||||||
|
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
|
||||||
|
|
||||||
|
double read_amp =
|
||||||
|
static_cast<double>(options.statistics->getTickerCount(
|
||||||
|
READ_AMP_ESTIMATE_USEFUL_BYTES)) /
|
||||||
|
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
|
||||||
|
|
||||||
|
double error_pct = fabs(expected_read_amp - read_amp) * 100;
|
||||||
|
// Error between reported read amp and real read amp should be less than
|
||||||
|
// 2%
|
||||||
|
EXPECT_LE(error_pct, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
double expected_read_amp =
|
// Make sure we read every thing in the DB (which is smaller than our cache)
|
||||||
static_cast<double>(total_useful_bytes) /
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
||||||
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||||
|
ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
|
||||||
|
}
|
||||||
|
delete iter;
|
||||||
|
|
||||||
double read_amp =
|
// Read amp is on average 100% since we read all what we loaded in memory
|
||||||
static_cast<double>(options.statistics->getTickerCount(
|
if (k == 0) {
|
||||||
READ_AMP_ESTIMATE_USEFUL_BYTES)) /
|
ASSERT_EQ(
|
||||||
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
|
options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
|
||||||
|
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
|
||||||
double error_pct = fabs(expected_read_amp - read_amp) * 100;
|
} else {
|
||||||
// Error between reported read amp and real read amp should be less than 2%
|
ASSERT_NEAR(
|
||||||
EXPECT_LE(error_pct, 2);
|
options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
|
||||||
|
1.0f /
|
||||||
|
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
|
||||||
|
1, .01);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Make sure we read every thing in the DB (which is smaller than our cache)
|
|
||||||
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
||||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
||||||
ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
|
|
||||||
}
|
|
||||||
delete iter;
|
|
||||||
|
|
||||||
// Read amp is 100% since we read all what we loaded in memory
|
|
||||||
ASSERT_EQ(options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
|
|
||||||
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
|
TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
|
||||||
|
@ -27,7 +27,8 @@
|
|||||||
#include "rocksdb/statistics.h"
|
#include "rocksdb/statistics.h"
|
||||||
#include "table/block_prefix_index.h"
|
#include "table/block_prefix_index.h"
|
||||||
#include "table/internal_iterator.h"
|
#include "table/internal_iterator.h"
|
||||||
|
#include "util/random.h"
|
||||||
|
#include "util/sync_point.h"
|
||||||
#include "format.h"
|
#include "format.h"
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
@ -44,7 +45,12 @@ class BlockReadAmpBitmap {
|
|||||||
public:
|
public:
|
||||||
explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
|
explicit BlockReadAmpBitmap(size_t block_size, size_t bytes_per_bit,
|
||||||
Statistics* statistics)
|
Statistics* statistics)
|
||||||
: bitmap_(nullptr), bytes_per_bit_pow_(0), statistics_(statistics) {
|
: bitmap_(nullptr),
|
||||||
|
bytes_per_bit_pow_(0),
|
||||||
|
statistics_(statistics),
|
||||||
|
rnd_(
|
||||||
|
Random::GetTLSInstance()->Uniform(static_cast<int>(bytes_per_bit))) {
|
||||||
|
TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
|
||||||
assert(block_size > 0 && bytes_per_bit > 0);
|
assert(block_size > 0 && bytes_per_bit > 0);
|
||||||
|
|
||||||
// convert bytes_per_bit to be a power of 2
|
// convert bytes_per_bit to be a power of 2
|
||||||
@ -54,62 +60,38 @@ class BlockReadAmpBitmap {
|
|||||||
|
|
||||||
// num_bits_needed = ceil(block_size / bytes_per_bit)
|
// num_bits_needed = ceil(block_size / bytes_per_bit)
|
||||||
size_t num_bits_needed =
|
size_t num_bits_needed =
|
||||||
(block_size >> static_cast<size_t>(bytes_per_bit_pow_)) +
|
((block_size - 1) >> bytes_per_bit_pow_) + 1;
|
||||||
(block_size % (static_cast<size_t>(1)
|
assert(num_bits_needed > 0);
|
||||||
<< static_cast<size_t>(bytes_per_bit_pow_)) !=
|
|
||||||
0);
|
|
||||||
|
|
||||||
// bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
|
// bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
|
||||||
size_t bitmap_size = (num_bits_needed / kBitsPerEntry) +
|
size_t bitmap_size = (num_bits_needed - 1) / kBitsPerEntry + 1;
|
||||||
(num_bits_needed % kBitsPerEntry != 0);
|
|
||||||
|
|
||||||
// Create bitmap and set all the bits to 0
|
// Create bitmap and set all the bits to 0
|
||||||
bitmap_ = new std::atomic<uint32_t>[bitmap_size];
|
bitmap_ = new std::atomic<uint32_t>[bitmap_size];
|
||||||
memset(bitmap_, 0, bitmap_size * kBytesPersEntry);
|
memset(bitmap_, 0, bitmap_size * kBytesPersEntry);
|
||||||
|
|
||||||
RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES,
|
RecordTick(GetStatistics(), READ_AMP_TOTAL_READ_BYTES, block_size);
|
||||||
num_bits_needed << bytes_per_bit_pow_);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
~BlockReadAmpBitmap() { delete[] bitmap_; }
|
~BlockReadAmpBitmap() { delete[] bitmap_; }
|
||||||
|
|
||||||
void Mark(uint32_t start_offset, uint32_t end_offset) {
|
void Mark(uint32_t start_offset, uint32_t end_offset) {
|
||||||
assert(end_offset >= start_offset);
|
assert(end_offset >= start_offset);
|
||||||
|
// Index of first bit in mask
|
||||||
// Every new bit we set will bump this counter
|
uint32_t start_bit =
|
||||||
uint32_t new_useful_bytes = 0;
|
(start_offset + (1 << bytes_per_bit_pow_) - rnd_ - 1) >>
|
||||||
// Index of first bit in mask (start_offset / bytes_per_bit)
|
bytes_per_bit_pow_;
|
||||||
uint32_t start_bit = start_offset >> bytes_per_bit_pow_;
|
// Index of last bit in mask + 1
|
||||||
// Index of last bit in mask (end_offset / bytes_per_bit)
|
uint32_t exclusive_end_bit =
|
||||||
uint32_t end_bit = end_offset >> bytes_per_bit_pow_;
|
(end_offset + (1 << bytes_per_bit_pow_) - rnd_) >> bytes_per_bit_pow_;
|
||||||
// Index of middle bit (unique to this range)
|
if (start_bit >= exclusive_end_bit) {
|
||||||
uint32_t mid_bit = start_bit + 1;
|
return;
|
||||||
|
|
||||||
// It's guaranteed that ranges sent to Mark() wont overlap, this mean that
|
|
||||||
// we dont need to set the middle bits, we can simply set only one bit of
|
|
||||||
// the middle bits, and check this bit if we want to know if the whole
|
|
||||||
// range is set or not.
|
|
||||||
if (mid_bit < end_bit) {
|
|
||||||
if (GetAndSet(mid_bit) == 0) {
|
|
||||||
new_useful_bytes += (end_bit - mid_bit) << bytes_per_bit_pow_;
|
|
||||||
} else {
|
|
||||||
// If the middle bit is set, it's guaranteed that start and end bits
|
|
||||||
// are also set
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// This range dont have a middle bit, the whole range fall in 1 or 2 bits
|
|
||||||
}
|
}
|
||||||
|
assert(exclusive_end_bit > 0);
|
||||||
|
|
||||||
if (GetAndSet(start_bit) == 0) {
|
if (GetAndSet(start_bit) == 0) {
|
||||||
new_useful_bytes += (1 << bytes_per_bit_pow_);
|
uint32_t new_useful_bytes = (exclusive_end_bit - start_bit)
|
||||||
}
|
<< bytes_per_bit_pow_;
|
||||||
|
|
||||||
if (GetAndSet(end_bit) == 0) {
|
|
||||||
new_useful_bytes += (1 << bytes_per_bit_pow_);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (new_useful_bytes > 0) {
|
|
||||||
RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
|
RecordTick(GetStatistics(), READ_AMP_ESTIMATE_USEFUL_BYTES,
|
||||||
new_useful_bytes);
|
new_useful_bytes);
|
||||||
}
|
}
|
||||||
@ -146,6 +128,7 @@ class BlockReadAmpBitmap {
|
|||||||
// this pointer maybe invalid, but the DB will update it to a valid pointer
|
// this pointer maybe invalid, but the DB will update it to a valid pointer
|
||||||
// by using SetStatistics() before calling Mark()
|
// by using SetStatistics() before calling Mark()
|
||||||
std::atomic<Statistics*> statistics_;
|
std::atomic<Statistics*> statistics_;
|
||||||
|
uint32_t rnd_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Block {
|
class Block {
|
||||||
|
@ -226,18 +226,17 @@ class BlockReadAmpBitmapSlowAndAccurate {
|
|||||||
public:
|
public:
|
||||||
void Mark(size_t start_offset, size_t end_offset) {
|
void Mark(size_t start_offset, size_t end_offset) {
|
||||||
assert(end_offset >= start_offset);
|
assert(end_offset >= start_offset);
|
||||||
|
|
||||||
marked_ranges_.emplace(end_offset, start_offset);
|
marked_ranges_.emplace(end_offset, start_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return true if any byte in this range was Marked
|
// Return true if any byte in this range was Marked
|
||||||
bool IsAnyInRangeMarked(size_t start_offset, size_t end_offset) {
|
bool IsPinMarked(size_t offset) {
|
||||||
auto it = marked_ranges_.lower_bound(
|
auto it = marked_ranges_.lower_bound(
|
||||||
std::make_pair(start_offset, static_cast<size_t>(0)));
|
std::make_pair(offset, static_cast<size_t>(0)));
|
||||||
if (it == marked_ranges_.end()) {
|
if (it == marked_ranges_.end()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return start_offset <= it->first && end_offset >= it->second;
|
return offset <= it->first && offset >= it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -245,6 +244,12 @@ class BlockReadAmpBitmapSlowAndAccurate {
|
|||||||
};
|
};
|
||||||
|
|
||||||
TEST_F(BlockTest, BlockReadAmpBitmap) {
|
TEST_F(BlockTest, BlockReadAmpBitmap) {
|
||||||
|
uint32_t pin_offset = 0;
|
||||||
|
SyncPoint::GetInstance()->SetCallBack(
|
||||||
|
"BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) {
|
||||||
|
pin_offset = *(static_cast<uint32_t*>(arg));
|
||||||
|
});
|
||||||
|
SyncPoint::GetInstance()->EnableProcessing();
|
||||||
std::vector<size_t> block_sizes = {
|
std::vector<size_t> block_sizes = {
|
||||||
1, // 1 byte
|
1, // 1 byte
|
||||||
32, // 32 bytes
|
32, // 32 bytes
|
||||||
@ -277,10 +282,8 @@ TEST_F(BlockTest, BlockReadAmpBitmap) {
|
|||||||
if (needed_bits % 32 != 0) {
|
if (needed_bits % 32 != 0) {
|
||||||
bitmap_size++;
|
bitmap_size++;
|
||||||
}
|
}
|
||||||
size_t bits_in_bitmap = bitmap_size * 32;
|
|
||||||
|
|
||||||
ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
|
ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size);
|
||||||
needed_bits * kBytesPerBit);
|
|
||||||
|
|
||||||
// Generate some random entries
|
// Generate some random entries
|
||||||
std::vector<size_t> random_entry_offsets;
|
std::vector<size_t> random_entry_offsets;
|
||||||
@ -314,20 +317,18 @@ TEST_F(BlockTest, BlockReadAmpBitmap) {
|
|||||||
current_entry.second);
|
current_entry.second);
|
||||||
|
|
||||||
size_t total_bits = 0;
|
size_t total_bits = 0;
|
||||||
for (size_t bit_idx = 0; bit_idx < bits_in_bitmap; bit_idx++) {
|
for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) {
|
||||||
size_t start_rng = bit_idx * kBytesPerBit;
|
total_bits += read_amp_slow_and_accurate.IsPinMarked(
|
||||||
size_t end_rng = (start_rng + kBytesPerBit) - 1;
|
bit_idx * kBytesPerBit + pin_offset);
|
||||||
|
|
||||||
total_bits +=
|
|
||||||
read_amp_slow_and_accurate.IsAnyInRangeMarked(start_rng, end_rng);
|
|
||||||
}
|
}
|
||||||
size_t expected_estimate_useful = total_bits * kBytesPerBit;
|
size_t expected_estimate_useful = total_bits * kBytesPerBit;
|
||||||
size_t got_estimate_useful =
|
size_t got_estimate_useful =
|
||||||
stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
|
stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
|
||||||
|
|
||||||
ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
|
ASSERT_EQ(expected_estimate_useful, got_estimate_useful);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(BlockTest, BlockWithReadAmpBitmap) {
|
TEST_F(BlockTest, BlockWithReadAmpBitmap) {
|
||||||
|
Loading…
Reference in New Issue
Block a user