Make BlockBasedTable::kMaxAutoReadaheadSize configurable (#7951)
Summary: RocksDB does auto-readahead for iterators on noticing more than two reads for a table file. The readahead starts at 8KB and doubles on every additional read up to BlockBasedTable::kMaxAutoReadaheadSize, which is 256*1024. This PR adds a new option BlockBasedTableOptions::max_auto_readahead_size which replaces BlockBasedTable::kMaxAutoReadaheadSize, and the new option can be configured. If max_auto_readahead_size is set to 0 then no implicit auto prefetching will be done. If the max_auto_readahead_size provided is less than 8KB (which is the initial readahead size used by rocksdb in case of auto-readahead), the readahead size will remain the same as max_auto_readahead_size. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7951 Test Plan: Add new unit test case. Reviewed By: anand1976 Differential Revision: D26568085 Pulled By: akankshamahajan15 fbshipit-source-id: b6543520fc74e97d859f2002328d4c5254d417af
This commit is contained in:
parent
e017af15c1
commit
cd79a00903
@ -1,5 +1,7 @@
|
||||
# Rocksdb Change Log
|
||||
## Unreleased
|
||||
### Public API change
|
||||
* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if the user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size, and max_auto_readahead_size can now be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. Experiment data is in PR #3282. If the value is set to 0 then no automatic prefetching will be done by RocksDB. Also, changing the value will only affect files opened after the change.
|
||||
|
||||
## 6.18.0 (02/19/2021)
|
||||
### Behavior Changes
|
||||
|
@ -175,9 +175,154 @@ TEST_P(PrefetchTest, Basic) {
|
||||
Close();
|
||||
}
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
// Checks that BlockBasedTableOptions::max_auto_readahead_size governs the
// implicit (auto) readahead done for iterators, and that it can be changed at
// runtime through DB::SetOptions().
//
// Three files are written, one per level (level 2, 1, 0), and each is then
// scanned under a different setting applied just before its keys are read:
//   level 2 -> max_auto_readahead_size = 65536
//   level 1 -> max_auto_readahead_size = 4096 (below kInitAutoReadaheadSize)
//   level 0 -> max_auto_readahead_size = 0    (auto readahead disabled)
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
  // First param is if the mockFS support_prefetch or not
  const bool support_prefetch =
      std::get<0>(GetParam()) &&
      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);

  // Second param is if directIO is enabled or not
  const bool use_direct_io = std::get<1>(GetParam());

  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  Options options = CurrentOptions();
  options.write_buffer_size = 1024;
  options.create_if_missing = true;
  options.compression = kNoCompression;
  options.env = env.get();
  options.disable_auto_compactions = true;
  if (use_direct_io) {
    options.use_direct_reads = true;
    options.use_direct_io_for_flush_and_compaction = true;
  }
  BlockBasedTableOptions table_options;
  table_options.no_block_cache = true;
  table_options.cache_index_and_filter_blocks = false;
  table_options.metadata_block_size = 1024;
  table_options.index_type =
      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  table_options.max_auto_readahead_size = 0;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  // Counts how many times the internal FilePrefetchBuffer starts a prefetch.
  int buff_prefetch_count = 0;
  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
                                        [&](void*) { buff_prefetch_count++; });

  // DB open will create table readers unless we reduce the table cache
  // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
  // cache is allocated with max_open_files - 10 as capacity. Override
  // max_open_files so the table cache capacity is minimal. This will
  // prevent file open during DB open and force the file to be opened during
  // Iteration.
  // NOTE(review): the original comment said "override to 10 so capacity
  // becomes 0", but the value written below is 11 -- confirm which is
  // intended. The value is left unchanged here.
  SyncPoint::GetInstance()->SetCallBack(
      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
        int* max_open_files = static_cast<int*>(arg);
        *max_open_files = 11;
      });

  SyncPoint::GetInstance()->EnableProcessing();

  Status s = TryReopen(options);

  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test. The callbacks registered
    // above capture locals by reference, so unregister them before leaving
    // this scope.
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
    return;
  } else {
    ASSERT_OK(s);
  }

  Random rnd(309);
  int key_count = 0;
  const int num_keys_per_level = 100;
  // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
  for (int level = 2; level >= 0; level--) {
    key_count = level * num_keys_per_level;
    for (int i = 0; i < num_keys_per_level; ++i) {
      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
    }
    ASSERT_OK(Flush());
    MoveFilesToLevel(level);
  }
  Close();

  // Per-level snapshot of buff_prefetch_count, indexed by level.
  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
  // A failed reopen would make every check below meaningless, so the status
  // must not be dropped.
  ASSERT_OK(TryReopen(options));
  {
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    fs->ClearPrefetchCount();
    buff_prefetch_count = 0;

    for (int level = 2; level >= 0; level--) {
      key_count = level * num_keys_per_level;
      switch (level) {
        case 0:
          // max_auto_readahead_size is set 0 so data and index blocks are not
          // prefetched.
          ASSERT_OK(db_->SetOptions(
              {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
          break;
        case 1:
          // max_auto_readahead_size is set less than
          // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains
          // equal to max_auto_readahead_size.
          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                      "{max_auto_readahead_size=4096;}"}}));
          break;
        case 2:
          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                      "{max_auto_readahead_size=65536;}"}}));
          break;
        default:
          assert(false);
      }

      for (int i = 0; i < num_keys_per_level; ++i) {
        iter->Seek(Key(key_count++));
        iter->Next();
      }

      buff_prefetch_level_count[level] = buff_prefetch_count;
      if (support_prefetch && !use_direct_io) {
        // File system prefetch path: only level 0 (readahead disabled) must
        // see no FS-level Prefetch() call.
        if (level == 0) {
          ASSERT_FALSE(fs->IsPrefetchCalled());
        } else {
          ASSERT_TRUE(fs->IsPrefetchCalled());
        }
        fs->ClearPrefetchCount();
      } else {
        // Internal prefetch buffer path: the FS must never be asked to
        // prefetch, and the buffer is used unless readahead is disabled.
        ASSERT_FALSE(fs->IsPrefetchCalled());
        if (level == 0) {
          ASSERT_EQ(buff_prefetch_count, 0);
        } else {
          ASSERT_GT(buff_prefetch_count, 0);
        }
        buff_prefetch_count = 0;
      }
    }
  }

  // Without FS prefetch support, the 4096 cap (level 1) must trigger more
  // buffer prefetches than the 65536 cap (level 2).
  if (!support_prefetch) {
    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  Close();
}
|
||||
|
||||
// Instantiate the PrefetchTest cases over every combination of their two
// bool parameters (read in the tests via std::get<0>/std::get<1> of
// GetParam()).
INSTANTIATE_TEST_CASE_P(
    PrefetchTest, PrefetchTest,
    ::testing::Combine(::testing::Bool(), ::testing::Bool()));
|
||||
#endif // !ROCKSDB_LITE
|
||||
|
||||
class PrefetchTest1 : public DBTestBase,
|
||||
public ::testing::WithParamInterface<bool> {
|
||||
public:
|
||||
PrefetchTest1() : DBTestBase("/prefetch_test1", true) {}
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
||||
|
@ -435,6 +435,33 @@ struct BlockBasedTableOptions {
|
||||
|
||||
IndexShorteningMode index_shortening =
|
||||
IndexShorteningMode::kShortenSeparators;
|
||||
|
||||
// RocksDB does auto-readahead for iterators on noticing more than two reads
|
||||
// for a table file if user doesn't provide readahead_size. The readahead
|
||||
// starts at 8KB and doubles on every additional read up to
|
||||
// max_auto_readahead_size and max_auto_readahead_size can be configured.
|
||||
//
|
||||
// Special Value: 0 - If max_auto_readahead_size is set to 0 then no implicit
|
||||
// auto prefetching will be done. If max_auto_readahead_size provided is less
|
||||
// than 8KB (which is initial readahead size used by rocksdb in case of
|
||||
// auto-readahead), readahead size will remain same as
|
||||
// max_auto_readahead_size.
|
||||
//
|
||||
// The value is given in bytes, so spell out KB explicitly, e.g. 256 * 1024,
|
||||
// as it bounds how much block data is prefetched.
|
||||
//
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
//
|
||||
// This parameter can be changed dynamically by
|
||||
// DB::SetOptions({{"block_based_table_factory",
|
||||
// "{max_auto_readahead_size=0;}"}}));
|
||||
//
|
||||
// Changing the value dynamically will only affect files opened after the
|
||||
// change.
|
||||
//
|
||||
// Default: 256 KB (256 * 1024).
|
||||
size_t max_auto_readahead_size = 256 * 1024;
|
||||
};
|
||||
|
||||
// Table Properties that are specific to block-based table properties.
|
||||
|
@ -179,7 +179,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
|
||||
"hash_index_allow_collision=false;"
|
||||
"verify_compression=true;read_amp_bytes_per_bit=0;"
|
||||
"enable_index_compression=false;"
|
||||
"block_align=true",
|
||||
"block_align=true;"
|
||||
"max_auto_readahead_size=0",
|
||||
new_bbto));
|
||||
|
||||
ASSERT_EQ(unset_bytes_base,
|
||||
|
@ -415,6 +415,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
||||
auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr);
|
||||
return Cache::CreateFromString(opts, value, cache);
|
||||
}}},
|
||||
{"max_auto_readahead_size",
|
||||
{offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
|
||||
OptionType::kSizeT, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kMutable}},
|
||||
#endif // ROCKSDB_LITE
|
||||
};
|
||||
|
||||
@ -687,6 +691,9 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
|
||||
snprintf(buffer, kBufferSize, " block_align: %d\n",
|
||||
table_options_.block_align);
|
||||
ret.append(buffer);
|
||||
snprintf(buffer, kBufferSize,
|
||||
" max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
|
||||
table_options_.max_auto_readahead_size);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -67,11 +67,6 @@ extern const uint64_t kBlockBasedTableMagicNumber;
|
||||
extern const std::string kHashIndexPrefixesBlock;
|
||||
extern const std::string kHashIndexPrefixesMetadataBlock;
|
||||
|
||||
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
|
||||
|
||||
BlockBasedTable::~BlockBasedTable() {
|
||||
delete rep_;
|
||||
}
|
||||
@ -2921,7 +2916,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
|
||||
// increasing of the buffer size.
|
||||
size_t readahead_size = (read_options.readahead_size != 0)
|
||||
? read_options.readahead_size
|
||||
: kMaxAutoReadaheadSize;
|
||||
: rep_->table_options.max_auto_readahead_size;
|
||||
// FilePrefetchBuffer doesn't work in mmap mode and readahead is not
|
||||
// needed there.
|
||||
FilePrefetchBuffer prefetch_buffer(
|
||||
|
@ -65,9 +65,6 @@ class BlockBasedTable : public TableReader {
|
||||
|
||||
// All the below fields control iterator readahead
|
||||
static const size_t kInitAutoReadaheadSize = 8 * 1024;
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
static const size_t kMaxAutoReadaheadSize;
|
||||
static const int kMinNumFileReadsToStartAutoReadahead = 2;
|
||||
|
||||
// Attempt to open the table that is stored in bytes [0..file_size)
|
||||
|
@ -35,10 +35,23 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
|
||||
return;
|
||||
}
|
||||
|
||||
size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
|
||||
size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize;
|
||||
|
||||
// If max_auto_readahead_size is set to be 0 by user, no data will be
|
||||
// prefetched.
|
||||
if (max_auto_readahead_size == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (initial_auto_readahead_size > max_auto_readahead_size) {
|
||||
initial_auto_readahead_size = max_auto_readahead_size;
|
||||
}
|
||||
|
||||
if (rep->file->use_direct_io()) {
|
||||
rep->CreateFilePrefetchBufferIfNotExists(
|
||||
BlockBasedTable::kInitAutoReadaheadSize,
|
||||
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
|
||||
rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
|
||||
max_auto_readahead_size,
|
||||
&prefetch_buffer_);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -47,20 +60,24 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
|
||||
return;
|
||||
}
|
||||
|
||||
if (readahead_size_ > max_auto_readahead_size) {
|
||||
readahead_size_ = max_auto_readahead_size;
|
||||
}
|
||||
|
||||
// If prefetch is not supported, fall back to use internal prefetch buffer.
|
||||
// Discarding other return status of Prefetch calls intentionally, as
|
||||
// we can fallback to reading from disk if Prefetch fails.
|
||||
Status s = rep->file->Prefetch(handle.offset(), readahead_size_);
|
||||
if (s.IsNotSupported()) {
|
||||
rep->CreateFilePrefetchBufferIfNotExists(
|
||||
BlockBasedTable::kInitAutoReadaheadSize,
|
||||
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
|
||||
rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
|
||||
max_auto_readahead_size,
|
||||
&prefetch_buffer_);
|
||||
return;
|
||||
}
|
||||
readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_);
|
||||
|
||||
// Keep exponentially increasing readahead size until
|
||||
// kMaxAutoReadaheadSize.
|
||||
readahead_size_ =
|
||||
std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2);
|
||||
// max_auto_readahead_size.
|
||||
readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
|
||||
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
Loading…
Reference in New Issue
Block a user