Make BlockBasedTable::kMaxAutoReadAheadSize configurable (#7951)
Summary:
RocksDB does auto-readahead for iterators on noticing more than two reads for a table file. The readahead starts at 8KB and doubles on every additional read, up to BlockBasedTable::kMaxAutoReadAheadSize, which is 256 * 1024. This PR adds a new option, BlockBasedTableOptions::max_auto_readahead_size, which replaces BlockBasedTable::kMaxAutoReadAheadSize and can be configured. If max_auto_readahead_size is set to 0, no implicit auto prefetching will be done. If the max_auto_readahead_size provided is less than 8KB (the initial readahead size RocksDB uses for auto-readahead), the readahead size will remain equal to max_auto_readahead_size.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7951

Test Plan: Add new unit test case.

Reviewed By: anand1976

Differential Revision: D26568085

Pulled By: akankshamahajan15

fbshipit-source-id: b6543520fc74e97d859f2002328d4c5254d417af
commit cd79a00903 (parent e017af15c1)
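As a usage illustration (not part of the commit), here is a minimal sketch of how the new option can be set when opening a DB and then lowered dynamically through DB::SetOptions. The database path and the 64 KB value are made up for the example.

// Minimal sketch: cap auto-readahead at open time, then change it dynamically.
// The path "/tmp/readahead_demo" and the 64 KB cap are illustrative only.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::BlockBasedTableOptions table_options;
  // Cap auto-readahead at 64 KB instead of the 256 KB default.
  table_options.max_auto_readahead_size = 64 * 1024;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/readahead_demo", &db);
  assert(s.ok());

  // The option is mutable: this disables implicit auto-readahead, but only
  // for table files opened after the change.
  s = db->SetOptions(
      {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}});
  assert(s.ok());

  delete db;
  return 0;
}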
@@ -1,5 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
+### Public API change
+* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if the user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size, which can now also be configured dynamically. Based on experiments, a 256 KB readahead size was found to provide the best performance for auto readahead; experiment data is in PR #3282. If the value is set to 0, no automatic prefetching will be done by RocksDB. Changing the value will only affect files opened after the change.

 ## 6.18.0 (02/19/2021)
 ### Behavior Changes
@@ -175,9 +175,154 @@ TEST_P(PrefetchTest, Basic) {
   Close();
 }

+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
+  // First param is if the mockFS support_prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.max_auto_readahead_size = 0;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+
+  // DB open will create table readers unless we reduce the table cache
+  // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
+  // cache is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 10 so table cache capacity will become 0. This will
+  // prevent file open during DB open and force the file to be opened during
+  // Iteration.
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  Random rnd(309);
+  int key_count = 0;
+  const int num_keys_per_level = 100;
+  // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
+  for (int level = 2; level >= 0; level--) {
+    key_count = level * num_keys_per_level;
+    for (int i = 0; i < num_keys_per_level; ++i) {
+      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  Close();
+  std::vector<int> buff_prefectch_level_count = {0, 0, 0};
+  TryReopen(options);
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    fs->ClearPrefetchCount();
+    buff_prefetch_count = 0;
+
+    for (int level = 2; level >= 0; level--) {
+      key_count = level * num_keys_per_level;
+      switch (level) {
+        case 0:
+          // max_auto_readahead_size is set 0 so data and index blocks are not
+          // prefetched.
+          ASSERT_OK(db_->SetOptions(
+              {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
+          break;
+        case 1:
+          // max_auto_readahead_size is set less than
+          // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains
+          // equal to max_auto_readahead_size.
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=4096;}"}}));
+          break;
+        case 2:
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=65536;}"}}));
+          break;
+        default:
+          assert(false);
+      }
+
+      for (int i = 0; i < num_keys_per_level; ++i) {
+        iter->Seek(Key(key_count++));
+        iter->Next();
+      }
+
+      buff_prefectch_level_count[level] = buff_prefetch_count;
+      if (support_prefetch && !use_direct_io) {
+        if (level == 0) {
+          ASSERT_FALSE(fs->IsPrefetchCalled());
+        } else {
+          ASSERT_TRUE(fs->IsPrefetchCalled());
+        }
+        fs->ClearPrefetchCount();
+      } else {
+        ASSERT_FALSE(fs->IsPrefetchCalled());
+        if (level == 0) {
+          ASSERT_EQ(buff_prefetch_count, 0);
+        } else {
+          ASSERT_GT(buff_prefetch_count, 0);
+        }
+        buff_prefetch_count = 0;
+      }
+    }
+  }
+
+  if (!support_prefetch) {
+    ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
 INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
                         ::testing::Combine(::testing::Bool(),
                                            ::testing::Bool()));
+#endif  // !ROCKSDB_LITE
+
+class PrefetchTest1 : public DBTestBase,
+                      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefetchTest1() : DBTestBase("/prefetch_test1", true) {}
+};
+
 }  // namespace ROCKSDB_NAMESPACE
@@ -435,6 +435,33 @@ struct BlockBasedTableOptions {

   IndexShorteningMode index_shortening =
       IndexShorteningMode::kShortenSeparators;

+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if the user doesn't provide readahead_size. The readahead
+  // starts at 8KB and doubles on every additional read up to
+  // max_auto_readahead_size, and max_auto_readahead_size can be configured.
+  //
+  // Special Value: 0 - If max_auto_readahead_size is set to 0, then no
+  // implicit auto prefetching will be done. If the max_auto_readahead_size
+  // provided is less than 8KB (the initial readahead size used by RocksDB for
+  // auto-readahead), the readahead size will remain the same as
+  // max_auto_readahead_size.
+  //
+  // The value should be given in bytes (e.g. 256 * 1024), not as a number of
+  // KB, since it is used directly as the number of bytes to prefetch.
+  //
+  // Based on experiments, a 256 KB readahead size was found to provide the
+  // best performance for auto readahead. Experiment data is in PR #3282.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{max_auto_readahead_size=0;}"}});
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 256 KB (256 * 1024).
+  size_t max_auto_readahead_size = 256 * 1024;
 };

 // Table Properties that are specific to block-based table properties.
@@ -179,7 +179,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
              "hash_index_allow_collision=false;"
              "verify_compression=true;read_amp_bytes_per_bit=0;"
              "enable_index_compression=false;"
-             "block_align=true",
+             "block_align=true;"
+             "max_auto_readahead_size=0",
              new_bbto));

   ASSERT_EQ(unset_bytes_base,
@@ -415,6 +415,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
            auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr);
            return Cache::CreateFromString(opts, value, cache);
          }}},
+    {"max_auto_readahead_size",
+     {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal,
+      OptionTypeFlags::kMutable}},
 #endif  // ROCKSDB_LITE
 };

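Because the hunk above registers max_auto_readahead_size as a mutable kSizeT option, it should also be reachable through RocksDB's options-string helpers. A small sketch under that assumption, using GetBlockBasedTableOptionsFromString (declared in rocksdb/convenience.h, if I recall correctly); the 128 KB value is just an example.

// Sketch only: parse max_auto_readahead_size from an options string.
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::BlockBasedTableOptions base;
  rocksdb::BlockBasedTableOptions parsed;
  // Start from the defaults in `base` and override only the readahead cap.
  rocksdb::Status s = rocksdb::GetBlockBasedTableOptionsFromString(
      base, "max_auto_readahead_size=131072", &parsed);
  assert(s.ok());
  assert(parsed.max_auto_readahead_size == 131072);
  return 0;
}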
@@ -687,6 +691,9 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
   snprintf(buffer, kBufferSize, "  block_align: %d\n",
            table_options_.block_align);
   ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.max_auto_readahead_size);
   return ret;
 }

@@ -67,11 +67,6 @@ extern const uint64_t kBlockBasedTableMagicNumber;
 extern const std::string kHashIndexPrefixesBlock;
 extern const std::string kHashIndexPrefixesMetadataBlock;

-
-// Found that 256 KB readahead size provides the best performance, based on
-// experiments, for auto readahead. Experiment data is in PR #3282.
-const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
-
 BlockBasedTable::~BlockBasedTable() {
   delete rep_;
 }
@@ -2921,7 +2916,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
   // increasing of the buffer size.
   size_t readahead_size = (read_options.readahead_size != 0)
                               ? read_options.readahead_size
-                              : kMaxAutoReadaheadSize;
+                              : rep_->table_options.max_auto_readahead_size;
   // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
   // needed there.
   FilePrefetchBuffer prefetch_buffer(
@@ -65,9 +65,6 @@ class BlockBasedTable : public TableReader {

   // All the below fields control iterator readahead
   static const size_t kInitAutoReadaheadSize = 8 * 1024;
-  // Found that 256 KB readahead size provides the best performance, based on
-  // experiments, for auto readahead. Experiment data is in PR #3282.
-  static const size_t kMaxAutoReadaheadSize;
   static const int kMinNumFileReadsToStartAutoReadahead = 2;

   // Attempt to open the table that is stored in bytes [0..file_size)
@@ -35,10 +35,23 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
     return;
   }

+  size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
+  size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize;
+
+  // If max_auto_readahead_size is set to be 0 by user, no data will be
+  // prefetched.
+  if (max_auto_readahead_size == 0) {
+    return;
+  }
+
+  if (initial_auto_readahead_size > max_auto_readahead_size) {
+    initial_auto_readahead_size = max_auto_readahead_size;
+  }
+
   if (rep->file->use_direct_io()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        BlockBasedTable::kInitAutoReadaheadSize,
-        BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
+    rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
+                                             max_auto_readahead_size,
+                                             &prefetch_buffer_);
     return;
   }
@@ -47,20 +60,24 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
     return;
   }

+  if (readahead_size_ > max_auto_readahead_size) {
+    readahead_size_ = max_auto_readahead_size;
+  }
+
   // If prefetch is not supported, fall back to use internal prefetch buffer.
   // Discarding other return status of Prefetch calls intentionally, as
   // we can fallback to reading from disk if Prefetch fails.
   Status s = rep->file->Prefetch(handle.offset(), readahead_size_);
   if (s.IsNotSupported()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        BlockBasedTable::kInitAutoReadaheadSize,
-        BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
+    rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
+                                             max_auto_readahead_size,
+                                             &prefetch_buffer_);
     return;
   }
   readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_);

   // Keep exponentially increasing readahead size until
-  // kMaxAutoReadaheadSize.
-  readahead_size_ =
-      std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2);
+  // max_auto_readahead_size.
+  readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
 }
 }  // namespace ROCKSDB_NAMESPACE
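To make the prefetcher change concrete, here is a standalone model (illustrative only, not RocksDB code) of how readahead_size now evolves: it starts at min(kInitAutoReadaheadSize, max_auto_readahead_size), doubles on each subsequent prefetch, and is capped at max_auto_readahead_size; a cap of 0 disables auto-readahead entirely. The 64 KB cap below is an arbitrary example.

// Standalone illustration of the readahead growth in
// BlockPrefetcher::PrefetchIfNeeded above.
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t kInitAutoReadaheadSize = 8 * 1024;    // initial readahead (8 KB)
  const size_t max_auto_readahead_size = 64 * 1024;  // example cap (64 KB)

  if (max_auto_readahead_size == 0) {
    // A cap of 0 disables implicit auto-readahead entirely.
    return 0;
  }

  // The starting size is clamped to the cap, so a cap below 8 KB keeps the
  // readahead size equal to the cap.
  size_t readahead_size =
      std::min(kInitAutoReadaheadSize, max_auto_readahead_size);

  // Each subsequent prefetch doubles the size, up to the cap:
  // 8 KB -> 16 KB -> 32 KB -> 64 KB -> 64 KB -> ...
  for (int prefetch = 0; prefetch < 6; ++prefetch) {
    std::printf("prefetch %d: %zu bytes\n", prefetch, readahead_size);
    readahead_size = std::min(max_auto_readahead_size, readahead_size * 2);
  }
  return 0;
}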