Add DBOptions::skip_stats_update_on_db_open
Summary: UpdateAccumulatedStats() is used to optimize compaction decisions, especially when the number of deletion entries is high, but this function can slow down DB::Open(), especially in a disk environment. This patch adds DBOptions::skip_stats_update_on_db_open, which skips UpdateAccumulatedStats() at DB::Open() time when it is set to true.

Test Plan: Add DBCompactionTest.SkipStatsUpdateTest

Reviewers: igor, anthony, IslamAbdelRahman, sdong

Reviewed By: sdong

Subscribers: tnovak, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D42843
parent e2a3bfe74b
commit 14d0bfa429
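For context, a minimal usage sketch of the new option (the DB path and surrounding scaffolding are illustrative, not part of this commit):

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Skip UpdateAccumulatedStats() during DB::Open(), trading slightly less
  // accurate initial compaction scoring for a faster open.
  options.skip_stats_update_on_db_open = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/skip_stats_demo", &db);
  assert(s.ok());
  delete db;
  return 0;
}

The trade-off: until compactions touch the affected files, their num_deletions are unknown, so deletion-heavy files are not prioritized for compaction. The new tests below exercise exactly that behavior.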
@@ -3,6 +3,7 @@
 ## Unreleased

 ### New Features
+* Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction.
 * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex

 ### Public API Changes
@@ -196,12 +196,16 @@ const SstFileMetaData* PickFileRandomly(
 }  // anonymous namespace

 TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
-  for (int tid = 0; tid < 2; ++tid) {
+  for (int tid = 0; tid < 3; ++tid) {
     uint64_t db_size[2];
     Options options = CurrentOptions(DeletionTriggerOptions());

     if (tid == 1) {
-      // second pass with universal compaction
+      // the following only disables stats update in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
       options.compaction_style = kCompactionStyleUniversal;
       options.num_levels = 1;
     }
@@ -231,6 +235,64 @@ TEST_F(DBCompactionTest, CompactionDeletionTrigger) {
   }
 }

+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verifies that UpdateAccumulatedStats is not run by observing
+  // the compaction behavior when there are many deletion entries.
+  // The test will need to be updated if the internal behavior changes.
+
+  Options options = DeletionTriggerOptions();
+  options = CurrentOptions(options);
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(RandomString(&rnd, kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  uint64_t db_size[2];
+  db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+  for (int k = 0; k < kTestSize; ++k) {
+    ASSERT_OK(Delete(Key(k)));
+  }
+
+  // Reopen the DB with stats-update disabled.
+  options.skip_stats_update_on_db_open = true;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // As stats-update is disabled, we expect a very low
+  // number of random file opens.
+  ASSERT_LT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // As stats-update is disabled, we expect that the deletion
+  // entries are not properly processed.
+  ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+  // Repeat the reopen process, but this time we enable
+  // stats-update.
+  options.skip_stats_update_on_db_open = false;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+  // Since we do a normal stats update on db-open, there
+  // will be more random file opens.
+  ASSERT_GT(env_->random_file_open_counter_.load(), 5);
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+  db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+  // and this time we expect the deletion entries to be handled.
+  ASSERT_GT(db_size[0] / 3, db_size[1]);
+}
+
 TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
   for (int tid = 0; tid < 2; ++tid) {
     uint64_t db_size[3];
@@ -287,6 +349,63 @@ TEST_F(DBCompactionTest, CompactionDeletionTriggerReopen) {
   }
 }

+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // As auto-compaction is off, we shouldn't see much reduction
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If stats update on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+      ASSERT_LT(db_size[0] / 3, db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      ASSERT_GT(db_size[0] / 3, db_size[2]);
+    }
+  }
+}
+
 TEST_F(DBCompactionTest, CompactionTrigger) {
   Options options;
   options.write_buffer_size = 100 << 10;  // 100KB
@@ -873,8 +873,10 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
   }
 }

-void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
-  UpdateAccumulatedStats();
+void Version::PrepareApply(
+    const MutableCFOptions& mutable_cf_options,
+    bool update_stats) {
+  UpdateAccumulatedStats(update_stats);
   storage_info_.UpdateNumNonEmptyLevels();
   storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
   storage_info_.UpdateFilesBySize();
@@ -917,42 +919,45 @@ void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
   num_samples_++;
 }

-void Version::UpdateAccumulatedStats() {
-  // maximum number of table properties loaded from files.
-  const int kMaxInitCount = 20;
-  int init_count = 0;
-  // here only the first kMaxInitCount files which haven't been
-  // initialized from file will be updated with num_deletions.
-  // The motivation here is to cap the maximum I/O per Version creation.
-  // The reason for choosing files from lower-level instead of higher-level
-  // is that such design is able to propagate the initialization from
-  // lower-level to higher-level: When the num_deletions of lower-level
-  // files are updated, it will make the lower-level files have accurate
-  // compensated_file_size, making lower-level to higher-level compaction
-  // will be triggered, which creates higher-level files whose num_deletions
-  // will be updated here.
-  for (int level = 0;
-       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
-       ++level) {
-    for (auto* file_meta : storage_info_.files_[level]) {
-      if (MaybeInitializeFileMetaData(file_meta)) {
-        // each FileMeta will be initialized only once.
-        storage_info_.UpdateAccumulatedStats(file_meta);
-        if (++init_count >= kMaxInitCount) {
-          break;
-        }
-      }
-    }
-  }
-  // In case all sampled-files contain only deletion entries, then we
-  // load the table-property of a file in higher-level to initialize
-  // that value.
-  for (int level = storage_info_.num_levels_ - 1;
-       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
-    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
-         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
-      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
-        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
-      }
-    }
-  }
-}
+void Version::UpdateAccumulatedStats(bool update_stats) {
+  if (update_stats) {
+    // maximum number of table properties loaded from files.
+    const int kMaxInitCount = 20;
+    int init_count = 0;
+    // here only the first kMaxInitCount files which haven't been
+    // initialized from file will be updated with num_deletions.
+    // The motivation here is to cap the maximum I/O per Version creation.
+    // The reason for choosing files from lower-level instead of higher-level
+    // is that such design is able to propagate the initialization from
+    // lower-level to higher-level: When the num_deletions of lower-level
+    // files are updated, it will make the lower-level files have accurate
+    // compensated_file_size, making lower-level to higher-level compaction
+    // will be triggered, which creates higher-level files whose num_deletions
+    // will be updated here.
+    for (int level = 0;
+         level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+         ++level) {
+      for (auto* file_meta : storage_info_.files_[level]) {
+        if (MaybeInitializeFileMetaData(file_meta)) {
+          // each FileMeta will be initialized only once.
+          storage_info_.UpdateAccumulatedStats(file_meta);
+          if (++init_count >= kMaxInitCount) {
+            break;
+          }
+        }
+      }
+    }
+    // In case all sampled-files contain only deletion entries, then we
+    // load the table-property of a file in higher-level to initialize
+    // that value.
+    for (int level = storage_info_.num_levels_ - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+         --level) {
+      for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+           storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+        if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+          storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+        }
+      }
+    }
+  }
+}
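The loop above caps table-property loads at kMaxInitCount per Version creation and scans from the lowest level up, so initialization propagates upward through compactions. A self-contained sketch of that sampling pattern, using simplified stand-in types rather than the real VersionStorageInfo/FileMetaData classes:

#include <cstdint>
#include <vector>

struct FileMeta {
  bool init_stats_from_file = false;  // set once table properties are loaded
  uint64_t num_deletions = 0;
};

// Initialize at most max_init_count files per call, visiting levels
// bottom-up, to bound the random reads a single Version creation can issue.
void SampleFileStats(std::vector<std::vector<FileMeta*>>& levels,
                     int max_init_count) {
  int init_count = 0;
  for (size_t level = 0; level < levels.size() && init_count < max_init_count;
       ++level) {
    for (FileMeta* meta : levels[level]) {
      if (!meta->init_stats_from_file) {
        meta->init_stats_from_file = true;  // stand-in for the table read
        if (++init_count >= max_init_count) {
          break;
        }
      }
    }
  }
}

In the real code, MaybeInitializeFileMetaData() performs the table read; the cap keeps per-open I/O bounded even without the new option, which is why skipping the whole pass only matters when the DB has many column families or versions to recover.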
@@ -1967,7 +1972,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,

   if (!edit->IsColumnFamilyManipulation()) {
     // This is cpu-heavy operations, which should be called outside mutex.
-    v->PrepareApply(mutable_cf_options);
+    v->PrepareApply(mutable_cf_options, true);
   }

   // Write new record to MANIFEST log
@@ -2398,7 +2403,8 @@ Status VersionSet::Recover(
       builder->SaveTo(v->storage_info());

       // Install recovered version
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+                      !(db_options_->skip_stats_update_on_db_open));
       AppendVersion(cfd, v);
     }
@@ -2748,7 +2754,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,

       Version* v = new Version(cfd, this, current_version_number_++);
       builder->SaveTo(v->storage_info());
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);

       printf("--------------- Column family \"%s\" (ID %u) --------------\n",
              cfd->GetName().c_str(), (unsigned int)cfd->GetID());
@@ -417,7 +417,8 @@ class Version {

  // Loads some stats information from files. Call without mutex held. It needs
  // to be called before applying the version to the version set.
-  void PrepareApply(const MutableCFOptions& mutable_cf_options);
+  void PrepareApply(const MutableCFOptions& mutable_cf_options,
+                    bool update_stats);

  // Reference count management (so Versions do not disappear out from
  // under live iterators)
@@ -490,7 +491,7 @@ class Version {

  // Update the accumulated stats associated with the current version.
  // This accumulated stats will be used in compaction.
-  void UpdateAccumulatedStats();
+  void UpdateAccumulatedStats(bool update_stats);

  // Sort all files for this version based on their file size and
  // record results in files_by_size_. The largest files are listed first.
@@ -1058,6 +1058,14 @@ struct DBOptions {
  // Default: 1MB/s
  uint64_t delayed_write_rate;

+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decisions by loading table properties from many files.
+  // Turning off this feature will improve DBOpen time, especially in
+  // disk environments.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open;
+
  // Recovery mode to control the consistency while replaying WAL
  // Default: kTolerateCorruptedTailRecords
  WALRecoveryMode wal_recovery_mode;
@@ -28,6 +28,7 @@ SpecialEnv::SpecialEnv(Env* base)
   manifest_sync_error_.store(false, std::memory_order_release);
   manifest_write_error_.store(false, std::memory_order_release);
   log_write_error_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
   log_write_slowdown_ = 0;
   bytes_written_ = 0;
   sync_counter_ = 0;
@@ -281,6 +281,7 @@ class SpecialEnv : public EnvWrapper {
    };

    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
    if (s.ok() && count_random_reads_) {
      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
    }
|
|||||||
|
|
||||||
bool count_random_reads_;
|
bool count_random_reads_;
|
||||||
anon::AtomicCounter random_read_counter_;
|
anon::AtomicCounter random_read_counter_;
|
||||||
|
std::atomic<int> random_file_open_counter_;
|
||||||
|
|
||||||
bool count_sequential_reads_;
|
bool count_sequential_reads_;
|
||||||
anon::AtomicCounter sequential_read_counter_;
|
anon::AtomicCounter sequential_read_counter_;
|
||||||
|
@ -239,6 +239,7 @@ DBOptions::DBOptions()
|
|||||||
listeners(),
|
listeners(),
|
||||||
enable_thread_tracking(false),
|
enable_thread_tracking(false),
|
||||||
delayed_write_rate(1024U * 1024U),
|
delayed_write_rate(1024U * 1024U),
|
||||||
|
skip_stats_update_on_db_open(false),
|
||||||
wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
|
wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -287,6 +288,7 @@ DBOptions::DBOptions(const Options& options)
|
|||||||
listeners(options.listeners),
|
listeners(options.listeners),
|
||||||
enable_thread_tracking(options.enable_thread_tracking),
|
enable_thread_tracking(options.enable_thread_tracking),
|
||||||
delayed_write_rate(options.delayed_write_rate),
|
delayed_write_rate(options.delayed_write_rate),
|
||||||
|
skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
|
||||||
wal_recovery_mode(options.wal_recovery_mode),
|
wal_recovery_mode(options.wal_recovery_mode),
|
||||||
row_cache(options.row_cache) {}
|
row_cache(options.row_cache) {}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user