group multiple batch of flush into one manifest file (one call to LogAndApply)
Summary: Currently, if several flush outputs are committed together, we issue each manifest write per batch (1 batch = 1 flush = 1 sst file = 1+ continuous memtables). Each manifest write requires one fsync and one fsync to parent directory. In some cases, it becomes the bottleneck of write. We should batch them and write in one manifest write when possible. Test Plan: ` ./db_bench -benchmarks="fillseq" -max_write_buffer_number=16 -max_background_flushes=16 -disable_auto_compactions=true -min_write_buffer_number_to_merge=1 -write_buffer_size=65536 -level0_stop_writes_trigger=10000 -level0_slowdown_writes_trigger=10000` **Before** ``` Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 4.9 Date: Fri Jul 1 15:38:17 2016 CPU: 32 * Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz CPUCache: 20480 KB Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) Write rate: 0 bytes/second Compression: Snappy Memtablerep: skip_list Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags DB path: [/tmp/rocksdbtest-112628/dbbench] fillseq : 166.277 micros/op 6014 ops/sec; 0.7 MB/s ``` **After** ``` Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 4.9 Date: Fri Jul 1 15:35:05 2016 CPU: 32 * Intel(R) Xeon(R) CPU E5-2660 0 @ 2.20GHz CPUCache: 20480 KB Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) Write rate: 0 bytes/second Compression: Snappy Memtablerep: skip_list Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags DB path: [/tmp/rocksdbtest-112628/dbbench] fillseq : 52.328 micros/op 19110 ops/sec; 2.1 MB/s ``` Reviewers: andrewkr, IslamAbdelRahman, yhchiang, sdong Reviewed By: sdong Subscribers: igor, andrewkr, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D60075
This commit is contained in:
parent
a45ee83181
commit
5aaef91d4a
@ -306,17 +306,28 @@ Status MemTableList::InstallMemtableFlushResults(
|
||||
// scan all memtables from the earliest, and commit those
|
||||
// (in that order) that have finished flushing. Memetables
|
||||
// are always committed in the order that they were created.
|
||||
while (!current_->memlist_.empty() && s.ok()) {
|
||||
MemTable* m = current_->memlist_.back(); // get the last element
|
||||
uint64_t batch_file_number = 0;
|
||||
size_t batch_count = 0;
|
||||
autovector<VersionEdit*> edit_list;
|
||||
auto& memlist = current_->memlist_;
|
||||
// enumerate from the last (earliest) element to see how many batch finished
|
||||
for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
|
||||
MemTable* m = *it;
|
||||
if (!m->flush_completed_) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
|
||||
batch_file_number = m->file_number_;
|
||||
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started",
|
||||
cfd->GetName().c_str(), m->file_number_);
|
||||
edit_list.push_back(&m->edit_);
|
||||
}
|
||||
batch_count++;
|
||||
}
|
||||
|
||||
if (batch_count > 0) {
|
||||
// this can release and reacquire the mutex.
|
||||
s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory);
|
||||
s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, db_directory);
|
||||
|
||||
// we will be changing the version in the next code path,
|
||||
// so we better create a new one, since versions are immutable
|
||||
@ -325,14 +336,19 @@ Status MemTableList::InstallMemtableFlushResults(
|
||||
// All the later memtables that have the same filenum
|
||||
// are part of the same batch. They can be committed now.
|
||||
uint64_t mem_id = 1; // how many memtables have been flushed.
|
||||
do {
|
||||
if (s.ok()) { // commit new state
|
||||
while (batch_count-- > 0) {
|
||||
MemTable* m = current_->memlist_.back();
|
||||
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
|
||||
": memtable #%" PRIu64 " done",
|
||||
cfd->GetName().c_str(), m->file_number_, mem_id);
|
||||
assert(m->file_number_ > 0);
|
||||
current_->Remove(m, to_delete);
|
||||
++mem_id;
|
||||
}
|
||||
} else {
|
||||
for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) {
|
||||
MemTable* m = *it;
|
||||
// commit failed. setup state so that we can flush again.
|
||||
LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
|
||||
": memtable #%" PRIu64 " failed",
|
||||
@ -343,10 +359,9 @@ Status MemTableList::InstallMemtableFlushResults(
|
||||
num_flush_not_started_++;
|
||||
m->file_number_ = 0;
|
||||
imm_flush_needed.store(true, std::memory_order_release);
|
||||
}
|
||||
++mem_id;
|
||||
} while (!current_->memlist_.empty() && (nullptr != (m = current_->memlist_.back())) &&
|
||||
(m->file_number_ == file_number));
|
||||
}
|
||||
}
|
||||
}
|
||||
commit_in_progress_ = false;
|
||||
return s;
|
||||
|
@ -2077,11 +2077,11 @@ struct VersionSet::ManifestWriter {
|
||||
bool done;
|
||||
InstrumentedCondVar cv;
|
||||
ColumnFamilyData* cfd;
|
||||
VersionEdit* edit;
|
||||
const autovector<VersionEdit*>& edit_list;
|
||||
|
||||
explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
|
||||
VersionEdit* e)
|
||||
: done(false), cv(mu), cfd(_cfd), edit(e) {}
|
||||
const autovector<VersionEdit*>& e)
|
||||
: done(false), cv(mu), cfd(_cfd), edit_list(e) {}
|
||||
};
|
||||
|
||||
VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options,
|
||||
@ -2150,20 +2150,32 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
|
||||
|
||||
Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
const MutableCFOptions& mutable_cf_options,
|
||||
VersionEdit* edit, InstrumentedMutex* mu,
|
||||
Directory* db_directory, bool new_descriptor_log,
|
||||
const autovector<VersionEdit*>& edit_list,
|
||||
InstrumentedMutex* mu, Directory* db_directory,
|
||||
bool new_descriptor_log,
|
||||
const ColumnFamilyOptions* new_cf_options) {
|
||||
mu->AssertHeld();
|
||||
// num of edits
|
||||
auto num_edits = edit_list.size();
|
||||
if (num_edits == 0) {
|
||||
return Status::OK();
|
||||
} else if (num_edits > 1) {
|
||||
// no group commits for column family add or drop
|
||||
for (auto& edit : edit_list) {
|
||||
assert(!edit->IsColumnFamilyManipulation());
|
||||
}
|
||||
}
|
||||
|
||||
// column_family_data can be nullptr only if this is column_family_add.
|
||||
// in that case, we also need to specify ColumnFamilyOptions
|
||||
if (column_family_data == nullptr) {
|
||||
assert(edit->is_column_family_add_);
|
||||
assert(num_edits == 1);
|
||||
assert(edit_list[0]->is_column_family_add_);
|
||||
assert(new_cf_options != nullptr);
|
||||
}
|
||||
|
||||
// queue our request
|
||||
ManifestWriter w(mu, column_family_data, edit);
|
||||
ManifestWriter w(mu, column_family_data, edit_list);
|
||||
manifest_writers_.push_back(&w);
|
||||
while (!w.done && &w != manifest_writers_.front()) {
|
||||
w.cv.Wait();
|
||||
@ -2183,7 +2195,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
return Status::ShutdownInProgress();
|
||||
}
|
||||
|
||||
std::vector<VersionEdit*> batch_edits;
|
||||
autovector<VersionEdit*> batch_edits;
|
||||
Version* v = nullptr;
|
||||
std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(nullptr);
|
||||
|
||||
@ -2191,24 +2203,26 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
ManifestWriter* last_writer = &w;
|
||||
assert(!manifest_writers_.empty());
|
||||
assert(manifest_writers_.front() == &w);
|
||||
if (edit->IsColumnFamilyManipulation()) {
|
||||
if (w.edit_list.front()->IsColumnFamilyManipulation()) {
|
||||
// no group commits for column family add or drop
|
||||
LogAndApplyCFHelper(edit);
|
||||
batch_edits.push_back(edit);
|
||||
LogAndApplyCFHelper(w.edit_list.front());
|
||||
batch_edits.push_back(w.edit_list.front());
|
||||
} else {
|
||||
v = new Version(column_family_data, this, current_version_number_++);
|
||||
builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data));
|
||||
auto* builder = builder_guard->version_builder();
|
||||
for (const auto& writer : manifest_writers_) {
|
||||
if (writer->edit->IsColumnFamilyManipulation() ||
|
||||
if (writer->edit_list.front()->IsColumnFamilyManipulation() ||
|
||||
writer->cfd->GetID() != column_family_data->GetID()) {
|
||||
// no group commits for column family add or drop
|
||||
// also, group commits across column families are not supported
|
||||
break;
|
||||
}
|
||||
last_writer = writer;
|
||||
LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu);
|
||||
batch_edits.push_back(last_writer->edit);
|
||||
for (const auto& edit : writer->edit_list) {
|
||||
LogAndApplyHelper(column_family_data, builder, v, edit, mu);
|
||||
batch_edits.push_back(edit);
|
||||
}
|
||||
}
|
||||
builder->SaveTo(v->storage_info());
|
||||
}
|
||||
@ -2231,7 +2245,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
if (new_descriptor_log) {
|
||||
// if we're writing out new snapshot make sure to persist max column family
|
||||
if (column_family_set_->GetMaxColumnFamily() > 0) {
|
||||
edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
|
||||
w.edit_list.front()->SetMaxColumnFamily(
|
||||
column_family_set_->GetMaxColumnFamily());
|
||||
}
|
||||
}
|
||||
|
||||
@ -2242,7 +2257,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
mu->Unlock();
|
||||
|
||||
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
|
||||
if (!edit->IsColumnFamilyManipulation() &&
|
||||
if (!w.edit_list.front()->IsColumnFamilyManipulation() &&
|
||||
db_options_->max_open_files == -1) {
|
||||
// unlimited table cache. Pre-load table handle now.
|
||||
// Need to do it out of the mutex.
|
||||
@ -2268,12 +2283,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
|
||||
unique_ptr<WritableFileWriter> file_writer(
|
||||
new WritableFileWriter(std::move(descriptor_file), opt_env_opts));
|
||||
descriptor_log_.reset(new log::Writer(std::move(file_writer), 0, false));
|
||||
descriptor_log_.reset(
|
||||
new log::Writer(std::move(file_writer), 0, false));
|
||||
s = WriteSnapshot(descriptor_log_.get());
|
||||
}
|
||||
}
|
||||
|
||||
if (!edit->IsColumnFamilyManipulation()) {
|
||||
if (!w.edit_list.front()->IsColumnFamilyManipulation()) {
|
||||
// This is cpu-heavy operations, which should be called outside mutex.
|
||||
v->PrepareApply(mutable_cf_options, true);
|
||||
}
|
||||
@ -2315,7 +2331,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
new_manifest_file_size = descriptor_log_->file()->GetFileSize();
|
||||
}
|
||||
|
||||
if (edit->is_column_family_drop_) {
|
||||
if (w.edit_list.front()->is_column_family_drop_) {
|
||||
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
|
||||
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
|
||||
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
|
||||
@ -2335,12 +2351,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
|
||||
// Install the new version
|
||||
if (s.ok()) {
|
||||
if (edit->is_column_family_add_) {
|
||||
if (w.edit_list.front()->is_column_family_add_) {
|
||||
// no group commit on column family add
|
||||
assert(batch_edits.size() == 1);
|
||||
assert(new_cf_options != nullptr);
|
||||
CreateColumnFamily(*new_cf_options, edit);
|
||||
} else if (edit->is_column_family_drop_) {
|
||||
CreateColumnFamily(*new_cf_options, w.edit_list.front());
|
||||
} else if (w.edit_list.front()->is_column_family_drop_) {
|
||||
assert(batch_edits.size() == 1);
|
||||
column_family_data->SetDropped();
|
||||
if (column_family_data->Unref()) {
|
||||
@ -2363,7 +2379,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
|
||||
|
||||
manifest_file_number_ = pending_manifest_file_number_;
|
||||
manifest_file_size_ = new_manifest_file_size;
|
||||
prev_log_number_ = edit->prev_log_number_;
|
||||
prev_log_number_ = w.edit_list.front()->prev_log_number_;
|
||||
} else {
|
||||
std::string version_edits;
|
||||
for (auto& e : batch_edits) {
|
||||
|
@ -589,6 +589,19 @@ class VersionSet {
|
||||
const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
|
||||
InstrumentedMutex* mu, Directory* db_directory = nullptr,
|
||||
bool new_descriptor_log = false,
|
||||
const ColumnFamilyOptions* column_family_options = nullptr) {
|
||||
autovector<VersionEdit*> edit_list;
|
||||
edit_list.push_back(edit);
|
||||
return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu,
|
||||
db_directory, new_descriptor_log, column_family_options);
|
||||
}
|
||||
// The batch version. If edit_list.size() > 1, caller must ensure that
|
||||
// no edit in the list column family add or drop
|
||||
Status LogAndApply(
|
||||
ColumnFamilyData* column_family_data,
|
||||
const MutableCFOptions& mutable_cf_options,
|
||||
const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
|
||||
Directory* db_directory = nullptr, bool new_descriptor_log = false,
|
||||
const ColumnFamilyOptions* column_family_options = nullptr);
|
||||
|
||||
// Recover the last saved descriptor from persistent storage.
|
||||
|
Loading…
Reference in New Issue
Block a user