MakeRoomForWrite() support for column families

Summary: Making room for write will be the hardest part of the column family implementation. For now, I just iterate through all column families and run MakeRoomForWrite() for every one.

Test Plan: make check does not complain

Reviewers: dhruba, haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15597
This commit is contained in:
Igor Canadi 2014-01-30 15:23:13 -08:00
parent c37e7de669
commit 6973bb1722
6 changed files with 76 additions and 71 deletions

View File

@ -17,9 +17,7 @@
namespace rocksdb { namespace rocksdb {
SuperVersion::SuperVersion(const int num_memtables) { SuperVersion::SuperVersion() {}
to_delete.resize(num_memtables);
}
SuperVersion::~SuperVersion() { SuperVersion::~SuperVersion() {
for (auto td : to_delete) { for (auto td : to_delete) {
@ -71,7 +69,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
imm_(options.min_write_buffer_number_to_merge), imm_(options.min_write_buffer_number_to_merge),
super_version_(nullptr), super_version_(nullptr),
super_version_number_(0), super_version_number_(0),
log_number_(0) {} log_number_(0),
need_slowdown_for_num_level0_files_(false) {}
ColumnFamilyData::~ColumnFamilyData() { ColumnFamilyData::~ColumnFamilyData() {
if (super_version_ != nullptr) { if (super_version_ != nullptr) {
@ -95,6 +94,13 @@ ColumnFamilyData::~ColumnFamilyData() {
} }
} }
void ColumnFamilyData::SetCurrent(Version* current) {
current_ = current;
need_slowdown_for_num_level0_files_ =
(options_.level0_slowdown_writes_trigger >= 0 &&
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
}
void ColumnFamilyData::CreateNewMemtable() { void ColumnFamilyData::CreateNewMemtable() {
assert(current_ != nullptr); assert(current_ != nullptr);
if (mem_ != nullptr) { if (mem_ != nullptr) {

View File

@ -36,7 +36,7 @@ struct SuperVersion {
std::vector<MemTable*> to_delete; std::vector<MemTable*> to_delete;
// should be called outside the mutex // should be called outside the mutex
explicit SuperVersion(const int num_memtables = 0); SuperVersion();
~SuperVersion(); ~SuperVersion();
SuperVersion* Ref(); SuperVersion* Ref();
// Returns true if this was the last reference and caller should // Returns true if this was the last reference and caller should
@ -72,7 +72,7 @@ class ColumnFamilyData {
Version* current() { return current_; } Version* current() { return current_; }
Version* dummy_versions() { return dummy_versions_; } Version* dummy_versions() { return dummy_versions_; }
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void SetCurrent(Version* current) { current_ = current; } void SetCurrent(Version* current);
void CreateNewMemtable(); void CreateNewMemtable();
SuperVersion* GetSuperVersion() const { return super_version_; } SuperVersion* GetSuperVersion() const { return super_version_; }
@ -85,6 +85,12 @@ class ColumnFamilyData {
// the clients to allocate SuperVersion outside of mutex. // the clients to allocate SuperVersion outside of mutex.
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
private: private:
uint32_t id_; uint32_t id_;
const std::string name_; const std::string name_;
@ -105,6 +111,10 @@ class ColumnFamilyData {
// Column Family. All earlier log files must be ignored and not // Column Family. All earlier log files must be ignored and not
// recovered from // recovered from
uint64_t log_number_; uint64_t log_number_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
}; };
// Thread safe only for reading without a writer. All access should be // Thread safe only for reading without a writer. All access should be

View File

@ -1298,8 +1298,7 @@ Status DBImpl::ReFitLevel(int level, int target_level) {
assert(level < NumberLevels()); assert(level < NumberLevels());
SuperVersion* superversion_to_free = nullptr; SuperVersion* superversion_to_free = nullptr;
SuperVersion* new_superversion = SuperVersion* new_superversion = new SuperVersion();
new SuperVersion(options_.max_write_buffer_number);
mutex_.Lock(); mutex_.Lock();
@ -2949,6 +2948,13 @@ std::vector<Status> DBImpl::MultiGet(
return statList; return statList;
} }
// TODO(icanadi) creating column family while writing will cause a data race.
// In write code path, we iterate through column families and call
// MakeRoomForWrite() for each. MakeRoomForWrite() can unlock the mutex
// and wait (delay the write). If we create or drop a column family when
// that mutex is unlocked for delay, that's bad.
// Solution TODO: enable iteration by chaining column families in
// circular linked lists
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name, const std::string& column_family_name,
ColumnFamilyHandle* handle) { ColumnFamilyHandle* handle) {
@ -3106,9 +3112,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
} }
Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
// May temporarily unlock and wait. // May temporarily unlock and wait.
SuperVersion* superversion_to_free = nullptr; status = MakeRoomForWrite(cfd, my_batch == nullptr);
Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free); if (!status.ok()) {
break;
}
}
uint64_t last_sequence = versions_->LastSequence(); uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w; Writer* last_writer = &w;
if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
@ -3209,7 +3220,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
writers_.front()->cv.Signal(); writers_.front()->cv.Signal();
} }
mutex_.Unlock(); mutex_.Unlock();
delete superversion_to_free;
return status; return status;
} }
@ -3295,8 +3305,7 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
// REQUIRES: mutex_ is held // REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue // REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force, Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
SuperVersion** superversion_to_free) {
mutex_.AssertHeld(); mutex_.AssertHeld();
assert(!writers_.empty()); assert(!writers_.empty());
bool allow_delay = !force; bool allow_delay = !force;
@ -3305,14 +3314,13 @@ Status DBImpl::MakeRoomForWrite(bool force,
uint64_t rate_limit_delay_millis = 0; uint64_t rate_limit_delay_millis = 0;
Status s; Status s;
double score; double score;
*superversion_to_free = nullptr;
while (true) { while (true) {
if (!bg_error_.ok()) { if (!bg_error_.ok()) {
// Yield previous error // Yield previous error
s = bg_error_; s = bg_error_;
break; break;
} else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) {
// We are getting close to hitting a hard limit on the number of // We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several // L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each // seconds when we hit the hard limit, start delaying each
@ -3320,9 +3328,9 @@ Status DBImpl::MakeRoomForWrite(bool force,
// this delay hands over some CPU to the compaction thread in // this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer. // case it is sharing the same core as the writer.
uint64_t slowdown = uint64_t slowdown =
SlowdownAmount(default_cfd_->current()->NumLevelFiles(0), SlowdownAmount(cfd->current()->NumLevelFiles(0),
options_.level0_slowdown_writes_trigger, cfd->options()->level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger); cfd->options()->level0_stop_writes_trigger);
mutex_.Unlock(); mutex_.Unlock();
uint64_t delayed; uint64_t delayed;
{ {
@ -3335,15 +3343,15 @@ Status DBImpl::MakeRoomForWrite(bool force,
allow_delay = false; // Do not delay a single write more than once allow_delay = false; // Do not delay a single write more than once
mutex_.Lock(); mutex_.Lock();
delayed_writes_++; delayed_writes_++;
} else if (!force && (default_cfd_->mem()->ApproximateMemoryUsage() <= } else if (!force && (cfd->mem()->ApproximateMemoryUsage() <=
options_.write_buffer_size)) { cfd->options()->write_buffer_size)) {
// There is room in current memtable // There is room in current memtable
if (allow_delay) { if (allow_delay) {
DelayLoggingAndReset(); DelayLoggingAndReset();
} }
break; break;
} else if (default_cfd_->imm()->size() == } else if (cfd->imm()->size() ==
options_.max_write_buffer_number - 1) { cfd->options()->max_write_buffer_number - 1) {
// We have filled up the current memtable, but the previous // We have filled up the current memtable, but the previous
// ones are still being compacted, so we wait. // ones are still being compacted, so we wait.
DelayLoggingAndReset(); DelayLoggingAndReset();
@ -3359,8 +3367,8 @@ Status DBImpl::MakeRoomForWrite(bool force,
STALL_MEMTABLE_COMPACTION_MICROS, stall); STALL_MEMTABLE_COMPACTION_MICROS, stall);
internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION, internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION,
stall); stall);
} else if (default_cfd_->current()->NumLevelFiles(0) >= } else if (cfd->current()->NumLevelFiles(0) >=
options_.level0_stop_writes_trigger) { cfd->options()->level0_stop_writes_trigger) {
// There are too many level-0 files. // There are too many level-0 files.
DelayLoggingAndReset(); DelayLoggingAndReset();
Log(options_.info_log, "wait for fewer level0 files...\n"); Log(options_.info_log, "wait for fewer level0 files...\n");
@ -3374,10 +3382,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall); internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall);
} else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 && } else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 &&
(score = default_cfd_->current()->MaxCompactionScore()) > (score = cfd->current()->MaxCompactionScore()) >
options_.hard_rate_limit) { cfd->options()->hard_rate_limit) {
// Delay a write when the compaction score for any level is too large. // Delay a write when the compaction score for any level is too large.
int max_level = default_cfd_->current()->MaxCompactionScoreLevel(); int max_level = cfd->current()->MaxCompactionScoreLevel();
mutex_.Unlock(); mutex_.Unlock();
uint64_t delayed; uint64_t delayed;
{ {
@ -3392,26 +3400,25 @@ Status DBImpl::MakeRoomForWrite(bool force,
rate_limit_delay_millis += rate_limit; rate_limit_delay_millis += rate_limit;
RecordTick(options_.statistics.get(), RecordTick(options_.statistics.get(),
RATE_LIMIT_DELAY_MILLIS, rate_limit); RATE_LIMIT_DELAY_MILLIS, rate_limit);
if (options_.rate_limit_delay_max_milliseconds > 0 && if (cfd->options()->rate_limit_delay_max_milliseconds > 0 &&
rate_limit_delay_millis >= rate_limit_delay_millis >=
(unsigned)options_.rate_limit_delay_max_milliseconds) { (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) {
allow_hard_rate_limit_delay = false; allow_hard_rate_limit_delay = false;
} }
mutex_.Lock(); mutex_.Lock();
} else if (allow_soft_rate_limit_delay && options_.soft_rate_limit > 0.0 && } else if (allow_soft_rate_limit_delay &&
(score = default_cfd_->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit > 0.0 &&
options_.soft_rate_limit) { (score = cfd->current()->MaxCompactionScore()) >
cfd->options()->soft_rate_limit) {
// Delay a write when the compaction score for any level is too large. // Delay a write when the compaction score for any level is too large.
// TODO: add statistics // TODO: add statistics
mutex_.Unlock(); mutex_.Unlock();
{ {
StopWatch sw(env_, options_.statistics.get(), StopWatch sw(env_, options_.statistics.get(),
SOFT_RATE_LIMIT_DELAY_COUNT); SOFT_RATE_LIMIT_DELAY_COUNT);
env_->SleepForMicroseconds(SlowdownAmount( env_->SleepForMicroseconds(
score, SlowdownAmount(score, cfd->options()->soft_rate_limit,
options_.soft_rate_limit, cfd->options()->hard_rate_limit));
options_.hard_rate_limit)
);
rate_limit_delay_millis += sw.ElapsedMicros(); rate_limit_delay_millis += sw.ElapsedMicros();
} }
allow_soft_rate_limit_delay = false; allow_soft_rate_limit_delay = false;
@ -3436,9 +3443,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
if (s.ok()) { if (s.ok()) {
// Our final size should be less than write_buffer_size // Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution. // (compression, etc) but err on the side of caution.
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); lfile->SetPreallocationBlockSize(1.1 *
memtmp = new MemTable(internal_comparator_, options_); cfd->options()->write_buffer_size);
new_superversion = new SuperVersion(options_.max_write_buffer_number); memtmp = new MemTable(internal_comparator_, *cfd->options());
new_superversion = new SuperVersion();
} }
} }
mutex_.Lock(); mutex_.Lock();
@ -3450,20 +3458,19 @@ Status DBImpl::MakeRoomForWrite(bool force,
} }
logfile_number_ = new_log_number; logfile_number_ = new_log_number;
log_.reset(new log::Writer(std::move(lfile))); log_.reset(new log::Writer(std::move(lfile)));
default_cfd_->mem()->SetNextLogNumber(logfile_number_); cfd->mem()->SetNextLogNumber(logfile_number_);
default_cfd_->imm()->Add(default_cfd_->mem()); cfd->imm()->Add(cfd->mem());
if (force) { if (force) {
default_cfd_->imm()->FlushRequested(); cfd->imm()->FlushRequested();
} }
memtmp->Ref(); memtmp->Ref();
memtmp->SetLogNumber(logfile_number_); memtmp->SetLogNumber(logfile_number_);
default_cfd_->SetMemtable(memtmp); cfd->SetMemtable(memtmp);
Log(options_.info_log, "New memtable created with log file: #%lu\n", Log(options_.info_log, "New memtable created with log file: #%lu\n",
(unsigned long)logfile_number_); (unsigned long)logfile_number_);
force = false; // Do not force another compaction if have room force = false; // Do not force another compaction if have room
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
*superversion_to_free = delete cfd->InstallSuperVersion(new_superversion);
default_cfd_->InstallSuperVersion(new_superversion);
} }
} }
return s; return s;

View File

@ -216,8 +216,7 @@ class DBImpl : public DB {
prev_log_number = 0; prev_log_number = 0;
memtables_to_free.reserve(num_memtables); memtables_to_free.reserve(num_memtables);
superversion_to_free = nullptr; superversion_to_free = nullptr;
new_superversion = new_superversion = create_superversion ? new SuperVersion() : nullptr;
create_superversion ? new SuperVersion(num_memtables) : nullptr;
} }
~DeletionState() { ~DeletionState() {
@ -303,11 +302,8 @@ class DBImpl : public DB {
uint64_t* filenumber); uint64_t* filenumber);
uint64_t SlowdownAmount(int n, double bottom, double top); uint64_t SlowdownAmount(int n, double bottom, double top);
// MakeRoomForWrite will return superversion_to_free through an arugment, Status MakeRoomForWrite(ColumnFamilyData* cfd,
// which the caller needs to delete. We do it because caller can delete bool force /* flush even if there is room? */);
// the superversion outside of mutex
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
SuperVersion** superversion_to_free);
void BuildBatchGroup(Writer** last_writer, void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group); autovector<WriteBatch*>* write_batch_group);

View File

@ -1377,7 +1377,6 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
log_number_(0), log_number_(0),
prev_log_number_(0), prev_log_number_(0),
num_levels_(options_->num_levels), num_levels_(options_->num_levels),
need_slowdown_for_num_level0_files_(false),
current_version_number_(0), current_version_number_(0),
manifest_file_size_(0), manifest_file_size_(0),
storage_options_(storage_options), storage_options_(storage_options),
@ -1413,9 +1412,6 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
current->Unref(); current->Unref();
} }
column_family_data->SetCurrent(v); column_family_data->SetCurrent(v);
need_slowdown_for_num_level0_files_ =
(options_->level0_slowdown_writes_trigger >= 0 &&
v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger);
v->Ref(); v->Ref();
// Append to linked list // Append to linked list

View File

@ -315,12 +315,6 @@ class VersionSet {
const EnvOptions& storage_options, const EnvOptions& storage_options,
int new_levels); int new_levels);
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
// Return the current manifest file number // Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; } uint64_t ManifestFileNumber() const { return manifest_file_number_; }
@ -482,10 +476,6 @@ class VersionSet {
// Opened lazily // Opened lazily
unique_ptr<log::Writer> descriptor_log_; unique_ptr<log::Writer> descriptor_log_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// An object that keeps all the compaction stats // An object that keeps all the compaction stats
// and picks the next compaction // and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_; std::unique_ptr<CompactionPicker> compaction_picker_;