diff --git a/db/db_impl.cc b/db/db_impl.cc
index 7bc46c2b4..9cff6991d 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -2421,6 +2421,40 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
   return result;
 }
 
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if num_level_files < level0_slowdown_writes_trigger, return 0;
+// if num_level_files >= level0_stop_writes_trigger, return 1000;
+// otherwise, let r = (num_level_files - level0_slowdown) /
+//                    (level0_stop - level0_slowdown)
+// and return r^2 * 1000.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+uint64_t DBImpl::SlowdownAmount(int num_level0_files) {
+  uint64_t delay;
+  int stop_trigger = options_.level0_stop_writes_trigger;
+  int slowdown_trigger = options_.level0_slowdown_writes_trigger;
+  if (num_level0_files >= stop_trigger) {
+    delay = 1000;
+  }
+  else if (num_level0_files < slowdown_trigger) {
+    delay = 0;
+  }
+  else {
+    // If we are here, we know that:
+    //   slowdown_trigger <= num_level0_files < stop_trigger
+    // since the previous two conditions are false.
+    float how_much =
+      (float) (num_level0_files - slowdown_trigger) /
+              (stop_trigger - slowdown_trigger);
+    delay = how_much * how_much * 1000;
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+
 // REQUIRES: mutex_ is held
 // REQUIRES: this thread is currently at the front of the writer queue
 Status DBImpl::MakeRoomForWrite(bool force) {
@@ -2444,14 +2478,14 @@ Status DBImpl::MakeRoomForWrite(bool force) {
       // We are getting close to hitting a hard limit on the number of
       // L0 files. Rather than delaying a single write by several
       // seconds when we hit the hard limit, start delaying each
-      // individual write by 1ms to reduce latency variance. Also,
+      // individual write by 0-1ms to reduce latency variance. Also,
       // this delay hands over some CPU to the compaction thread in
      // case it is sharing the same core as the writer.
       mutex_.Unlock();
       uint64_t delayed;
       {
         StopWatch sw(env_, options_.statistics, STALL_L0_SLOWDOWN_COUNT);
-        env_->SleepForMicroseconds(1000);
+        env_->SleepForMicroseconds(SlowdownAmount(versions_->NumLevelFiles(0)));
         delayed = sw.ElapsedMicros();
       }
       RecordTick(options_.statistics, STALL_L0_SLOWDOWN_MICROS, delayed);
diff --git a/db/db_impl.h b/db/db_impl.h
index fa20fe2b1..dedfd9d7e 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -170,6 +170,7 @@ class DBImpl : public DB {
   Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
                                 uint64_t* filenumber);
 
+  uint64_t SlowdownAmount(int num_level0_files);
   Status MakeRoomForWrite(bool force /* compact even if there is room? */);
   WriteBatch* BuildBatchGroup(Writer** last_writer);
 
diff --git a/include/leveldb/options.h b/include/leveldb/options.h
index 6252a1eb0..c591ad515 100644
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@@ -219,9 +219,9 @@ struct Options {
   // level-0 compaction will not be triggered by number of files at all.
   int level0_file_num_compaction_trigger;
 
-  // Soft limit on number of level-0 files. We slow down writes at this point.
-  // A value <0 means that no writing slow down will be triggered by number
-  // of files in level-0.
+  // Soft limit on number of level-0 files. We start slowing down writes at this
+  // point. A value <0 means that no writing slow down will be triggered by
+  // number of files in level-0.
   int level0_slowdown_writes_trigger;
 
   // Maximum number of level-0 files. We stop writes at this point.
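Below is a minimal standalone sketch (not part of the patch) of the delay curve described in the new comment. It mirrors the SlowdownAmount() logic but takes the two triggers as parameters; the values 8 and 12 used in main() are hypothetical examples, whereas the patched code reads the real triggers from Options and passes in the live level-0 file count.

// Standalone sketch of the quadratic slowdown curve; trigger values are
// illustrative, not taken from any real Options object.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t SlowdownAmountSketch(int num_level0_files,
                                     int slowdown_trigger,
                                     int stop_trigger) {
  uint64_t delay;
  if (num_level0_files >= stop_trigger) {
    delay = 1000;                      // at or past the hard limit: full 1ms
  } else if (num_level0_files < slowdown_trigger) {
    delay = 0;                         // below the soft limit: no delay
  } else {
    float r = (float)(num_level0_files - slowdown_trigger) /
              (stop_trigger - slowdown_trigger);
    delay = r * r * 1000;              // quadratic ramp between the triggers
  }
  assert(delay <= 1000);
  return delay;
}

int main() {
  // With slowdown_trigger = 8 and stop_trigger = 12:
  //   8 files  -> r = 0.00 ->   0 us
  //   10 files -> r = 0.50 -> 250 us
  //   11 files -> r = 0.75 -> 562 us
  //   12 files ->            1000 us
  for (int n = 6; n <= 13; n++) {
    std::printf("L0 files = %2d -> delay = %llu us\n", n,
                (unsigned long long)SlowdownAmountSketch(n, 8, 12));
  }
  return 0;
}

As the worked values show, the quadratic ramp keeps delays negligible just past the soft limit and concentrates the back-pressure close to the hard limit, which is the gradual slowdown the comment describes.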