Add soft and hard rate limit support

Summary:
This diff adds support for both soft and hard rate limiting. The following changes are included:

1) Options.rate_limit is renamed to Options.hard_rate_limit.
2) Options.rate_limit_delay_milliseconds is renamed to Options.rate_limit_delay_max_milliseconds.
3) Options.soft_rate_limit is added.
4) If the maximum compaction score is > hard_rate_limit and rate_limit_delay_max_milliseconds == 0, then writes are delayed 1 ms at a time until the maximum compaction score falls below hard_rate_limit.
5) If the maximum compaction score is > soft_rate_limit but <= hard_rate_limit, then writes are delayed by 0-1 ms depending on how close the score is to hard_rate_limit (see the sketch after this list).
6) Users can disable (4) by setting hard_rate_limit = 0, and can cap the total time waited by setting rate_limit_delay_max_milliseconds > 0. The old behavior is preserved by leaving soft_rate_limit = 0, which is the default.
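
The stand-alone sketch below makes (4) and (5) concrete. It is not part of the diff: WriteStallMicros and SlowdownAmountMicros are hypothetical names introduced here for illustration, and the locking, statistics, and the rate_limit_delay_max_milliseconds cap from (6) are omitted (the real code is in the MakeRoomForWrite and SlowdownAmount hunks further down).

#include <cstdint>

// Stand-in for DBImpl::SlowdownAmount as documented in the db_impl.cc hunk
// below: no delay below `bottom`, a full 1 ms at or above `top`, and a
// quadratic ramp in between.
uint64_t SlowdownAmountMicros(double n, double bottom, double top) {
  if (n >= top) return 1000;
  if (n < bottom) return 0;
  double r = (n - bottom) / (top - bottom);
  return static_cast<uint64_t>(r * r * 1000);
}

// How long a single write attempt stalls, in microseconds. MakeRoomForWrite
// re-evaluates this after each stall, which is what makes the hard-limit case
// repeat "1 ms at a time" until the score drops below hard_rate_limit.
uint64_t WriteStallMicros(double max_compaction_score,
                          double soft_rate_limit, double hard_rate_limit) {
  if (hard_rate_limit > 1.0 && max_compaction_score > hard_rate_limit) {
    return 1000;                                       // case (4)
  }
  if (soft_rate_limit > 0.0 && max_compaction_score > soft_rate_limit) {
    return SlowdownAmountMicros(max_compaction_score,  // case (5)
                                soft_rate_limit, hard_rate_limit);
  }
  return 0;
}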

Test Plan:
make -j32 check
./db_stress

Reviewers: dhruba, haobo, MarkCallaghan

Reviewed By: dhruba

CC: leveldb

Differential Revision: https://reviews.facebook.net/D12003
Jim Paton 2013-08-05 15:43:49 -07:00
parent cacd812fb2
commit 1036537c94
6 changed files with 86 additions and 45 deletions


@@ -264,13 +264,16 @@ static int FLAGS_stats_interval = 0;
// than 0.
static int FLAGS_stats_per_interval = 0;
static double FLAGS_soft_rate_limit = 0;
// When not equal to 0 this make threads sleep at each stats
// reporting interval until the compaction score for all levels is
// less than or equal to this value.
static double FLAGS_rate_limit = 0;
static double FLAGS_hard_rate_limit = 0;
// When FLAGS_rate_limit is set then this is the max time a put will be stalled.
static int FLAGS_rate_limit_delay_milliseconds = 1000;
// When FLAGS_hard_rate_limit is set then this is the max time a put will be
// stalled.
static int FLAGS_rate_limit_delay_max_milliseconds = 1000;
// Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
// stop building a single file in a level->level+1 compaction.
@@ -1146,8 +1149,10 @@ unique_ptr<char []> GenerateKeyFromInt(int v, const char* suffix = "")
options.disable_seek_compaction = FLAGS_disable_seek_compaction;
options.delete_obsolete_files_period_micros =
FLAGS_delete_obsolete_files_period_micros;
options.rate_limit = FLAGS_rate_limit;
options.rate_limit_delay_milliseconds = FLAGS_rate_limit_delay_milliseconds;
options.soft_rate_limit = FLAGS_soft_rate_limit;
options.hard_rate_limit = FLAGS_hard_rate_limit;
options.rate_limit_delay_max_milliseconds =
FLAGS_rate_limit_delay_max_milliseconds;
options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
options.max_grandparent_overlap_factor =
FLAGS_max_grandparent_overlap_factor;
@@ -2039,7 +2044,8 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--min_write_buffer_number_to_merge=%d%c",
&n, &junk) == 1) {
FLAGS_min_write_buffer_number_to_merge = n;
} else if (sscanf(argv[i], "--max_background_compactions=%d%c", &n, &junk) == 1) {
} else if (sscanf(argv[i], "--max_background_compactions=%d%c", &n, &junk)
== 1) {
FLAGS_max_background_compactions = n;
} else if (sscanf(argv[i], "--cache_size=%ld%c", &l, &junk) == 1) {
FLAGS_cache_size = l;
@@ -2173,13 +2179,16 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--stats_per_interval=%d%c", &n, &junk) == 1
&& (n == 0 || n == 1)) {
FLAGS_stats_per_interval = n;
} else if (sscanf(argv[i], "--rate_limit=%lf%c", &d, &junk) == 1 &&
} else if (sscanf(argv[i], "--soft_rate_limit=%lf%c", &d, &junk) == 1 &&
d > 0.0) {
FLAGS_soft_rate_limit = d;
} else if (sscanf(argv[i], "--hard_rate_limit=%lf%c", &d, &junk) == 1 &&
d > 1.0) {
FLAGS_rate_limit = d;
FLAGS_hard_rate_limit = d;
} else if (sscanf(argv[i],
"--rate_limit_delay_milliseconds=%d%c", &n, &junk) == 1
&& n > 0) {
FLAGS_rate_limit_delay_milliseconds = n;
"--rate_limit_delay_max_milliseconds=%d%c", &n, &junk) == 1
&& n >= 0) {
FLAGS_rate_limit_delay_max_milliseconds = n;
} else if (sscanf(argv[i], "--readonly=%d%c", &n, &junk) == 1 &&
(n == 0 || n ==1 )) {
FLAGS_read_only = n;
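
Based on the flag parsing in the hunk above, a db_bench run exercising the new limits might look like the line below; the benchmark name and numeric values are illustrative, and --rate_limit_delay_max_milliseconds=0 relies on the relaxed n >= 0 check to request an uncapped stall.

./db_bench --benchmarks=overwrite --soft_rate_limit=1.5 --hard_rate_limit=2.0 --rate_limit_delay_max_milliseconds=0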


@@ -154,6 +154,9 @@ Options SanitizeOptions(const std::string& dbname,
if (result.max_mem_compaction_level >= result.num_levels) {
result.max_mem_compaction_level = result.num_levels - 1;
}
if (result.soft_rate_limit > result.hard_rate_limit) {
result.soft_rate_limit = result.hard_rate_limit;
}
return result;
}
@@ -2417,31 +2420,29 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
// This function computes the amount of time in microseconds by which a write
// should be delayed based on the number of level-0 files according to the
// following formula:
// if num_level_files < level0_slowdown_writes_trigger, return 0;
// if num_level_files >= level0_stop_writes_trigger, return 1000;
// otherwise, let r = (num_level_files - level0_slowdown) /
// (level0_stop - level0_slowdown)
// if n < bottom, return 0;
// if n >= top, return 1000;
// otherwise, let r = (n - bottom) /
// (top - bottom)
// and return r^2 * 1000.
// The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic.
uint64_t DBImpl::SlowdownAmount(int num_level0_files) {
uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) {
uint64_t delay;
int stop_trigger = options_.level0_stop_writes_trigger;
int slowdown_trigger = options_.level0_slowdown_writes_trigger;
if (num_level0_files >= stop_trigger) {
if (n >= top) {
delay = 1000;
}
else if (num_level0_files < slowdown_trigger) {
else if (n < bottom) {
delay = 0;
}
else {
// If we are here, we know that:
// slowdown_trigger <= num_level0_files < stop_trigger
// level0_start_slowdown <= n < level0_slowdown
// since the previous two conditions are false.
float how_much =
(float) (num_level0_files - slowdown_trigger) /
(stop_trigger - slowdown_trigger);
(float) (n - bottom) /
(top - bottom);
delay = how_much * how_much * 1000;
}
assert(delay <= 1000);
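
(Not part of the diff.) As a worked instance of the formula in the comment above: with bottom = 8 and top = 12 (illustrative values, e.g. the level-0 slowdown and stop triggers) and n = 10, r = (10 - 8) / (12 - 8) = 0.5, so the write is delayed 0.5^2 * 1000 = 250 microseconds.
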
@@ -2454,7 +2455,8 @@ Status DBImpl::MakeRoomForWrite(bool force) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
bool allow_rate_limit_delay = !force;
bool allow_hard_rate_limit_delay = !force;
bool allow_soft_rate_limit_delay = !force;
uint64_t rate_limit_delay_millis = 0;
Status s;
double score;
@@ -2478,7 +2480,11 @@ Status DBImpl::MakeRoomForWrite(bool force) {
uint64_t delayed;
{
StopWatch sw(env_, options_.statistics, STALL_L0_SLOWDOWN_COUNT);
env_->SleepForMicroseconds(SlowdownAmount(versions_->NumLevelFiles(0)));
env_->SleepForMicroseconds(
SlowdownAmount(versions_->NumLevelFiles(0),
options_.level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger)
);
delayed = sw.ElapsedMicros();
}
RecordTick(options_.statistics, STALL_L0_SLOWDOWN_MICROS, delayed);
@@ -2527,9 +2533,9 @@ Status DBImpl::MakeRoomForWrite(bool force) {
stall_level0_num_files_ += stall;
stall_level0_num_files_count_++;
} else if (
allow_rate_limit_delay &&
options_.rate_limit > 1.0 &&
(score = versions_->MaxCompactionScore()) > options_.rate_limit) {
allow_hard_rate_limit_delay &&
options_.hard_rate_limit > 1.0 &&
(score = versions_->MaxCompactionScore()) > options_.hard_rate_limit) {
// Delay a write when the compaction score for any level is too large.
int max_level = versions_->MaxCompactionScoreLevel();
mutex_.Unlock();
@@ -2545,14 +2551,29 @@ Status DBImpl::MakeRoomForWrite(bool force) {
uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1);
rate_limit_delay_millis += rate_limit;
RecordTick(options_.statistics, RATE_LIMIT_DELAY_MILLIS, rate_limit);
if (rate_limit_delay_millis >=
(unsigned)options_.rate_limit_delay_milliseconds) {
allow_rate_limit_delay = false;
if (options_.rate_limit_delay_max_milliseconds > 0 &&
rate_limit_delay_millis >=
(unsigned)options_.rate_limit_delay_max_milliseconds) {
allow_hard_rate_limit_delay = false;
}
// Log(options_.info_log,
// "delaying write %llu usecs for rate limits with max score %.2f\n",
// (long long unsigned int)delayed, score);
mutex_.Lock();
} else if (
allow_soft_rate_limit_delay &&
options_.soft_rate_limit > 0.0 &&
(score = versions_->MaxCompactionScore()) > options_.soft_rate_limit) {
// Delay a write when the compaction score for any level is too large.
// TODO: add statistics
mutex_.Unlock();
env_->SleepForMicroseconds(SlowdownAmount(
score,
options_.soft_rate_limit,
options_.hard_rate_limit)
);
allow_soft_rate_limit_delay = false;
mutex_.Lock();
} else {
// Attempt to switch to a new memtable and trigger compaction of old
DelayLoggingAndReset();


@@ -166,7 +166,7 @@ class DBImpl : public DB {
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
uint64_t* filenumber);
uint64_t SlowdownAmount(int num_level0_files);
uint64_t SlowdownAmount(int n, int top, int bottom);
Status MakeRoomForWrite(bool force /* compact even if there is room? */);
WriteBatch* BuildBatchGroup(Writer** last_writer);


@@ -286,8 +286,8 @@ class DBTest {
options.purge_redundant_kvs_while_flush = !options.purge_redundant_kvs_while_flush;
break;
case kPerfOptions:
options.rate_limit = 2.0;
options.rate_limit_delay_milliseconds = 2;
options.hard_rate_limit = 2.0;
options.rate_limit_delay_max_milliseconds = 2;
// TODO -- test more options
break;
case kDeletesFilterFirst:


@@ -355,12 +355,22 @@ struct Options {
// Default: 1000
size_t keep_log_file_num;
// Puts are delayed when any level has a compaction score that
// exceeds rate_limit. This is ignored when <= 1.0.
double rate_limit;
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds
// soft_rate_limit. This is ignored when == 0.0.
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
// hold, RocksDB will set soft_rate_limit = hard_rate_limit
// Default: 0 (disabled)
double soft_rate_limit;
// Max time a put will be stalled when rate_limit is enforced
unsigned int rate_limit_delay_milliseconds;
// Puts are delayed 1ms at a time when any level has a compaction score that
// exceeds hard_rate_limit. This is ignored when <= 1.0.
// Default: 0 (disabled)
double hard_rate_limit;
// Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
// there is no limit.
// Default: 1000
unsigned int rate_limit_delay_max_milliseconds;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
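
To illustrate the new Options fields documented above, here is a minimal sketch of opening a database with both limits set. It is not part of the diff; the header path and rocksdb namespace follow later releases and may differ from the tree at the time of this commit, and the numeric values are only illustrative.

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Start 0-1 ms stalls once any level's compaction score exceeds 1.5 ...
  options.soft_rate_limit = 1.5;
  // ... and 1-ms-at-a-time stalls once it exceeds 2.0.
  options.hard_rate_limit = 2.0;
  // Cap the total hard-limit stall per write at 100 ms; 0 would mean no cap.
  options.rate_limit_delay_max_milliseconds = 100;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rate_limit_demo", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return s.ok() ? 0 : 1;
}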


@@ -56,8 +56,9 @@ Options::Options()
max_log_file_size(0),
log_file_time_to_roll(0),
keep_log_file_num(1000),
rate_limit(0.0),
rate_limit_delay_milliseconds(1000),
soft_rate_limit(0.0),
hard_rate_limit(0.0),
rate_limit_delay_max_milliseconds(1000),
max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
no_block_cache(false),
table_cache_numshardbits(4),
@@ -181,10 +182,10 @@ Options::Dump(Logger* log) const
delete_obsolete_files_period_micros);
Log(log," Options.max_background_compactions: %d",
max_background_compactions);
Log(log," Options.rate_limit: %.2f",
rate_limit);
Log(log," Options.rate_limit_delay_milliseconds: %d",
rate_limit_delay_milliseconds);
Log(log," Options.hard_rate_limit: %.2f",
hard_rate_limit);
Log(log," Options.rate_limit_delay_max_milliseconds: %d",
rate_limit_delay_max_milliseconds);
Log(log," Options.disable_auto_compactions: %d",
disable_auto_compactions);
Log(log," Options.WAL_ttl_seconds: %ld",