Fix a deadlock in CompactRange()
Summary: The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions can cause it to deadlock when CompactRange() is called concurrently from multiple threads. Imagine a following scenario with only two threads (max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0): 1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction (bg_compaction_scheduled_ is now 10) and waits for it to complete. 2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_ (now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to drop to LargeNumber. 3. BG thread completes the first manual compaction, decrements bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_. Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again (now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue, thread #2 also goes to sleep (waiting on bg_cv_), without resetting bg_compaction_scheduled_. This diff attempts to address the problem by introducing a new counter bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only schedule manual compactions). Test Plan: I could pretty much consistently reproduce the deadlock with a program that calls CompactRange(nullptr, nullptr) immediately after Write() from multiple threads. This no longer happens with this patch. Tests (make check) pass. Reviewers: dhruba, igor, sdong, haobo Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D14799
This commit is contained in:
parent
c370f5597a
commit
9f690ec62c
@ -244,6 +244,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
|
|||||||
super_version_(nullptr),
|
super_version_(nullptr),
|
||||||
tmp_batch_(),
|
tmp_batch_(),
|
||||||
bg_compaction_scheduled_(0),
|
bg_compaction_scheduled_(0),
|
||||||
|
bg_manual_only_(0),
|
||||||
bg_flush_scheduled_(0),
|
bg_flush_scheduled_(0),
|
||||||
bg_logstats_scheduled_(false),
|
bg_logstats_scheduled_(false),
|
||||||
manual_compaction_(nullptr),
|
manual_compaction_(nullptr),
|
||||||
@ -1600,45 +1601,44 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
|
|||||||
|
|
||||||
MutexLock l(&mutex_);
|
MutexLock l(&mutex_);
|
||||||
|
|
||||||
// When a manual compaction arrives, temporarily throttle down
|
// When a manual compaction arrives, temporarily disable scheduling of
|
||||||
// the number of background compaction threads to 1. This is
|
// non-manual compactions and wait until the number of scheduled compaction
|
||||||
// needed to ensure that this manual compaction can compact
|
// jobs drops to zero. This is needed to ensure that this manual compaction
|
||||||
// any range of keys/files. We artificialy increase
|
// can compact any range of keys/files.
|
||||||
// bg_compaction_scheduled_ by a large number, this causes
|
//
|
||||||
// the system to have a single background thread. Now,
|
// bg_manual_only_ is non-zero when at least one thread is inside
|
||||||
// this manual compaction can progress without stomping
|
// TEST_CompactRange(), i.e. during that time no other compaction will
|
||||||
// on any other concurrent compactions.
|
// get scheduled (see MaybeScheduleFlushOrCompaction).
|
||||||
const int LargeNumber = 10000000;
|
//
|
||||||
const int newvalue = options_.max_background_compactions-1;
|
// Note that the following loop doesn't stop more that one thread calling
|
||||||
bg_compaction_scheduled_ += LargeNumber;
|
// TEST_CompactRange() from getting to the second while loop below.
|
||||||
while (bg_compaction_scheduled_ > LargeNumber) {
|
// However, only one of them will actually schedule compaction, while
|
||||||
Log(options_.info_log, "Manual compaction request waiting for background threads to fall below 1");
|
// others will wait on a condition variable until it completes.
|
||||||
|
|
||||||
|
++bg_manual_only_;
|
||||||
|
while (bg_compaction_scheduled_ > 0) {
|
||||||
|
Log(options_.info_log,
|
||||||
|
"Manual compaction waiting for all other scheduled background "
|
||||||
|
"compactions to finish");
|
||||||
bg_cv_.Wait();
|
bg_cv_.Wait();
|
||||||
}
|
}
|
||||||
|
|
||||||
Log(options_.info_log, "Manual compaction starting");
|
Log(options_.info_log, "Manual compaction starting");
|
||||||
|
|
||||||
while (!manual.done) {
|
while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
|
||||||
while (manual_compaction_ != nullptr) {
|
assert(bg_manual_only_ > 0);
|
||||||
bg_cv_.Wait();
|
if (manual_compaction_ != nullptr) {
|
||||||
}
|
// Running either this or some other manual compaction
|
||||||
manual_compaction_ = &manual;
|
|
||||||
if (bg_compaction_scheduled_ == LargeNumber) {
|
|
||||||
bg_compaction_scheduled_ = newvalue;
|
|
||||||
}
|
|
||||||
MaybeScheduleFlushOrCompaction();
|
|
||||||
while (manual_compaction_ == &manual) {
|
|
||||||
bg_cv_.Wait();
|
bg_cv_.Wait();
|
||||||
|
} else {
|
||||||
|
manual_compaction_ = &manual;
|
||||||
|
MaybeScheduleFlushOrCompaction();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
assert(!manual.in_progress);
|
|
||||||
|
|
||||||
// wait till there are no background threads scheduled
|
assert(!manual.in_progress);
|
||||||
bg_compaction_scheduled_ += LargeNumber;
|
assert(bg_manual_only_ > 0);
|
||||||
while (bg_compaction_scheduled_ > LargeNumber + newvalue) {
|
--bg_manual_only_;
|
||||||
Log(options_.info_log, "Manual compaction resetting background threads");
|
|
||||||
bg_cv_.Wait();
|
|
||||||
}
|
|
||||||
bg_compaction_scheduled_ = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DBImpl::FlushMemTable(const FlushOptions& options) {
|
Status DBImpl::FlushMemTable(const FlushOptions& options) {
|
||||||
@ -1703,11 +1703,16 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
|
|||||||
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
|
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
|
||||||
|
// flush, but the HIGH pool is not enabled). Do it only if
|
||||||
|
// max_background_compactions hasn't been reached and, in case
|
||||||
|
// bg_manual_only_ > 0, if it's a manual compaction.
|
||||||
if ((manual_compaction_ ||
|
if ((manual_compaction_ ||
|
||||||
versions_->NeedsCompaction() ||
|
versions_->NeedsCompaction() ||
|
||||||
(is_flush_pending && (options_.max_background_flushes <= 0))) &&
|
(is_flush_pending && (options_.max_background_flushes <= 0))) &&
|
||||||
bg_compaction_scheduled_ < options_.max_background_compactions) {
|
bg_compaction_scheduled_ < options_.max_background_compactions &&
|
||||||
// compaction needed, or memtable flush needed but HIGH pool not enabled.
|
(!bg_manual_only_ || manual_compaction_)) {
|
||||||
|
|
||||||
bg_compaction_scheduled_++;
|
bg_compaction_scheduled_++;
|
||||||
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
|
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
|
||||||
}
|
}
|
||||||
|
@ -388,9 +388,14 @@ class DBImpl : public DB {
|
|||||||
// part of ongoing compactions.
|
// part of ongoing compactions.
|
||||||
std::set<uint64_t> pending_outputs_;
|
std::set<uint64_t> pending_outputs_;
|
||||||
|
|
||||||
// count how many background compaction been scheduled or is running?
|
// count how many background compactions are running or have been scheduled
|
||||||
int bg_compaction_scheduled_;
|
int bg_compaction_scheduled_;
|
||||||
|
|
||||||
|
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
|
||||||
|
// compactions (if manual_compaction_ is not null). This mechanism enables
|
||||||
|
// manual compactions to wait until all other compactions are finished.
|
||||||
|
int bg_manual_only_;
|
||||||
|
|
||||||
// number of background memtable flush jobs, submitted to the HIGH pool
|
// number of background memtable flush jobs, submitted to the HIGH pool
|
||||||
int bg_flush_scheduled_;
|
int bg_flush_scheduled_;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user