Fix a deadlock in CompactRange()

Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine the following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):

   1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
      bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
      (bg_compaction_scheduled_ is now 10) and waits for it to complete.
   2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
      (now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
      drop to LargeNumber.
   3. BG thread completes the first manual compaction, decrements
      bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
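      Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
      (now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
      thread #1 goes back to sleep (waiting on bg_cv_) without resetting
      bg_compaction_scheduled_. Thread #2 also goes back to sleep, because the
      counter is still above LargeNumber, so neither thread can make progress.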

This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).

Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.

Tests (make check) pass.

Reviewers: dhruba, igor, sdong, haobo

Reviewed By: igor

CC: leveldb

Differential Revision: https://reviews.facebook.net/D14799
This commit is contained in:
Tomislav Novak 2013-12-21 15:10:39 -08:00
parent c370f5597a
commit 9f690ec62c
2 changed files with 44 additions and 34 deletions

View File

@ -244,6 +244,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
super_version_(nullptr), super_version_(nullptr),
tmp_batch_(), tmp_batch_(),
bg_compaction_scheduled_(0), bg_compaction_scheduled_(0),
bg_manual_only_(0),
bg_flush_scheduled_(0), bg_flush_scheduled_(0),
bg_logstats_scheduled_(false), bg_logstats_scheduled_(false),
manual_compaction_(nullptr), manual_compaction_(nullptr),
@ -1600,45 +1601,44 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
MutexLock l(&mutex_); MutexLock l(&mutex_);
// When a manual compaction arrives, temporarily throttle down // When a manual compaction arrives, temporarily disable scheduling of
// the number of background compaction threads to 1. This is // non-manual compactions and wait until the number of scheduled compaction
// needed to ensure that this manual compaction can compact // jobs drops to zero. This is needed to ensure that this manual compaction
// any range of keys/files. We artificialy increase // can compact any range of keys/files.
// bg_compaction_scheduled_ by a large number, this causes //
// the system to have a single background thread. Now, // bg_manual_only_ is non-zero when at least one thread is inside
// this manual compaction can progress without stomping // TEST_CompactRange(), i.e. during that time no other compaction will
// on any other concurrent compactions. // get scheduled (see MaybeScheduleFlushOrCompaction).
const int LargeNumber = 10000000; //
const int newvalue = options_.max_background_compactions-1; // Note that the following loop doesn't stop more than one thread calling
bg_compaction_scheduled_ += LargeNumber; // TEST_CompactRange() from getting to the second while loop below.
while (bg_compaction_scheduled_ > LargeNumber) { // However, only one of them will actually schedule compaction, while
Log(options_.info_log, "Manual compaction request waiting for background threads to fall below 1"); // others will wait on a condition variable until it completes.
++bg_manual_only_;
while (bg_compaction_scheduled_ > 0) {
Log(options_.info_log,
"Manual compaction waiting for all other scheduled background "
"compactions to finish");
bg_cv_.Wait(); bg_cv_.Wait();
} }
Log(options_.info_log, "Manual compaction starting"); Log(options_.info_log, "Manual compaction starting");
while (!manual.done) { while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) {
while (manual_compaction_ != nullptr) { assert(bg_manual_only_ > 0);
bg_cv_.Wait(); if (manual_compaction_ != nullptr) {
} // Running either this or some other manual compaction
manual_compaction_ = &manual;
if (bg_compaction_scheduled_ == LargeNumber) {
bg_compaction_scheduled_ = newvalue;
}
MaybeScheduleFlushOrCompaction();
while (manual_compaction_ == &manual) {
bg_cv_.Wait(); bg_cv_.Wait();
} else {
manual_compaction_ = &manual;
MaybeScheduleFlushOrCompaction();
} }
} }
assert(!manual.in_progress);
// wait till there are no background threads scheduled assert(!manual.in_progress);
bg_compaction_scheduled_ += LargeNumber; assert(bg_manual_only_ > 0);
while (bg_compaction_scheduled_ > LargeNumber + newvalue) { --bg_manual_only_;
Log(options_.info_log, "Manual compaction resetting background threads");
bg_cv_.Wait();
}
bg_compaction_scheduled_ = 0;
} }
Status DBImpl::FlushMemTable(const FlushOptions& options) { Status DBImpl::FlushMemTable(const FlushOptions& options) {
@ -1703,11 +1703,16 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
} }
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
// flush, but the HIGH pool is not enabled). Do it only if
// max_background_compactions hasn't been reached and, in case
// bg_manual_only_ > 0, if it's a manual compaction.
if ((manual_compaction_ || if ((manual_compaction_ ||
versions_->NeedsCompaction() || versions_->NeedsCompaction() ||
(is_flush_pending && (options_.max_background_flushes <= 0))) && (is_flush_pending && (options_.max_background_flushes <= 0))) &&
bg_compaction_scheduled_ < options_.max_background_compactions) { bg_compaction_scheduled_ < options_.max_background_compactions &&
// compaction needed, or memtable flush needed but HIGH pool not enabled. (!bg_manual_only_ || manual_compaction_)) {
bg_compaction_scheduled_++; bg_compaction_scheduled_++;
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
} }

View File

@ -388,9 +388,14 @@ class DBImpl : public DB {
// part of ongoing compactions. // part of ongoing compactions.
std::set<uint64_t> pending_outputs_; std::set<uint64_t> pending_outputs_;
// count how many background compaction been scheduled or is running? // count how many background compactions are running or have been scheduled
int bg_compaction_scheduled_; int bg_compaction_scheduled_;
// If non-zero, MaybeScheduleFlushOrCompaction() will only schedule manual
// compactions (if manual_compaction_ is not null). This mechanism enables
// manual compactions to wait until all other compactions are finished.
int bg_manual_only_;
// number of background memtable flush jobs, submitted to the HIGH pool // number of background memtable flush jobs, submitted to the HIGH pool
int bg_flush_scheduled_; int bg_flush_scheduled_;