Fix IngestExternalFile's bug with two_write_queue (#5976)
Summary: When two_write_queue enable, IngestExternalFile performs EnterUnbatched on both write queues. SwitchMemtable also EnterUnbatched on 2nd write queue when this option is enabled. When the call stack includes IngestExternalFile -> FlushMemTable -> SwitchMemtable, this results into a deadlock. The implemented solution is to pass on the existing writes_stopped argument in FlushMemTable to skip EnterUnbatched in SwitchMemtable. Fixes https://github.com/facebook/rocksdb/issues/5974 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5976 Differential Revision: D18535943 Pulled By: maysamyabandeh fbshipit-source-id: a4f9d4964c10d4a7ca06b1e0102ca2ec395512bc
This commit is contained in:
parent
0058daef7b
commit
f65ec09ef8
@ -1532,8 +1532,12 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
||||
InstrumentedMutexLock guard_lock(&mutex_);
|
||||
|
||||
WriteThread::Writer w;
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (!writes_stopped) {
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
}
|
||||
|
||||
if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
|
||||
@ -1596,6 +1600,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
||||
|
||||
if (!writes_stopped) {
|
||||
write_thread_.ExitUnbatched(&w);
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
}
|
||||
}
|
||||
TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
|
||||
@ -1650,8 +1657,12 @@ Status DBImpl::AtomicFlushMemTables(
|
||||
InstrumentedMutexLock guard_lock(&mutex_);
|
||||
|
||||
WriteThread::Writer w;
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (!writes_stopped) {
|
||||
write_thread_.EnterUnbatched(&w, &mutex_);
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto cfd : column_family_datas) {
|
||||
@ -1695,6 +1706,9 @@ Status DBImpl::AtomicFlushMemTables(
|
||||
|
||||
if (!writes_stopped) {
|
||||
write_thread_.ExitUnbatched(&w);
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
}
|
||||
}
|
||||
TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
|
||||
|
@ -105,7 +105,16 @@ Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
|
||||
if (cfd == nullptr) {
|
||||
cfd = default_cf_handle_->cfd();
|
||||
}
|
||||
return SwitchMemtable(cfd, &write_context);
|
||||
|
||||
if (two_write_queues_) {
|
||||
WriteThread::Writer nonmem_w;
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
Status s = SwitchMemtable(cfd, &write_context);
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
return s;
|
||||
} else {
|
||||
return SwitchMemtable(cfd, &write_context);
|
||||
}
|
||||
}
|
||||
|
||||
Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
|
||||
|
@ -1246,6 +1246,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
||||
}
|
||||
MaybeFlushStatsCF(&cfds);
|
||||
}
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
|
||||
for (const auto cfd : cfds) {
|
||||
cfd->Ref();
|
||||
status = SwitchMemtable(cfd, write_context);
|
||||
@ -1254,6 +1259,10 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
if (immutable_db_options_.atomic_flush) {
|
||||
AssignAtomicFlushSeq(cfds);
|
||||
@ -1314,6 +1323,10 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
|
||||
MaybeFlushStatsCF(&cfds);
|
||||
}
|
||||
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
for (const auto cfd : cfds) {
|
||||
if (cfd->mem()->IsEmpty()) {
|
||||
continue;
|
||||
@ -1325,6 +1338,10 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
if (immutable_db_options_.atomic_flush) {
|
||||
AssignAtomicFlushSeq(cfds);
|
||||
@ -1530,6 +1547,11 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
|
||||
MaybeFlushStatsCF(&cfds);
|
||||
}
|
||||
Status status;
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
|
||||
for (auto& cfd : cfds) {
|
||||
if (!cfd->mem()->IsEmpty()) {
|
||||
status = SwitchMemtable(cfd, context);
|
||||
@ -1542,6 +1564,11 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
if (immutable_db_options_.atomic_flush) {
|
||||
AssignAtomicFlushSeq(cfds);
|
||||
@ -1572,15 +1599,11 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
|
||||
|
||||
// REQUIRES: mutex_ is held
|
||||
// REQUIRES: this thread is currently at the front of the writer queue
|
||||
// REQUIRES: this thread is currently at the front of the 2nd writer queue if
|
||||
// two_write_queues_ is true (This is to simplify the reasoning.)
|
||||
Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
||||
mutex_.AssertHeld();
|
||||
WriteThread::Writer nonmem_w;
|
||||
if (two_write_queues_) {
|
||||
// SwitchMemtable is a rare event. To simply the reasoning, we make sure
|
||||
// that there is no concurrent thread writing to WAL.
|
||||
nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
|
||||
}
|
||||
|
||||
std::unique_ptr<WritableFile> lfile;
|
||||
log::Writer* new_log = nullptr;
|
||||
MemTable* new_mem = nullptr;
|
||||
@ -1687,10 +1710,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
||||
error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
|
||||
// Read back bg_error in order to get the right severity
|
||||
s = error_handler_.GetBGError();
|
||||
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -1721,9 +1740,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
||||
NotifyOnMemTableSealed(cfd, memtable_info);
|
||||
mutex_.Lock();
|
||||
#endif // ROCKSDB_LITE
|
||||
if (two_write_queues_) {
|
||||
nonmem_write_thread_.ExitUnbatched(&nonmem_w);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -2750,6 +2750,26 @@ TEST_P(ExternalSSTFileTest,
|
||||
Destroy(options, true /* delete_cf_paths */);
|
||||
}
|
||||
|
||||
TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
|
||||
Options options = CurrentOptions();
|
||||
// Use large buffer to avoid memtable flush
|
||||
options.write_buffer_size = 1024 * 1024;
|
||||
options.two_write_queues = true;
|
||||
DestroyAndReopen(options);
|
||||
|
||||
ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
|
||||
ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
|
||||
ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
|
||||
|
||||
// Put one key which is overlap with keys in memtable.
|
||||
// It will trigger flushing memtable and require this thread is
|
||||
// currently at the front of the 2nd writer queue. We must make
|
||||
// sure that it won't enter the 2nd writer queue for the second time.
|
||||
std::vector<std::pair<std::string, std::string>> data;
|
||||
data.push_back(std::make_pair("1001", "v2"));
|
||||
GenerateAndAddExternalFile(options, data);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
|
||||
testing::Values(std::make_tuple(false, false),
|
||||
std::make_tuple(false, true),
|
||||
|
Loading…
Reference in New Issue
Block a user