Map retryable IO error during Flush without WAL to soft error and no switch memtable during resume (#7310)

Summary:
In the current implementation, any retryable IO error that happens during Flush is mapped to a hard error. In this case, the DB is stopped and writes are stalled until the background error is cleaned. In this PR, if the WAL is disabled, a retryable IO error during Flush is mapped to a soft error, so that the memtable can continue to receive writes. At the same time, if auto resume is triggered, SwitchMemtable will not be called during the Flush that resumes the DB, to avoid generating too many small memtables. Test cases are added.
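
To illustrate the new behavior, here is a minimal sketch (not code from this PR; the database path and keys are made up, and error handling is trimmed) of an application that writes with the WAL disabled, hits a soft flush error, and resumes:

#include <cassert>
#include <iostream>
#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_no_wal_demo", &db);
  assert(s.ok());

  rocksdb::WriteOptions wo;
  wo.disableWAL = true;  // WAL disabled: flush errors may now be soft errors.
  db->Put(wo, "k1", "v1");

  // If a retryable IO error happens inside this flush, the background error
  // is now kSoftError instead of kHardError.
  s = db->Flush(rocksdb::FlushOptions());
  if (!s.ok() && s.severity() == rocksdb::Status::Severity::kSoftError) {
    // Writes are still accepted while the soft error is pending.
    db->Put(wo, "k2", "v2");
    // Manual resume (or auto resume, if enabled) retries the flush with
    // FlushReason::kErrorRecoveryRetryFlush, i.e. without SwitchMemtable.
    s = db->Resume();
  }
  std::cout << "final status: " << s.ToString() << std::endl;
  delete db;
  return 0;
}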

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7310

Test Plan: added new unit tests; make check passes.

Reviewed By: anand1976

Differential Revision: D23710892

Pulled By: zhichao-cao

fbshipit-source-id: bc4ca50d11c6b23b60d2c0cb171d86d542b038e9
Zhichao Cao 2020-09-17 20:22:35 -07:00 committed by Facebook GitHub Bot
parent 3ac07a12fe
commit c268628c25
9 changed files with 370 additions and 28 deletions


@@ -44,6 +44,8 @@
### Behavior Changes
* File abstraction `FSRandomAccessFile.Prefetch()` default return status is changed from `OK` to `NotSupported`. If the user inherited file doesn't implement prefetch, RocksDB will create internal prefetch buffer to improve read performance.
+### Behavior Changes
+* When a retryable IO error happens during Flush (manifest write errors are excluded) and the WAL is disabled, it was originally mapped to kHardError. Now, it is mapped to a soft error, so the DB will not stall writes unless the memtable is full. At the same time, when auto resume is triggered to recover from the retryable IO error during Flush, SwitchMemtable is not called, to avoid generating too many small immutable memtables. If the WAL is enabled, there is no behavior change.
### Others
* Error in prefetching partitioned index blocks will not be swallowed. It will fail the query and return the IOError users.


@@ -2427,7 +2427,7 @@ TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
// Make sure the task is sleeping. Otherwise, it might start to execute
// after sleeping_task.WaitUntilDone() and cause TSAN warning.
sleeping_task.WaitUntilSleeping();
// 1MB should create ~10 files for each CF
int kKeysNum = 10000;
PutRandomData(1, kKeysNum, 100);


@@ -302,7 +302,7 @@ Status DBImpl::Resume() {
// 4. Schedule compactions if needed for all the CFs. This is needed as the
// flush in the prior step might have been a no-op for some CFs, which
// means a new super version wouldn't have been installed
-Status DBImpl::ResumeImpl() {
+Status DBImpl::ResumeImpl(DBRecoverContext context) {
mutex_.AssertHeld();
WaitForBackgroundWork();
@@ -364,7 +364,7 @@ Status DBImpl::ResumeImpl() {
autovector<ColumnFamilyData*> cfds;
SelectColumnFamiliesForAtomicFlush(&cfds);
mutex_.Unlock();
-s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery);
+s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
@@ -373,7 +373,7 @@ Status DBImpl::ResumeImpl() {
}
cfd->Ref();
mutex_.Unlock();
-s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery);
+s = FlushMemTable(cfd, flush_opts, context.flush_reason);
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!s.ok()) {


@@ -1379,7 +1379,7 @@ class DBImpl : public DB {
// Required: DB mutex held
Status PersistentStatsProcessFormatVersion();
-Status ResumeImpl();
+Status ResumeImpl(DBRecoverContext context);
void MaybeIgnoreError(Status* s) const;


@@ -125,7 +125,12 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
// "number < current_log_number".
MarkLogsSynced(current_log_number - 1, true, io_s);
if (!io_s.ok()) {
-error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
+if (total_log_size_ > 0) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
+} else {
+// If the WAL is empty, we use a different error reason
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL);
+}
TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
return io_s;
}
@@ -179,6 +184,10 @@ Status DBImpl::FlushMemTableToOutputFile(
// SyncClosedLogs() may unlock and re-lock the db_mutex.
io_s = SyncClosedLogs(job_context);
s = io_s;
+if (!io_s.ok() && !io_s.IsShutdownInProgress() &&
+!io_s.IsColumnFamilyDropped()) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
+}
} else {
TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
}
@@ -236,10 +245,14 @@
// CURRENT file. With current code, it's just difficult to tell. So just
// be pessimistic and try write to a new MANIFEST.
// TODO: distinguish between MANIFEST write and CURRENT renaming
-auto err_reason = versions_->io_status().ok()
-? BackgroundErrorReason::kFlush
-: BackgroundErrorReason::kManifestWrite;
-error_handler_.SetBGError(io_s, err_reason);
+if (!versions_->io_status().ok()) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+} else if (total_log_size_ > 0) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
+} else {
+// If the WAL is empty, we use a different error reason
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL);
+}
} else {
Status new_bg_error = s;
error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
@@ -632,10 +645,14 @@
// CURRENT file. With current code, it's just difficult to tell. So just
// be pessimistic and try write to a new MANIFEST.
// TODO: distinguish between MANIFEST write and CURRENT renaming
-auto err_reason = versions_->io_status().ok()
-? BackgroundErrorReason::kFlush
-: BackgroundErrorReason::kManifestWrite;
-error_handler_.SetBGError(io_s, err_reason);
+if (!versions_->io_status().ok()) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+} else if (total_log_size_ > 0) {
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush);
+} else {
+// If the WAL is empty, we use a different error reason
+error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL);
+}
} else {
Status new_bg_error = s;
error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
@@ -1637,6 +1654,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
return s;
}
}
FlushRequest flush_req;
{
WriteContext context;
@@ -1653,7 +1671,11 @@
WaitForPendingWrites();
if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) {
-s = SwitchMemtable(cfd, &context);
+if (flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
+s = SwitchMemtable(cfd, &context);
+} else {
+assert(cfd->imm()->NumNotFlushed() > 0);
+}
}
if (s.ok()) {
if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
@@ -1661,7 +1683,8 @@
flush_memtable_id = cfd->imm()->GetLatestMemTableID();
flush_req.emplace_back(cfd, flush_memtable_id);
}
-if (immutable_db_options_.persist_stats_to_disk) {
+if (immutable_db_options_.persist_stats_to_disk &&
+flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
ColumnFamilyData* cfd_stats =
versions_->GetColumnFamilySet()->GetColumnFamily(
kPersistentStatsColumnFamilyName);
@@ -1690,7 +1713,6 @@
}
}
}
if (s.ok() && !flush_req.empty()) {
for (auto& elem : flush_req) {
ColumnFamilyData* loop_cfd = elem.first;
@@ -1726,8 +1748,10 @@
cfds.push_back(iter.first);
flush_memtable_ids.push_back(&(iter.second));
}
-s = WaitForFlushMemTables(cfds, flush_memtable_ids,
-(flush_reason == FlushReason::kErrorRecovery));
+s = WaitForFlushMemTables(
+cfds, flush_memtable_ids,
+(flush_reason == FlushReason::kErrorRecovery ||
+flush_reason == FlushReason::kErrorRecoveryRetryFlush));
InstrumentedMutexLock lock_guard(&mutex_);
for (auto* tmp_cfd : cfds) {
tmp_cfd->UnrefAndTryDelete();
@@ -1785,7 +1809,8 @@ Status DBImpl::AtomicFlushMemTables(
}
}
for (auto cfd : cfds) {
-if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
+flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
continue;
}
cfd->Ref();
@@ -1828,8 +1853,10 @@
for (auto& iter : flush_req) {
flush_memtable_ids.push_back(&(iter.second));
}
-s = WaitForFlushMemTables(cfds, flush_memtable_ids,
-(flush_reason == FlushReason::kErrorRecovery));
+s = WaitForFlushMemTables(
+cfds, flush_memtable_ids,
+(flush_reason == FlushReason::kErrorRecovery ||
+flush_reason == FlushReason::kErrorRecoveryRetryFlush));
InstrumentedMutexLock lock_guard(&mutex_);
for (auto* cfd : cfds) {
cfd->UnrefAndTryDelete();
@@ -1939,6 +1966,13 @@ Status DBImpl::WaitForFlushMemTables(
if (!error_handler_.GetRecoveryError().ok()) {
break;
}
+// If BG work is stopped, there is a BG error and either 1) it is a soft
+// error that allows no BG work, or 2) auto recovery is not enabled.
+if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+error_handler_.GetBGError().severity() < Status::Severity::kHardError) {
+return error_handler_.GetBGError();
+}
// Number of column families that have been dropped.
int num_dropped = 0;
// Number of column families that have finished flush.


@@ -90,6 +90,28 @@ std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
// Errors during BG flush with WAL disabled
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kNoSpace,
false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kSpaceLimit,
true),
Status::Severity::kHardError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, Status::SubCode::kIOFenced,
false),
Status::Severity::kFatalError},
};
std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
@@ -140,6 +162,19 @@ std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
{std::make_tuple(BackgroundErrorReason::kManifestWrite,
Status::Code::kIOError, false),
Status::Severity::kFatalError},
// Errors during BG flush with WAL disabled
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kCorruption, true),
Status::Severity::kUnrecoverableError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kCorruption, false),
Status::Severity::kNoError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, true),
Status::Severity::kFatalError},
{std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
Status::Code::kIOError, false),
Status::Severity::kNoError},
};
std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
@@ -218,6 +253,7 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reas
bool paranoid = db_options_.paranoid_checks;
Status::Severity sev = Status::Severity::kFatalError;
Status new_bg_err;
+DBRecoverContext context;
bool found = false;
{
@@ -276,6 +312,7 @@
}
}
+recover_context_ = context;
if (auto_recovery) {
recovery_in_prog_ = true;
@@ -303,8 +340,10 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// Always returns ok
db_->DisableFileDeletionsWithLock();
}
Status new_bg_io_err = bg_io_err;
Status s;
+DBRecoverContext context;
if (bg_io_err.GetDataLoss()) {
// FIrst, data loss is treated as unrecoverable error. So it can directly
// overwrite any existing bg_error_.
@@ -316,6 +355,7 @@
}
EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
db_mutex_, &auto_recovery);
+recover_context_ = context;
return bg_error_;
} else if (bg_io_err.GetRetryable()) {
// Second, check if the error is a retryable IO error or not. if it is
@@ -332,7 +372,27 @@
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
+recover_context_ = context;
return bg_error_;
} else if (BackgroundErrorReason::kFlushNoWAL == reason) {
// When the BG retryable IO error reason is flush without WAL, we map it
// to a soft error. At the same time, all the background work should be
// stopped except the BG work from recovery. Therefore, we set
// soft_error_no_bg_work_ to true. In addition, since the DB continues to
// receive writes while the BG error is a soft error, to avoid too many
// small memtables being generated during auto resume, the flush reason is
// set to kErrorRecoveryRetryFlush.
Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
soft_error_no_bg_work_ = true;
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
} else {
Status bg_err(new_bg_io_err, Status::Severity::kHardError);
if (recovery_in_prog_ && recovery_error_.ok()) {
@@ -341,6 +401,7 @@
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
+recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
}
} else {
@@ -407,6 +468,7 @@ Status ErrorHandler::ClearBGError() {
Status old_bg_error = bg_error_;
bg_error_ = Status::OK();
recovery_in_prog_ = false;
+soft_error_no_bg_work_ = false;
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
old_bg_error, db_mutex_);
}
@@ -419,6 +481,7 @@ Status ErrorHandler::ClearBGError() {
Status ErrorHandler::RecoverFromBGError(bool is_manual) {
#ifndef ROCKSDB_LITE
InstrumentedMutexLock l(db_mutex_);
+bool no_bg_work_original_flag = soft_error_no_bg_work_;
if (is_manual) {
// If its a manual recovery and there's a background recovery in progress
// return busy status
@@ -426,9 +489,24 @@
return Status::Busy();
}
recovery_in_prog_ = true;
// In manual resume, we allow the bg work to run. If it is an auto resume,
// the bg work should follow this flag.
soft_error_no_bg_work_ = false;
// In manual resume, if the bg error is a soft error that also requires
// no bg work, the error must be recovered by calling flush with the
// flush reason kErrorRecoveryRetryFlush. In other cases, the flush
// reason is set to kErrorRecovery.
if (no_bg_work_original_flag) {
recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
} else {
recover_context_.flush_reason = FlushReason::kErrorRecovery;
}
}
-if (bg_error_.severity() == Status::Severity::kSoftError) {
+if (bg_error_.severity() == Status::Severity::kSoftError &&
+recover_context_.flush_reason == FlushReason::kErrorRecovery) {
// Simply clear the background error and return
recovery_error_ = Status::OK();
return ClearBGError();
@@ -438,7 +516,13 @@
// during the recovery process. While recovering, the only operations that
// can generate background errors should be the flush operations
recovery_error_ = Status::OK();
-Status s = db_->ResumeImpl();
+Status s = db_->ResumeImpl(recover_context_);
if (s.ok()) {
soft_error_no_bg_work_ = false;
} else {
soft_error_no_bg_work_ = no_bg_work_original_flag;
}
// For manual recover, shutdown, and fatal error cases, set
// recovery_in_prog_ to false. For automatic background recovery, leave it
// as is regardless of success or failure as it will be retried
@@ -491,6 +575,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
if (end_recovery_) {
return;
}
+DBRecoverContext context = recover_context_;
int resume_count = db_options_.max_bgerror_resume_count;
uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
// Recover from the retryable error. Create a separate thread to do it.
@@ -502,7 +587,7 @@
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
recovery_io_error_ = IOStatus::OK();
recovery_error_ = Status::OK();
-Status s = db_->ResumeImpl();
+Status s = db_->ResumeImpl(context);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
if (s.IsShutdownInProgress() ||
@@ -537,6 +622,9 @@
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
old_bg_error, db_mutex_);
recovery_in_prog_ = false;
if (soft_error_no_bg_work_) {
soft_error_no_bg_work_ = false;
}
return;
} else {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail1");
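
Taken together, the mapping tables and SetBGError() changes above amount to the following classification of a retryable IO error reported from flush. This is an illustrative paraphrase only; the struct and function below are invented for this summary and are not part of the patch:

#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::Status;

struct RetryableFlushErrorHandling {
  Status::Severity severity;
  bool no_bg_work;          // mirrors soft_error_no_bg_work_
  bool retry_flush_reason;  // recovery flush uses kErrorRecoveryRetryFlush
};

// wal_is_empty mirrors the total_log_size_ == 0 check that makes DBImpl
// report BackgroundErrorReason::kFlushNoWAL instead of kFlush.
RetryableFlushErrorHandling Classify(bool wal_is_empty) {
  if (wal_is_empty) {
    // New behavior: writes keep flowing into the memtable, other background
    // work is paused, and the recovery flush does not switch the memtable,
    // which avoids piling up tiny immutable memtables.
    return {Status::Severity::kSoftError, true, true};
  }
  // WAL enabled: unchanged behavior, a hard error that stalls writes until
  // the background error is cleared.
  return {Status::Severity::kHardError, false, false};
}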


@@ -14,6 +14,17 @@ namespace ROCKSDB_NAMESPACE {
class DBImpl;
// This structure is used to store the DB recovery context. The context is
// the information related to the recovery actions. For example, it contains
// FlushReason, which tells the flush job why this flush is called.
struct DBRecoverContext {
FlushReason flush_reason;
DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
};
class ErrorHandler {
public:
ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
@@ -28,7 +39,8 @@ class ErrorHandler {
recovery_thread_(nullptr),
db_mutex_(db_mutex),
auto_recovery_(false),
-recovery_in_prog_(false) {}
+recovery_in_prog_(false),
+soft_error_no_bg_work_(false) {}
~ErrorHandler() {
bg_error_.PermitUncheckedError();
recovery_error_.PermitUncheckedError();
@@ -59,9 +71,11 @@ class ErrorHandler {
bool IsBGWorkStopped() {
return !bg_error_.ok() &&
(bg_error_.severity() >= Status::Severity::kHardError ||
-!auto_recovery_);
+!auto_recovery_ || soft_error_no_bg_work_);
}
+bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
bool IsRecoveryInProgress() { return recovery_in_prog_; }
Status RecoverFromBGError(bool is_manual = false);
@@ -89,6 +103,12 @@ class ErrorHandler {
// A flag indicating whether automatic recovery from errors is enabled
bool auto_recovery_;
bool recovery_in_prog_;
+// A flag to indicate that for the soft error, we should not allow any
+// background work except the work from recovery.
+bool soft_error_no_bg_work_;
+// Used to store the context for recovery, such as the flush reason.
+DBRecoverContext recover_context_;
Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
void RecoverFromNoSpace();


@@ -183,7 +183,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
Destroy(options);
}
-TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeError) {
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
@@ -247,6 +247,92 @@
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
options.env = fault_fs_env.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 0;
Status s;
listener->EnableAutoRecovery(false);
DestroyAndReopen(options);
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
WriteOptions wo = WriteOptions();
wo.disableWAL = true;
Put(Key(1), "val1", wo);
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
Put(Key(2), "val2", wo);
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
ASSERT_EQ("val2", Get(Key(2)));
SyncPoint::GetInstance()->DisableProcessing();
fault_fs->SetFilesystemActive(true);
s = dbfull()->Resume();
ASSERT_EQ(s, Status::OK());
ASSERT_EQ("val1", Get(Key(1)));
ASSERT_EQ("val2", Get(Key(2)));
Put(Key(3), "val3", wo);
ASSERT_EQ("val3", Get(Key(3)));
s = Flush();
ASSERT_OK(s);
ASSERT_EQ("val3", Get(Key(3)));
Put(Key(4), "val4", wo);
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeCloseTableFile",
[&](void*) { fault_fs->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
Put(Key(5), "val5", wo);
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
ASSERT_EQ("val5", Get(Key(5)));
SyncPoint::GetInstance()->DisableProcessing();
fault_fs->SetFilesystemActive(true);
s = dbfull()->Resume();
ASSERT_EQ(s, Status::OK());
ASSERT_EQ("val4", Get(Key(4)));
ASSERT_EQ("val5", Get(Key(5)));
Put(Key(6), "val6", wo);
ASSERT_EQ("val6", Get(Key(6)));
s = Flush();
ASSERT_OK(s);
ASSERT_EQ("val6", Get(Key(6)));
Put(Key(7), "val7", wo);
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeSyncTable",
[&](void*) { fault_fs->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
Put(Key(8), "val8", wo);
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
ASSERT_EQ("val8", Get(Key(8)));
SyncPoint::GetInstance()->DisableProcessing();
fault_fs->SetFilesystemActive(true);
s = dbfull()->Resume();
ASSERT_EQ(s, Status::OK());
ASSERT_EQ("val7", Get(Key(7)));
ASSERT_EQ("val8", Get(Key(8)));
Put(Key(9), "val9", wo);
ASSERT_EQ("val9", Get(Key(9)));
s = Flush();
ASSERT_OK(s);
ASSERT_EQ("val9", Get(Key(9)));
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, ManifestWriteError) {
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
@@ -1213,6 +1299,114 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
delete def_env;
}
// When the KV pair is put with the write option set to disable WAL, and a
// retryable error happens in this condition, the bg error is mapped to a
// soft error and auto resume is triggered. During auto resume, SwitchMemtable
// is disabled to avoid small SST tables. Writes can still be applied before
// the bg error is cleaned, unless the memtable is full.
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) {
// Activate the FS before the first resume
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
options.env = fault_fs_env.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000; // 0.1 second
Status s;
listener->EnableAutoRecovery(false);
DestroyAndReopen(options);
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
WriteOptions wo = WriteOptions();
wo.disableWAL = true;
Put(Key(1), "val1", wo);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"RecoverFromRetryableBGIOError:LoopOut",
"FLushWritNoWALRetryableeErrorAutoRecover1:1"}});
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
ASSERT_EQ("val1", Get(Key(1)));
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1");
ASSERT_EQ("val1", Get(Key(1)));
ASSERT_EQ("val1", Get(Key(1)));
SyncPoint::GetInstance()->DisableProcessing();
fault_fs->SetFilesystemActive(true);
Put(Key(2), "val2", wo);
s = Flush();
// Since auto resume fails, the bg error is not cleaned, and flush will
// return the bg_error set before.
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
ASSERT_EQ("val2", Get(Key(2)));
// call auto resume
s = dbfull()->Resume();
ASSERT_EQ(s, Status::OK());
Put(Key(3), "val3", wo);
s = Flush();
// After resume is successful, the flush should be ok.
ASSERT_EQ(s, Status::OK());
ASSERT_EQ("val3", Get(Key(3)));
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) {
// Activate the FS before the first resume
std::shared_ptr<FaultInjectionTestFS> fault_fs(
new FaultInjectionTestFS(FileSystem::Default()));
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
Options options = GetDefaultOptions();
options.env = fault_fs_env.get();
options.create_if_missing = true;
options.listeners.emplace_back(listener);
options.max_bgerror_resume_count = 2;
options.bgerror_resume_retry_interval = 100000; // 0.1 second
Status s;
listener->EnableAutoRecovery(false);
DestroyAndReopen(options);
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
WriteOptions wo = WriteOptions();
wo.disableWAL = true;
Put(Key(1), "val1", wo);
SyncPoint::GetInstance()->SetCallBack(
"BuildTable:BeforeFinishBuildTable",
[&](void*) { fault_fs->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
s = Flush();
ASSERT_EQ("val1", Get(Key(1)));
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
SyncPoint::GetInstance()->DisableProcessing();
fault_fs->SetFilesystemActive(true);
ASSERT_EQ(listener->WaitForRecovery(5000000), true);
ASSERT_EQ("val1", Get(Key(1)));
Put(Key(2), "val2", wo);
s = Flush();
// Since auto resume is successful, the bg error is cleaned, flush will
// be successful.
ASSERT_OK(s);
ASSERT_EQ("val2", Get(Key(2)));
Destroy(options);
}
TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover1) {
// Fail the first resume and make the second resume successful
std::shared_ptr<FaultInjectionTestFS> fault_fs(


@@ -115,6 +115,9 @@ enum class FlushReason : int {
kAutoCompaction = 0x09,
kManualFlush = 0x0a,
kErrorRecovery = 0xb,
+// When the flush reason is set to kErrorRecoveryRetryFlush, SwitchMemtable
+// will not be called, to avoid creating many small immutable memtables.
+kErrorRecoveryRetryFlush = 0xc,
};
enum class BackgroundErrorReason {
@@ -123,6 +126,7 @@ enum class BackgroundErrorReason {
kWriteCallback,
kMemTable,
kManifestWrite,
+kFlushNoWAL,
};
enum class WriteStallCondition {
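
For completeness, here is an application-side example (not part of this change; the class name and log messages are illustrative, while the EventListener callbacks themselves are existing RocksDB API) of observing the new kFlushNoWAL reason:

#include <iostream>
#include "rocksdb/listener.h"

class FlushNoWALErrorListener : public ROCKSDB_NAMESPACE::EventListener {
 public:
  void OnBackgroundError(ROCKSDB_NAMESPACE::BackgroundErrorReason reason,
                         ROCKSDB_NAMESPACE::Status* bg_error) override {
    if (reason == ROCKSDB_NAMESPACE::BackgroundErrorReason::kFlushNoWAL &&
        bg_error->severity() ==
            ROCKSDB_NAMESPACE::Status::Severity::kSoftError) {
      // Flush failed with the WAL disabled: writes are still accepted and
      // auto resume (if enabled) retries the flush without SwitchMemtable.
      std::cout << "Soft flush error, waiting for resume: "
                << bg_error->ToString() << std::endl;
    }
  }
  void OnErrorRecoveryCompleted(
      ROCKSDB_NAMESPACE::Status old_bg_error) override {
    std::cout << "Recovered from: " << old_bg_error.ToString() << std::endl;
  }
};

Such a listener is registered through Options::listeners, as the tests above do with ErrorHandlerFSListener.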