cleanup error_handler related code (#9098)

Summary:
Remove unused code, add comments, and remove redundant code by consolidating the repeated recovery_error_/bg_error_ update logic into a CheckAndSetRecoveryAndBGError helper.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9098

Test Plan: make check

Reviewed By: anand1976

Differential Revision: D32027219

Pulled By: zhichao-cao

fbshipit-source-id: 253aae926c87726268af6c027bf805dc9156c8a8
Author: Zhichao Cao, 2021-11-08 15:47:53 -08:00 (committed by Facebook GitHub Bot)
Parent: a7478070f3
Commit: efaef9b40a
3 changed files with 26 additions and 23 deletions
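
Note: most of the deletions below come from one repeated pattern in ErrorHandler::SetBGError (record the error into recovery_error_ if a recovery is in progress and nothing is recorded yet, then escalate bg_error_ only when the new error is more severe). That pattern appeared at three call sites and is now routed through the new CheckAndSetRecoveryAndBGError helper shown later in this diff. The standalone sketch below models the helper's behavior with simplified stand-in types; ToyStatus, ToyHandler, and the enum here are illustrative, not RocksDB's real classes.

#include <cassert>

// Simplified stand-ins for RocksDB's Status/severity machinery; the real
// helper is the one added in this commit.
enum class Severity { kNoError = 0, kSoftError, kHardError, kUnrecoverableError };

struct ToyStatus {
  Severity sev = Severity::kNoError;
  bool ok() const { return sev == Severity::kNoError; }
  Severity severity() const { return sev; }
};

class ToyHandler {
 public:
  // Mirrors the intent of ErrorHandler::CheckAndSetRecoveryAndBGError:
  // 1) if a recovery is in progress and no recovery error has been recorded
  //    yet, remember this one; 2) escalate bg_error_, never downgrade it.
  void CheckAndSetRecoveryAndBGError(const ToyStatus& bg_err) {
    if (recovery_in_prog_ && recovery_error_.ok()) {
      recovery_error_ = bg_err;
    }
    if (bg_err.severity() > bg_error_.severity()) {
      bg_error_ = bg_err;
    }
  }

  bool recovery_in_prog_ = false;
  ToyStatus recovery_error_;
  ToyStatus bg_error_;
};

int main() {
  ToyHandler h;
  h.CheckAndSetRecoveryAndBGError(ToyStatus{Severity::kHardError});
  h.CheckAndSetRecoveryAndBGError(ToyStatus{Severity::kSoftError});  // no downgrade
  assert(h.bg_error_.severity() == Severity::kHardError);
  return 0;
}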


@@ -406,10 +406,7 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// it can directly overwrite any existing bg_error_.
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
bg_error_ = bg_err;
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
@@ -468,24 +465,14 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// to many small memtable being generated during auto resume, the flush
// reason is set to kErrorRecoveryRetryFlush.
Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
soft_error_no_bg_work_ = true;
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
} else {
Status bg_err(new_bg_io_err, Status::Severity::kHardError);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
}
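
Both branches above wrap the same retryable IO error, differing only in the severity attached before the shared helper is called; that severity is what later decides whether bg_error_ is overwritten. A minimal sketch of the wrapping with RocksDB's public Status/IOStatus API (it needs to compile and link against librocksdb; the error message is made up):

#include <cassert>
#include "rocksdb/io_status.h"
#include "rocksdb/status.h"

int main() {
  using ROCKSDB_NAMESPACE::IOStatus;
  using ROCKSDB_NAMESPACE::Status;

  // A retryable IO error, as the flush path would report it.
  IOStatus io_err = IOStatus::IOError("simulated write failure");
  io_err.SetRetryable(true);

  // The error handler wraps the same error with different severities
  // depending on whether background work must stop.
  Status soft(io_err, Status::Severity::kSoftError);
  Status hard(io_err, Status::Severity::kHardError);

  // CheckAndSetRecoveryAndBGError only overwrites bg_error_ when severity
  // increases, so a hard error is never masked by a later soft one.
  assert(hard.severity() > soft.severity());
  return 0;
}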
@@ -663,7 +650,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
if (recovery_io_error_.ok() && recovery_error_.ok()) {
return recovery_error_;
} else {
TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail");
return bg_error_;
}
#else
@@ -677,7 +663,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
void ErrorHandler::RecoverFromRetryableBGIOError() {
#ifndef ROCKSDB_LITE
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart1");
InstrumentedMutexLock l(db_mutex_);
if (end_recovery_) {
return;
@@ -697,8 +682,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
recovery_error_ = Status::OK();
retry_count++;
Status s = db_->ResumeImpl(context);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
@@ -707,7 +690,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
bg_error_.severity() >= Status::Severity::kFatalError) {
// If DB shutdown in progress or the error severity is higher than
// Hard Error, stop auto resume and returns.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0");
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
@@ -725,7 +707,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
cv_.TimedWait(wait_until);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterWait0");
} else {
// There are three possibility: 1) recover_io_error is set during resume
// and the error is not retryable, 2) recover is successful, 3) other
@@ -751,7 +732,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
}
return;
} else {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail1");
// In this case: 1) recovery_io_error is more serious or not retryable
// 2) other Non IO recovery_error happens. The auto recovery stops.
recovery_in_prog_ = false;
@@ -776,6 +756,16 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
#endif
}
void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
return;
}
void ErrorHandler::EndAutoRecovery() {
db_mutex_->AssertHeld();
if (!end_recovery_) {
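
The TEST_SYNC_POINT markers removed above (BeforeStart1, AfterResume0/1, RecoverFail0/1, AfterWait0, and StartRecoverRetryableBGIOError:RecoverFail) only take effect when a test registers that exact name, and per the summary they had no users, so deleting them changes no behavior. For reference, a sketch of how a test arms one of the markers that is kept, RecoverFromRetryableBGIOError:BeforeStart; it uses the in-tree test_util/sync_point.h helper, so it only builds inside the RocksDB source tree:

#include "test_util/sync_point.h"

// Register an observer on a kept sync point; markers nobody registers for,
// like the ones deleted in this commit, are inert even in debug builds.
void ArmRecoveryObserver(bool* saw_recovery_start) {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sp->SetCallBack("RecoverFromRetryableBGIOError:BeforeStart",
                  [saw_recovery_start](void* /*arg*/) { *saw_recovery_start = true; });
  sp->EnableProcessing();
}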


@@ -116,6 +116,10 @@ class ErrorHandler {
void RecoverFromNoSpace();
const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
void RecoverFromRetryableBGIOError();
// First, if recovery is in progress and recovery_error_ is OK, set
// recovery_error_ to bg_err. Second, if the severity of bg_err is higher
// than the current bg_error_, overwrite bg_error_ with it.
void CheckAndSetRecoveryAndBGError(const Status& bg_err);
};
} // namespace ROCKSDB_NAMESPACE


@@ -487,6 +487,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
Destroy(options);
}
// The flush error is injected before we finish the table build
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
@@ -542,6 +543,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
Destroy(options);
}
// The retryable IO error is injected before we sync table
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
@@ -585,6 +587,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
Destroy(options);
}
// The retryable IO error is injected before we close the table file
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
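
The comments added in the three hunks above pin down where each test injects its retryable error: before the table build finishes, before the table file is synced, and before the table file is closed. The tests do that by flipping the fault-injection filesystem into an error state from a sync point callback at the chosen step. A trimmed sketch of that pattern follows, assuming the in-tree FaultInjectionTestFS; the sync point name used here is an illustrative assumption, and the real tests hook whichever marker matches the step named in each comment:

#include <memory>

#include "rocksdb/io_status.h"
#include "test_util/sync_point.h"
#include "utilities/fault_injection_fs.h"

// Arrange for a retryable IOError to surface at a chosen step of the flush.
void InjectRetryableFlushError(
    std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs) {
  ROCKSDB_NAMESPACE::IOStatus error =
      ROCKSDB_NAMESPACE::IOStatus::IOError("injected flush error");
  error.SetRetryable(true);
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  // Assumed marker name; pick the sync point that matches the step under test.
  sp->SetCallBack("BuildTable:BeforeFinishBuildTable", [fault_fs, error](void*) {
    fault_fs->SetFilesystemActive(false, error);
  });
  sp->EnableProcessing();
}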
@@ -1874,6 +1877,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
Destroy(options);
}
// Auto resume from the flush retryable IO error. Activate the FS before the
// first resume. Resume is successful
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
// Activate the FS before the first resume
std::shared_ptr<ErrorHandlerFSListener> listener(
@@ -1914,6 +1919,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
Destroy(options);
}
// Auto resume from the flush retryable IO error and set the retry limit count.
// Never activate the FS and auto resume should fail at the end
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
// Fail all the resume and let user to resume
std::shared_ptr<ErrorHandlerFSListener> listener(
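
The AutoRecover tests around this hunk differ in whether the filesystem ever comes back: AutoRecover1 re-activates it before the first resume, while AutoRecover2 never does and expects auto resume to give up once the retry limit is reached. That budget comes from DB options; to the best of my knowledge these are the knobs involved (an illustrative helper, not code from the test):

#include "rocksdb/options.h"

// Bound the auto-resume retry loop that these tests exercise.
ROCKSDB_NAMESPACE::Options BoundedAutoResumeOptions() {
  ROCKSDB_NAMESPACE::Options options;
  options.max_bgerror_resume_count = 2;            // stop after two resume attempts
  options.bgerror_resume_retry_interval = 100000;  // microseconds between attempts (0.1 s)
  return options;
}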
@@ -1963,6 +1970,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
Destroy(options);
}
// Auto resume from the flush retryable IO error and set the retry limit count.
// Fail the first resume and let the second resume be successful.
TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
// Fail the first resume and let the second resume be successful
std::shared_ptr<ErrorHandlerFSListener> listener(