cleanup error_handler related code (#9098)

Summary:
Remove unused code, add comments, and remove redundant code by consolidating the repeated recovery_error_/bg_error_ update logic into a CheckAndSetRecoveryAndBGError helper.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9098

Test Plan: make check

Reviewed By: anand1976

Differential Revision: D32027219

Pulled By: zhichao-cao

fbshipit-source-id: 253aae926c87726268af6c027bf805dc9156c8a8
Author: Zhichao Cao, 2021-11-08 15:47:53 -08:00 (committed by Facebook GitHub Bot)
Parent: a7478070f3
Commit: efaef9b40a
3 changed files with 26 additions and 23 deletions
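
Note: most of the deletions below come from one repeated pattern in ErrorHandler::SetBGError (record the error into recovery_error_ if a recovery is in progress and nothing is recorded yet, then escalate bg_error_ only when the new error is more severe). That pattern appeared at three call sites and is now routed through the new CheckAndSetRecoveryAndBGError helper shown later in this diff. The standalone sketch below models the helper's behavior with simplified stand-in types; ToyStatus, ToyHandler, and the enum here are illustrative, not RocksDB's real classes.

#include <cassert>

// Simplified stand-ins for RocksDB's Status/severity machinery; the real
// helper is the one added in this commit.
enum class Severity { kNoError = 0, kSoftError, kHardError, kUnrecoverableError };

struct ToyStatus {
  Severity sev = Severity::kNoError;
  bool ok() const { return sev == Severity::kNoError; }
  Severity severity() const { return sev; }
};

class ToyHandler {
 public:
  // Mirrors the intent of ErrorHandler::CheckAndSetRecoveryAndBGError:
  // 1) if a recovery is in progress and no recovery error has been recorded
  //    yet, remember this one; 2) escalate bg_error_, never downgrade it.
  void CheckAndSetRecoveryAndBGError(const ToyStatus& bg_err) {
    if (recovery_in_prog_ && recovery_error_.ok()) {
      recovery_error_ = bg_err;
    }
    if (bg_err.severity() > bg_error_.severity()) {
      bg_error_ = bg_err;
    }
  }

  bool recovery_in_prog_ = false;
  ToyStatus recovery_error_;
  ToyStatus bg_error_;
};

int main() {
  ToyHandler h;
  h.CheckAndSetRecoveryAndBGError(ToyStatus{Severity::kHardError});
  h.CheckAndSetRecoveryAndBGError(ToyStatus{Severity::kSoftError});  // no downgrade
  assert(h.bg_error_.severity() == Severity::kHardError);
  return 0;
}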


@@ -406,10 +406,7 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// it can directly overwrite any existing bg_error_.
bool auto_recovery = false;
Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
bg_error_ = bg_err;
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
@@ -468,24 +465,14 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
// to many small memtable being generated during auto resume, the flush
// reason is set to kErrorRecoveryRetryFlush.
Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
soft_error_no_bg_work_ = true;
context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
} else {
Status bg_err(new_bg_io_err, Status::Severity::kHardError);
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
CheckAndSetRecoveryAndBGError(bg_err);
recover_context_ = context;
return StartRecoverFromRetryableBGIOError(bg_io_err);
}
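
Both branches above wrap the same retryable IO error, differing only in the severity attached before the shared helper is called; that severity is what later decides whether bg_error_ is overwritten. A minimal sketch of the wrapping with RocksDB's public Status/IOStatus API (it needs to compile and link against librocksdb; the error message is made up):

#include <cassert>
#include "rocksdb/io_status.h"
#include "rocksdb/status.h"

int main() {
  using ROCKSDB_NAMESPACE::IOStatus;
  using ROCKSDB_NAMESPACE::Status;

  // A retryable IO error, as the flush path would report it.
  IOStatus io_err = IOStatus::IOError("simulated write failure");
  io_err.SetRetryable(true);

  // The error handler wraps the same error with different severities
  // depending on whether background work must stop.
  Status soft(io_err, Status::Severity::kSoftError);
  Status hard(io_err, Status::Severity::kHardError);

  // CheckAndSetRecoveryAndBGError only overwrites bg_error_ when severity
  // increases, so a hard error is never masked by a later soft one.
  assert(hard.severity() > soft.severity());
  return 0;
}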
@@ -663,7 +650,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
if (recovery_io_error_.ok() && recovery_error_.ok()) {
return recovery_error_;
} else {
TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail");
return bg_error_;
}
#else
@@ -677,7 +663,6 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
void ErrorHandler::RecoverFromRetryableBGIOError() {
#ifndef ROCKSDB_LITE
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart1");
InstrumentedMutexLock l(db_mutex_);
if (end_recovery_) {
return;
@@ -697,8 +682,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
recovery_error_ = Status::OK();
retry_count++;
Status s = db_->ResumeImpl(context);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0");
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1");
if (bg_error_stats_ != nullptr) {
RecordTick(bg_error_stats_.get(),
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
@@ -707,7 +690,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
bg_error_.severity() >= Status::Severity::kFatalError) {
// If DB shutdown in progress or the error severity is higher than
// Hard Error, stop auto resume and returns.
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0");
recovery_in_prog_ = false;
if (bg_error_stats_ != nullptr) {
RecordInHistogram(bg_error_stats_.get(),
@@ -725,7 +707,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
cv_.TimedWait(wait_until);
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterWait0");
} else {
// There are three possibility: 1) recover_io_error is set during resume
// and the error is not retryable, 2) recover is successful, 3) other
@@ -751,7 +732,6 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
}
return;
} else {
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail1");
// In this case: 1) recovery_io_error is more serious or not retryable
// 2) other Non IO recovery_error happens. The auto recovery stops.
recovery_in_prog_ = false;
@@ -776,6 +756,16 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
#endif
}
void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
if (recovery_in_prog_ && recovery_error_.ok()) {
recovery_error_ = bg_err;
}
if (bg_err.severity() > bg_error_.severity()) {
bg_error_ = bg_err;
}
return;
}
void ErrorHandler::EndAutoRecovery() {
db_mutex_->AssertHeld();
if (!end_recovery_) {
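
The TEST_SYNC_POINT markers removed above (BeforeStart1, AfterResume0/1, RecoverFail0/1, AfterWait0, and StartRecoverRetryableBGIOError:RecoverFail) only take effect when a test registers that exact name, and per the summary they had no users, so deleting them changes no behavior. For reference, a sketch of how a test arms one of the markers that is kept, RecoverFromRetryableBGIOError:BeforeStart; it uses the in-tree test_util/sync_point.h helper, so it only builds inside the RocksDB source tree:

#include "test_util/sync_point.h"

// Register an observer on a kept sync point; markers nobody registers for,
// like the ones deleted in this commit, are inert even in debug builds.
void ArmRecoveryObserver(bool* saw_recovery_start) {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sp->SetCallBack("RecoverFromRetryableBGIOError:BeforeStart",
                  [saw_recovery_start](void* /*arg*/) { *saw_recovery_start = true; });
  sp->EnableProcessing();
}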


@@ -116,6 +116,10 @@ class ErrorHandler {
void RecoverFromNoSpace();
const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
void RecoverFromRetryableBGIOError();
// First, if recovery is in progress and recovery_error_ is OK, set
// recovery_error_ to bg_err. Second, if the severity of bg_err is higher
// than the current bg_error_, overwrite bg_error_ with it.
void CheckAndSetRecoveryAndBGError(const Status& bg_err);
};
} // namespace ROCKSDB_NAMESPACE


@@ -487,6 +487,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
Destroy(options);
}
// The flush error is injected before we finish the table build
TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
@@ -542,6 +543,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
Destroy(options);
}
// The retryable IO error is injected before we sync table
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
@@ -585,6 +587,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
Destroy(options);
}
// The retryable IO error is injected before we close the table file
TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
std::shared_ptr<ErrorHandlerFSListener> listener(
new ErrorHandlerFSListener());
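
The comments added in the three hunks above pin down where each test injects its retryable error: before the table build finishes, before the table file is synced, and before the table file is closed. The tests do that by flipping the fault-injection filesystem into an error state from a sync point callback at the chosen step. A trimmed sketch of that pattern follows, assuming the in-tree FaultInjectionTestFS; the sync point name used here is an illustrative assumption, and the real tests hook whichever marker matches the step named in each comment:

#include <memory>

#include "rocksdb/io_status.h"
#include "test_util/sync_point.h"
#include "utilities/fault_injection_fs.h"

// Arrange for a retryable IOError to surface at a chosen step of the flush.
void InjectRetryableFlushError(
    std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs) {
  ROCKSDB_NAMESPACE::IOStatus error =
      ROCKSDB_NAMESPACE::IOStatus::IOError("injected flush error");
  error.SetRetryable(true);
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  // Assumed marker name; pick the sync point that matches the step under test.
  sp->SetCallBack("BuildTable:BeforeFinishBuildTable", [fault_fs, error](void*) {
    fault_fs->SetFilesystemActive(false, error);
  });
  sp->EnableProcessing();
}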
@@ -1874,6 +1877,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
Destroy(options);
}
// Auto resume from the flush retryable IO error. Activate the FS before the
// first resume. Resume is successful
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
// Activate the FS before the first resume
std::shared_ptr<ErrorHandlerFSListener> listener(
@@ -1914,6 +1919,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
Destroy(options);
}
// Auto resume from the flush retryable IO error and set the retry limit count.
// Never activate the FS and auto resume should fail at the end
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
// Fail all the resume and let user to resume
std::shared_ptr<ErrorHandlerFSListener> listener(
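
The AutoRecover tests around this hunk differ in whether the filesystem ever comes back: AutoRecover1 re-activates it before the first resume, while AutoRecover2 never does and expects auto resume to give up once the retry limit is reached. That budget comes from DB options; to the best of my knowledge these are the knobs involved (an illustrative helper, not code from the test):

#include "rocksdb/options.h"

// Bound the auto-resume retry loop that these tests exercise.
ROCKSDB_NAMESPACE::Options BoundedAutoResumeOptions() {
  ROCKSDB_NAMESPACE::Options options;
  options.max_bgerror_resume_count = 2;            // stop after two resume attempts
  options.bgerror_resume_retry_interval = 100000;  // microseconds between attempts (0.1 s)
  return options;
}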
@@ -1963,6 +1970,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
Destroy(options);
}
// Auto resume from the flush retryable IO error and set the retry limit count.
// Fail the first resume and let the second resume be successful.
TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
// Fail the first resume and let the second resume be successful
std::shared_ptr<ErrorHandlerFSListener> listener(