diff --git a/HISTORY.md b/HISTORY.md
index 543ad6048..d6e8f922c 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,7 +1,9 @@
 # Rocksdb Change Log
+
 ## 6.16.0 (12/18/2020)
 ### Behavior Changes
 * Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation.
+* Since RocksDB does not continue writing to a file once a write to it has failed for any reason, a file-scope write IO error is treated the same as a retryable IO error. More information about the handling of file-scope IO errors can be found in `ErrorHandler::SetBGError`.
 ### Bug Fixes
 * Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`.
diff --git a/db/error_handler.cc b/db/error_handler.cc
index f121519f4..cc313f0a9 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -350,12 +350,17 @@ const Status& ErrorHandler::SetBGError(const Status& bg_err,
 // This is the main function for looking at IO related error during the
 // background operations. The main logic is:
+// Note: a file scope IO error is treated as a retryable IO error in the
+//    write path. If a file write fails with a file scope error, RocksDB
+//    never writes to the same file again; it creates a new file and
+//    rewrites the whole content. Thus, the error is retryable.
 // 1) if the error is caused by data loss, the error is mapped to
 //    unrecoverable error. Application/user must take action to handle
-//    this situation.
-// 2) if the error is a Retryable IO error, auto resume will be called and the
-//    auto resume can be controlled by resume count and resume interval
-//    options. There are three sub-cases:
+//    this situation (the file scope case is excluded).
+// 2) if the error is a Retryable IO error (i.e., it is a file scope IO error,
+//    or its retryable flag is set and it is not a data loss error), auto
+//    resume will be called and the auto resume can be controlled by the
+//    resume count and resume interval options. There are three sub-cases:
 // a) if the error happens during compaction, it is mapped to a soft error.
 //    the compaction thread will reschedule a new compaction.
 // b) if the error happens during flush and also WAL is empty, it is mapped
@@ -384,9 +389,10 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
   Status new_bg_io_err = bg_io_err;
   DBRecoverContext context;
-  if (bg_io_err.GetDataLoss()) {
-    // First, data loss is treated as unrecoverable error. So it can directly
-    // overwrite any existing bg_error_.
+  if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
+      bg_io_err.GetDataLoss()) {
+    // First, data loss (non file scope) is treated as an unrecoverable
+    // error. So it can directly overwrite any existing bg_error_.
     bool auto_recovery = false;
     Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
     bg_error_ = bg_err;
@@ -397,13 +403,15 @@ const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err,
                                           &bg_err, db_mutex_, &auto_recovery);
     recover_context_ = context;
     return bg_error_;
-  } else if (bg_io_err.GetRetryable()) {
-    // Second, check if the error is a retryable IO error or not. if it is
if it is - // retryable error and its severity is higher than bg_error_, overwrite - // the bg_error_ with new error. - // In current stage, for retryable IO error of compaction, treat it as - // soft error. In other cases, treat the retryable IO error as hard - // error. + } else if (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable()) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. bool auto_recovery = false; EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &new_bg_io_err, db_mutex_, diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index c17cac290..29345505d 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -241,6 +241,90 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { Destroy(options); } +TEST_F(DBErrorHandlingFSTest, FLushWritFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), 
"val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { std::shared_ptr listener( new ErrorHandlerFSListener()); @@ -453,6 +537,52 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { Close(); } +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); @@ -779,6 +909,54 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { Destroy(options); } +TEST_F(DBErrorHandlingFSTest, CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) 
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(Key(1), "val"));
+  s = Flush();
+  ASSERT_OK(s);
+
+  s = dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+  s = dbfull()->Resume();
+  ASSERT_OK(s);
+  Destroy(options);
+}
+
 TEST_F(DBErrorHandlingFSTest, CorruptionError) {
   Options options = GetDefaultOptions();
   options.env = fault_env_.get();
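
For readers tracing the new classification logic, the sketch below shows, outside of RocksDB's own sources, how the two checks added to `ErrorHandler::SetBGError` compose: an error flagged as data loss but scoped to a single file skips the unrecoverable branch and is classified as retryable instead. `ClassifyBGError` and `BGErrorClass` are hypothetical names introduced only for this illustration; the `IOStatus` getters and setters are the same ones exercised by the new tests, and the snippet assumes the public `rocksdb/io_status.h` header is available.

// classify_bg_error_sketch.cc -- illustrative only, not part of this patch.
#include "rocksdb/io_status.h"

using ROCKSDB_NAMESPACE::IOStatus;

// Hypothetical classification mirroring the branch order added to
// ErrorHandler::SetBGError in this change.
enum class BGErrorClass { kUnrecoverable, kRetryable, kOther };

BGErrorClass ClassifyBGError(const IOStatus& s) {
  const bool file_scope =
      s.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile;
  if (!file_scope && s.GetDataLoss()) {
    // Non file scope data loss: still mapped to an unrecoverable error.
    return BGErrorClass::kUnrecoverable;
  } else if (file_scope || s.GetRetryable()) {
    // File scope errors join the retryable path introduced by this change.
    return BGErrorClass::kRetryable;
  }
  return BGErrorClass::kOther;
}

int main() {
  // The same error the new tests inject: data loss flagged, but file scope.
  IOStatus err = IOStatus::IOError("File Scope Data Loss Error");
  err.SetDataLoss(true);
  err.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
  err.SetRetryable(false);
  // Exits with 0 if the error lands in the retryable bucket, 1 otherwise.
  return ClassifyBGError(err) == BGErrorClass::kRetryable ? 0 : 1;
}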