Add a listener callback for end of auto error recovery (#9244)
Summary: Previously, the OnErrorRecoveryCompleted callback was called when RocksDB was able to successfully recover from a retryable error. However, if the recovery failed and was eventually stopped, there was no indication of the status. To fix that, a new OnErrorRecoveryEnd callback is introduced that deprecates the OnErrorRecoveryCompleted callback. The new callback is called with the original error and the new error status. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9244 Test Plan: Add a new unit test in error_handler_fs_test Reviewed By: zhichao-cao Differential Revision: D32922303 Pulled By: anand1976 fbshipit-source-id: f04e77a9cb92c5ea6385590682d3fcf559971b99
This commit is contained in:
parent
9e4d56f2c9
commit
ecf2bec613
@ -9,6 +9,7 @@
|
|||||||
* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
|
* MemTableList::TrimHistory now use allocated bytes when max_write_buffer_size_to_maintain > 0(default in TrasactionDB, introduced in PR#5022) Fix #8371.
|
||||||
### Public API change
|
### Public API change
|
||||||
* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
|
* Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
|
||||||
|
* Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
|
||||||
|
|
||||||
### Performance Improvements
|
### Performance Improvements
|
||||||
* Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.
|
* Replaced map property `TableProperties::properties_offsets` with uint64_t property `external_sst_file_global_seqno_offset` to save table properties's memory.
|
||||||
|
@ -540,6 +540,8 @@ Status ErrorHandler::ClearBGError() {
|
|||||||
// Signal that recovery succeeded
|
// Signal that recovery succeeded
|
||||||
if (recovery_error_.ok()) {
|
if (recovery_error_.ok()) {
|
||||||
Status old_bg_error = bg_error_;
|
Status old_bg_error = bg_error_;
|
||||||
|
// old_bg_error is only for notifying listeners, so may not be checked
|
||||||
|
old_bg_error.PermitUncheckedError();
|
||||||
// Clear and check the recovery IO and BG error
|
// Clear and check the recovery IO and BG error
|
||||||
bg_error_ = Status::OK();
|
bg_error_ = Status::OK();
|
||||||
recovery_io_error_ = IOStatus::OK();
|
recovery_io_error_ = IOStatus::OK();
|
||||||
@ -547,8 +549,8 @@ Status ErrorHandler::ClearBGError() {
|
|||||||
recovery_io_error_.PermitUncheckedError();
|
recovery_io_error_.PermitUncheckedError();
|
||||||
recovery_in_prog_ = false;
|
recovery_in_prog_ = false;
|
||||||
soft_error_no_bg_work_ = false;
|
soft_error_no_bg_work_ = false;
|
||||||
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
|
||||||
old_bg_error, db_mutex_);
|
bg_error_, db_mutex_);
|
||||||
}
|
}
|
||||||
return recovery_error_;
|
return recovery_error_;
|
||||||
#else
|
#else
|
||||||
@ -665,6 +667,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|||||||
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
|
||||||
InstrumentedMutexLock l(db_mutex_);
|
InstrumentedMutexLock l(db_mutex_);
|
||||||
if (end_recovery_) {
|
if (end_recovery_) {
|
||||||
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
||||||
|
Status::ShutdownInProgress(),
|
||||||
|
db_mutex_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
DBRecoverContext context = recover_context_;
|
DBRecoverContext context = recover_context_;
|
||||||
@ -674,6 +679,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|||||||
// Recover from the retryable error. Create a separate thread to do it.
|
// Recover from the retryable error. Create a separate thread to do it.
|
||||||
while (resume_count > 0) {
|
while (resume_count > 0) {
|
||||||
if (end_recovery_) {
|
if (end_recovery_) {
|
||||||
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
||||||
|
Status::ShutdownInProgress(),
|
||||||
|
db_mutex_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
|
||||||
@ -695,6 +703,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|||||||
RecordInHistogram(bg_error_stats_.get(),
|
RecordInHistogram(bg_error_stats_.get(),
|
||||||
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
|
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
|
||||||
}
|
}
|
||||||
|
EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
|
||||||
|
bg_error_, db_mutex_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!recovery_io_error_.ok() &&
|
if (!recovery_io_error_.ok() &&
|
||||||
@ -718,8 +728,8 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|||||||
Status old_bg_error = bg_error_;
|
Status old_bg_error = bg_error_;
|
||||||
bg_error_ = Status::OK();
|
bg_error_ = Status::OK();
|
||||||
bg_error_.PermitUncheckedError();
|
bg_error_.PermitUncheckedError();
|
||||||
EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners,
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
||||||
old_bg_error, db_mutex_);
|
db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
|
||||||
if (bg_error_stats_ != nullptr) {
|
if (bg_error_stats_ != nullptr) {
|
||||||
RecordTick(bg_error_stats_.get(),
|
RecordTick(bg_error_stats_.get(),
|
||||||
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
|
ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
|
||||||
@ -739,12 +749,21 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|||||||
RecordInHistogram(bg_error_stats_.get(),
|
RecordInHistogram(bg_error_stats_.get(),
|
||||||
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
|
ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
|
||||||
}
|
}
|
||||||
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
||||||
|
db_options_.listeners, bg_error_,
|
||||||
|
!recovery_io_error_.ok()
|
||||||
|
? recovery_io_error_
|
||||||
|
: (!recovery_error_.ok() ? recovery_error_ : s),
|
||||||
|
db_mutex_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
resume_count--;
|
resume_count--;
|
||||||
}
|
}
|
||||||
recovery_in_prog_ = false;
|
recovery_in_prog_ = false;
|
||||||
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
||||||
|
db_options_.listeners, bg_error_,
|
||||||
|
Status::Aborted("Exceeded resume retry count"), db_mutex_);
|
||||||
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
|
||||||
if (bg_error_stats_ != nullptr) {
|
if (bg_error_stats_ != nullptr) {
|
||||||
RecordInHistogram(bg_error_stats_.get(),
|
RecordInHistogram(bg_error_stats_.get(),
|
||||||
|
@ -66,6 +66,7 @@ class ErrorHandlerFSListener : public EventListener {
|
|||||||
~ErrorHandlerFSListener() {
|
~ErrorHandlerFSListener() {
|
||||||
file_creation_error_.PermitUncheckedError();
|
file_creation_error_.PermitUncheckedError();
|
||||||
bg_error_.PermitUncheckedError();
|
bg_error_.PermitUncheckedError();
|
||||||
|
new_bg_error_.PermitUncheckedError();
|
||||||
}
|
}
|
||||||
|
|
||||||
void OnTableFileCreationStarted(
|
void OnTableFileCreationStarted(
|
||||||
@ -96,6 +97,13 @@ class ErrorHandlerFSListener : public EventListener {
|
|||||||
old_bg_error.PermitUncheckedError();
|
old_bg_error.PermitUncheckedError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
|
||||||
|
InstrumentedMutexLock l(&mutex_);
|
||||||
|
recovery_complete_ = true;
|
||||||
|
cv_.SignalAll();
|
||||||
|
new_bg_error_ = info.new_bg_error;
|
||||||
|
}
|
||||||
|
|
||||||
bool WaitForRecovery(uint64_t /*abs_time_us*/) {
|
bool WaitForRecovery(uint64_t /*abs_time_us*/) {
|
||||||
InstrumentedMutexLock l(&mutex_);
|
InstrumentedMutexLock l(&mutex_);
|
||||||
while (!recovery_complete_) {
|
while (!recovery_complete_) {
|
||||||
@ -138,6 +146,8 @@ class ErrorHandlerFSListener : public EventListener {
|
|||||||
file_creation_error_ = io_s;
|
file_creation_error_ = io_s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status new_bg_error() { return new_bg_error_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
InstrumentedMutex mutex_;
|
InstrumentedMutex mutex_;
|
||||||
InstrumentedCondVar cv_;
|
InstrumentedCondVar cv_;
|
||||||
@ -148,6 +158,7 @@ class ErrorHandlerFSListener : public EventListener {
|
|||||||
int file_count_;
|
int file_count_;
|
||||||
IOStatus file_creation_error_;
|
IOStatus file_creation_error_;
|
||||||
Status bg_error_;
|
Status bg_error_;
|
||||||
|
Status new_bg_error_;
|
||||||
FaultInjectionTestFS* fault_fs_;
|
FaultInjectionTestFS* fault_fs_;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -2428,6 +2439,42 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
|
|||||||
Close();
|
Close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fail auto resume from a flush retryable error and verify that
|
||||||
|
// OnErrorRecoveryEnd listener callback is called
|
||||||
|
TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
|
||||||
|
// Activate the FS before the first resume
|
||||||
|
std::shared_ptr<ErrorHandlerFSListener> listener(
|
||||||
|
new ErrorHandlerFSListener());
|
||||||
|
Options options = GetDefaultOptions();
|
||||||
|
options.env = fault_env_.get();
|
||||||
|
options.create_if_missing = true;
|
||||||
|
options.listeners.emplace_back(listener);
|
||||||
|
options.max_bgerror_resume_count = 2;
|
||||||
|
options.bgerror_resume_retry_interval = 100000; // 0.1 second
|
||||||
|
Status s;
|
||||||
|
|
||||||
|
listener->EnableAutoRecovery(false);
|
||||||
|
DestroyAndReopen(options);
|
||||||
|
|
||||||
|
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
|
||||||
|
error_msg.SetRetryable(true);
|
||||||
|
|
||||||
|
ASSERT_OK(Put(Key(1), "val1"));
|
||||||
|
SyncPoint::GetInstance()->SetCallBack(
|
||||||
|
"BuildTable:BeforeFinishBuildTable",
|
||||||
|
[&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
|
||||||
|
|
||||||
|
SyncPoint::GetInstance()->EnableProcessing();
|
||||||
|
s = Flush();
|
||||||
|
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
|
||||||
|
ASSERT_EQ(listener->WaitForRecovery(5000000), true);
|
||||||
|
ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
|
||||||
|
SyncPoint::GetInstance()->DisableProcessing();
|
||||||
|
fault_fs_->SetFilesystemActive(true);
|
||||||
|
|
||||||
|
Destroy(options);
|
||||||
|
}
|
||||||
|
|
||||||
class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
|
class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
|
||||||
public testing::WithParamInterface<bool> {};
|
public testing::WithParamInterface<bool> {};
|
||||||
|
|
||||||
|
@ -232,23 +232,30 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
|
|||||||
#endif // !ROCKSDB_LITE
|
#endif // !ROCKSDB_LITE
|
||||||
}
|
}
|
||||||
|
|
||||||
void EventHelpers::NotifyOnErrorRecoveryCompleted(
|
void EventHelpers::NotifyOnErrorRecoveryEnd(
|
||||||
const std::vector<std::shared_ptr<EventListener>>& listeners,
|
const std::vector<std::shared_ptr<EventListener>>& listeners,
|
||||||
Status old_bg_error, InstrumentedMutex* db_mutex) {
|
const Status& old_bg_error, const Status& new_bg_error,
|
||||||
|
InstrumentedMutex* db_mutex) {
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
if (!listeners.empty()) {
|
if (!listeners.empty()) {
|
||||||
db_mutex->AssertHeld();
|
db_mutex->AssertHeld();
|
||||||
// release lock while notifying events
|
// release lock while notifying events
|
||||||
db_mutex->Unlock();
|
db_mutex->Unlock();
|
||||||
for (auto& listener : listeners) {
|
for (auto& listener : listeners) {
|
||||||
|
BackgroundErrorRecoveryInfo info;
|
||||||
|
info.old_bg_error = old_bg_error;
|
||||||
|
info.new_bg_error = new_bg_error;
|
||||||
listener->OnErrorRecoveryCompleted(old_bg_error);
|
listener->OnErrorRecoveryCompleted(old_bg_error);
|
||||||
|
listener->OnErrorRecoveryEnd(info);
|
||||||
|
info.old_bg_error.PermitUncheckedError();
|
||||||
|
info.new_bg_error.PermitUncheckedError();
|
||||||
}
|
}
|
||||||
db_mutex->Lock();
|
db_mutex->Lock();
|
||||||
}
|
}
|
||||||
old_bg_error.PermitUncheckedError();
|
|
||||||
#else
|
#else
|
||||||
(void)listeners;
|
(void)listeners;
|
||||||
(void)old_bg_error;
|
(void)old_bg_error;
|
||||||
|
(void)new_bg_error;
|
||||||
(void)db_mutex;
|
(void)db_mutex;
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
}
|
}
|
||||||
|
@ -43,9 +43,10 @@ class EventHelpers {
|
|||||||
uint64_t file_number, const std::string& file_path,
|
uint64_t file_number, const std::string& file_path,
|
||||||
const Status& status, const std::string& db_name,
|
const Status& status, const std::string& db_name,
|
||||||
const std::vector<std::shared_ptr<EventListener>>& listeners);
|
const std::vector<std::shared_ptr<EventListener>>& listeners);
|
||||||
static void NotifyOnErrorRecoveryCompleted(
|
static void NotifyOnErrorRecoveryEnd(
|
||||||
const std::vector<std::shared_ptr<EventListener>>& listeners,
|
const std::vector<std::shared_ptr<EventListener>>& listeners,
|
||||||
Status bg_error, InstrumentedMutex* db_mutex);
|
const Status& old_bg_error, const Status& new_bg_error,
|
||||||
|
InstrumentedMutex* db_mutex);
|
||||||
|
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
static void NotifyBlobFileCreationStarted(
|
static void NotifyBlobFileCreationStarted(
|
||||||
|
@ -453,6 +453,16 @@ struct ExternalFileIngestionInfo {
|
|||||||
TableProperties table_properties;
|
TableProperties table_properties;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Result of auto background error recovery
|
||||||
|
struct BackgroundErrorRecoveryInfo {
|
||||||
|
// The original error that triggered the recovery
|
||||||
|
Status old_bg_error;
|
||||||
|
|
||||||
|
// The final bg_error after all recovery attempts. Status::OK() means
|
||||||
|
// the recovery was successful and the database is fully operational.
|
||||||
|
Status new_bg_error;
|
||||||
|
};
|
||||||
|
|
||||||
struct IOErrorInfo {
|
struct IOErrorInfo {
|
||||||
IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
|
IOErrorInfo(const IOStatus& _io_status, FileOperationType _operation,
|
||||||
const std::string& _file_path, size_t _length, uint64_t _offset)
|
const std::string& _file_path, size_t _length, uint64_t _offset)
|
||||||
@ -691,12 +701,21 @@ class EventListener : public Customizable {
|
|||||||
Status /* bg_error */,
|
Status /* bg_error */,
|
||||||
bool* /* auto_recovery */) {}
|
bool* /* auto_recovery */) {}
|
||||||
|
|
||||||
|
// DEPRECATED
|
||||||
// A callback function for RocksDB which will be called once the database
|
// A callback function for RocksDB which will be called once the database
|
||||||
// is recovered from read-only mode after an error. When this is called, it
|
// is recovered from read-only mode after an error. When this is called, it
|
||||||
// means normal writes to the database can be issued and the user can
|
// means normal writes to the database can be issued and the user can
|
||||||
// initiate any further recovery actions needed
|
// initiate any further recovery actions needed
|
||||||
virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
|
virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
|
||||||
|
|
||||||
|
// A callback function for RocksDB which will be called once the recovery
|
||||||
|
// attempt from a background retryable error is completed. The recovery
|
||||||
|
// may have been successful or not. In either case, the callback is called
|
||||||
|
// with the old and new error. If info.new_bg_error is Status::OK(), that
|
||||||
|
// means the recovery succeeded.
|
||||||
|
virtual void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& /*info*/) {
|
||||||
|
}
|
||||||
|
|
||||||
// A callback function for RocksDB which will be called before
|
// A callback function for RocksDB which will be called before
|
||||||
// a blob file is being created. It will follow by OnBlobFileCreated after
|
// a blob file is being created. It will follow by OnBlobFileCreated after
|
||||||
// the creation finishes.
|
// the creation finishes.
|
||||||
|
Loading…
Reference in New Issue
Block a user