Fail point-in-time WAL recovery upon IOError reading WAL (#6963)

Summary:
If `options.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery`, RocksDB stops replaying WAL once hitting an error and discards the rest of the WAL. This can lead to data loss if the error occurs at an offset smaller than the last sync'ed offset.
Ideally, RocksDB point-in-time recovery should permit recovery if the error occurs after last synced offset while fail recovery if error occurs before the last synced offset. However, RocksDB does not track the synced offset of WALs. Consequently, RocksDB does not know whether an error occurs before or after the last synced offset. An error can be one of the following.
- WAL record checksum mismatch. This can result from both corruption of synced data and dropping of unsynced data during shutdown. We cannot be sure which one. In order not to defeat the original motivation to permit the latter case, we keep the original behavior of point-in-time WAL recovery.
- IOError. This means the WAL can be bad, an indicator of whole file becoming unavailable, not to mention synced part of the WAL. Therefore, we choose to modify the behavior of point-in-time recovery and fail the database recovery.

Test plan (devserver):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6963

Reviewed By: ajkr

Differential Revision: D22011083

Pulled By: riversand963

fbshipit-source-id: f9cbf29a37dc5cc40d3fa62f89eed1ad67ca1536
This commit is contained in:
Yanqin Jin 2020-06-11 18:39:21 -07:00 committed by Facebook GitHub Bot
parent d854abad78
commit 717749f4c0
4 changed files with 44 additions and 4 deletions

View File

@ -3,6 +3,7 @@
### Behavior Changes ### Behavior Changes
* Disable delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode and num_levels = 1 in order to avoid a corruption bug. * Disable delete triggered compaction (NewCompactOnDeletionCollectorFactory) in universal compaction mode and num_levels = 1 in order to avoid a corruption bug.
* `pin_l0_filter_and_index_blocks_in_cache` no longer applies to L0 files larger than `1.5 * write_buffer_size` to give more predictable memory usage. Such L0 files may exist due to intra-L0 compaction, external file ingestion, or user dynamically changing `write_buffer_size` (note, however, that files that are already pinned will continue being pinned, even after such a dynamic change). * `pin_l0_filter_and_index_blocks_in_cache` no longer applies to L0 files larger than `1.5 * write_buffer_size` to give more predictable memory usage. Such L0 files may exist due to intra-L0 compaction, external file ingestion, or user dynamically changing `write_buffer_size` (note, however, that files that are already pinned will continue being pinned, even after such a dynamic change).
* In point-in-time wal recovery mode, fail database recovery in case of IOError while reading the WAL to avoid data loss.
### Bug Fixes ### Bug Fixes
* Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. * Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true.

View File

@ -704,10 +704,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
Status* status; // nullptr if immutable_db_options_.paranoid_checks==false Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
void Corruption(size_t bytes, const Status& s) override { void Corruption(size_t bytes, const Status& s) override {
ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
(this->status == nullptr ? "(ignoring error) " : ""), (status == nullptr ? "(ignoring error) " : ""), fname,
fname, static_cast<int>(bytes), s.ToString().c_str()); static_cast<int>(bytes), s.ToString().c_str());
if (this->status != nullptr && this->status->ok()) { if (status != nullptr && status->ok()) {
*this->status = s; *status = s;
} }
} }
}; };
@ -830,6 +830,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
/*arg=*/nullptr);
while (!stop_replay_by_wal_filter && while (!stop_replay_by_wal_filter &&
reader.ReadRecord(&record, &scratch, reader.ReadRecord(&record, &scratch,
immutable_db_options_.wal_recovery_mode) && immutable_db_options_.wal_recovery_mode) &&
@ -994,6 +996,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
status = Status::OK(); status = Status::OK();
} else if (immutable_db_options_.wal_recovery_mode == } else if (immutable_db_options_.wal_recovery_mode ==
WALRecoveryMode::kPointInTimeRecovery) { WALRecoveryMode::kPointInTimeRecovery) {
if (status.IsIOError()) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"IOError during point-in-time reading log #%" PRIu64
" seq #%" PRIu64
". %s. This likely mean loss of synced WAL, "
"thus recovery fails.",
log_number, *next_sequence,
status.ToString().c_str());
return status;
}
// We should ignore the error but not continue replaying // We should ignore the error but not continue replaying
status = Status::OK(); status = Status::OK();
stop_replay_for_corruption = true; stop_replay_for_corruption = true;

View File

@ -4926,6 +4926,32 @@ TEST_F(DBTest2, AutoPrefixMode1) {
} }
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
Options options = CurrentOptions();
DestroyAndReopen(options);
ASSERT_OK(Put("foo", "value0"));
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
bool should_inject_error = false;
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::RecoverLogFiles:BeforeReadWal",
[&](void* /*arg*/) { should_inject_error = true; });
SyncPoint::GetInstance()->SetCallBack(
"LogReader::ReadMore:AfterReadFile", [&](void* arg) {
if (should_inject_error) {
ASSERT_NE(nullptr, arg);
*reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
}
});
SyncPoint::GetInstance()->EnableProcessing();
options.avoid_flush_during_recovery = true;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
Status s = TryReopen(options);
ASSERT_TRUE(s.IsIOError());
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

View File

@ -281,6 +281,7 @@ bool Reader::ReadMore(size_t* drop_size, int *error) {
// Last read was a full read, so this is a trailer to skip // Last read was a full read, so this is a trailer to skip
buffer_.clear(); buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_); Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
end_of_buffer_offset_ += buffer_.size(); end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) { if (!status.ok()) {
buffer_.clear(); buffer_.clear();