Add test kPointInTimeRecoveryCFConsistency

Summary:
Context/problem:

- CFs may be flushed at different times
- A WAL can only be deleted after all CFs have flushed beyond end of that WAL.
- Point-in-time recovery might stop upon reaching the first corruption.
- Some CFs may have already flushed beyond that point, while others haven't. We should fail the Open() instead of proceeding with inconsistent CFs.
Closes https://github.com/facebook/rocksdb/pull/2900

Differential Revision: D5863281

Pulled By: miasantreble

fbshipit-source-id: 180dbaf83d96c804cff49b3c406312a4ae61313e
This commit is contained in:
Zhongyi Xie 2017-09-22 17:24:14 -07:00 committed by Facebook Github Bot
parent be97dbb15c
commit 1d6700f9e6
3 changed files with 55 additions and 0 deletions

View File

@ -5,6 +5,7 @@
### New Features
### Bug Fixes
* Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery.
## 5.8.0 (08/30/2017)
### Public API Change

View File

@ -493,6 +493,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
bool stop_replay_by_wal_filter = false;
bool stop_replay_for_corruption = false;
bool flushed = false;
uint64_t corrupted_log_number = kMaxSequenceNumber;
for (auto log_number : log_numbers) {
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
@ -720,6 +721,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
// We should ignore the error but not continue replaying
status = Status::OK();
stop_replay_for_corruption = true;
corrupted_log_number = log_number;
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Point in time recovered to log #%" PRIu64
" seq #%" PRIu64,
@ -741,6 +743,25 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
versions_->SetLastSequence(last_sequence);
}
}
// Compare the corrupted log number to all columnfamily's current log number.
// Abort Open() if any column family's log number is greater than
// the corrupted log number, which means CF contains data beyond the point of
// corruption. This could during PIT recovery when the WAL is corrupted and
// some (but not all) CFs are flushed
if (stop_replay_for_corruption == true &&
(immutable_db_options_.wal_recovery_mode ==
WALRecoveryMode::kPointInTimeRecovery ||
immutable_db_options_.wal_recovery_mode ==
WALRecoveryMode::kTolerateCorruptedTailRecords)) {
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->GetLogNumber() > corrupted_log_number) {
ROCKS_LOG_ERROR(immutable_db_options_.info_log,
"Column family inconsistency: SST file contains data"
" beyond the point of corruption.");
return Status::Corruption("SST file is ahead of WALs");
}
}
}
// True if there's any data in the WALs; if not, we can skip re-processing
// them later

View File

@ -876,6 +876,39 @@ TEST_F(DBWALTest, kAbsoluteConsistency) {
}
}
// Test scope:
// We don't expect the data store to be opened if there is any inconsistency
// between WAL and SST files
TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
Options options = CurrentOptions();
options.avoid_flush_during_recovery = true;
// Create DB with multiple column families.
CreateAndReopenWithCF({"one", "two"}, options);
ASSERT_OK(Put(1, "key1", "val1"));
ASSERT_OK(Put(2, "key2", "val2"));
// Record the offset at this point
Env* env = options.env;
int wal_file_id = RecoveryTestHelper::kWALFileOffset + 1;
std::string fname = LogFileName(dbname_, wal_file_id);
uint64_t offset_to_corrupt;
ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
ASSERT_GT(offset_to_corrupt, 0);
ASSERT_OK(Put(1, "key3", "val3"));
// Corrupt WAL at location of key3
RecoveryTestHelper::InduceCorruption(
fname, static_cast<size_t>(offset_to_corrupt), static_cast<size_t>(4));
ASSERT_OK(Put(2, "key4", "val4"));
ASSERT_OK(Put(1, "key5", "val5"));
Flush(2);
// PIT recovery & verify
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
}
// Test scope:
// - We expect to open data store under all circumstances
// - We expect only data upto the point where the first error was encountered