Work around a backup bug with DB custom checksums

Summary: On a read-write DB configured with
DBOptions::file_checksum_gen_factory, BackupEngine::CreateNewBackup can
fail intermittently, with non-OK status. This is due to a race between
GetLiveFiles and GetLiveFilesChecksumInfo in creating backups.

For patching 6.12 release (as this commit is intended for), we can simply
treat files for which we falsely failed to get checksum info as legacy
files lacking checksum info.

Test Plan: unit test reproducer included
This commit is contained in:
Peter Dillinger 2020-08-20 14:02:35 -07:00
parent a38f04ac26
commit 25bddfa632
3 changed files with 44 additions and 1 deletions

View File

@ -1,5 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Bug fixes
* BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. This issue has been worked-around such that CreateNewBackup should succeed, but (until fully fixes) BackupEngine might not see all checksums available in the DB.
## 6.12 (2020-07-28)
### Public API Change

View File

@ -1357,6 +1357,38 @@ TEST_F(BackupableDBTest, InterruptCreationTest) {
AssertBackupConsistency(0, 0, keys_iteration);
}
TEST_F(BackupableDBTest, FlushCompactDuringBackupCheckpoint) {
const int keys_iteration = 5000;
options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
for (const auto& sopt : kAllShareOptions) {
OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt);
FillDB(db_.get(), 0, keys_iteration);
// That FillDB leaves a mix of flushed and unflushed data
SyncPoint::GetInstance()->LoadDependency(
{{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1",
"BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush"},
{"BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush",
"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}});
SyncPoint::GetInstance()->EnableProcessing();
ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() {
TEST_SYNC_POINT(
"BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush");
FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
ASSERT_OK(db_->Flush(FlushOptions()));
DBImpl* dbi = static_cast<DBImpl*>(db_.get());
dbi->TEST_WaitForFlushMemTable();
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
dbi->TEST_WaitForCompact();
TEST_SYNC_POINT(
"BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush");
}};
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
flush_thread.join();
CloseDBAndBackupEngine();
AssertBackupConsistency(0, 0, keys_iteration);
}
}
inline std::string OptionsPath(std::string ret, int backupID) {
ret += "/private/";
ret += std::to_string(backupID);

View File

@ -250,6 +250,8 @@ Status CheckpointImpl::CreateCustomCheckpoint(
TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
db_->FlushWAL(false /* sync */);
}
TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
// if we have more than one column family, we need to also get WAL files
if (s.ok()) {
s = db_->GetSortedWalFiles(live_wal_files);
@ -314,8 +316,15 @@ Status CheckpointImpl::CreateCustomCheckpoint(
// find checksum info for table files
s = checksum_list->SearchOneFileChecksum(number, &checksum_value,
&checksum_name);
// XXX/FIXME(peterd): There's currently a race between GetLiveFiles
// and GetLiveFilesChecksumInfo that could lead to not finding
// checksum info on a file that has it. For now, we can accept
// that and treat it like a legacy file lacking checksum info.
if (!s.ok()) {
return Status::NotFound("Can't find checksum for " + src_fname);
assert(checksum_name == kUnknownFileChecksumFuncName);
assert(checksum_value == kUnknownFileChecksum);
s = Status::OK();
}
}
s = copy_file_cb(db_->GetName(), src_fname,