From 25bddfa63227aa24d0ce6c80347c7a984920ae81 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 20 Aug 2020 14:02:35 -0700 Subject: [PATCH] Work around a backup bug with DB custom checksums Summary: On a read-write DB configured with DBOptions::file_checksum_gen_factory, BackupEngine::CreateNewBackup can fail intermittently, with non-OK status. This is due to a race between GetLiveFiles and GetLiveFilesChecksumInfo in creating backups. For patching 6.12 release (as this commit is intended for), we can simply treat files for which we falsely failed to get checksum info as legacy files lacking checksum info. Test Plan: unit test reproducer included --- HISTORY.md | 2 ++ utilities/backupable/backupable_db_test.cc | 32 ++++++++++++++++++++++ utilities/checkpoint/checkpoint_impl.cc | 11 +++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 2030c4cf6..7764331d2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,7 @@ # Rocksdb Change Log ## Unreleased +### Bug fixes +* BackupEngine::CreateNewBackup could fail intermittently with non-OK status when backing up a read-write DB configured with a DBOptions::file_checksum_gen_factory. This issue has been worked-around such that CreateNewBackup should succeed, but (until fully fixes) BackupEngine might not see all checksums available in the DB. ## 6.12 (2020-07-28) ### Public API Change diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 6a706ff76..c589705ec 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -1357,6 +1357,38 @@ TEST_F(BackupableDBTest, InterruptCreationTest) { AssertBackupConsistency(0, 0, keys_iteration); } +TEST_F(BackupableDBTest, FlushCompactDuringBackupCheckpoint) { + const int keys_iteration = 5000; + options_.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + for (const auto& sopt : kAllShareOptions) { + OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, sopt); + FillDB(db_.get(), 0, keys_iteration); + // That FillDB leaves a mix of flushed and unflushed data + SyncPoint::GetInstance()->LoadDependency( + {{"CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1", + "BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush"}, + {"BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush", + "CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ROCKSDB_NAMESPACE::port::Thread flush_thread{[this]() { + TEST_SYNC_POINT( + "BackupableDBTest::FlushDuringBackupCheckpoint:BeforeFlush"); + FillDB(db_.get(), keys_iteration, 2 * keys_iteration); + ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast(db_.get()); + dbi->TEST_WaitForFlushMemTable(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + dbi->TEST_WaitForCompact(); + TEST_SYNC_POINT( + "BackupableDBTest::FlushDuringBackupCheckpoint:AfterFlush"); + }}; + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get())); + flush_thread.join(); + CloseDBAndBackupEngine(); + AssertBackupConsistency(0, 0, keys_iteration); + } +} + inline std::string OptionsPath(std::string ret, int backupID) { ret += "/private/"; ret += std::to_string(backupID); diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 4fc6ede00..81ee48fdd 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -250,6 +250,8 @@ Status CheckpointImpl::CreateCustomCheckpoint( TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); db_->FlushWAL(false /* sync */); } + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2"); // if we have more than one column family, we need to also get WAL files if (s.ok()) { s = db_->GetSortedWalFiles(live_wal_files); @@ -314,8 +316,15 @@ Status CheckpointImpl::CreateCustomCheckpoint( // find checksum info for table files s = checksum_list->SearchOneFileChecksum(number, &checksum_value, &checksum_name); + + // XXX/FIXME(peterd): There's currently a race between GetLiveFiles + // and GetLiveFilesChecksumInfo that could lead to not finding + // checksum info on a file that has it. For now, we can accept + // that and treat it like a legacy file lacking checksum info. if (!s.ok()) { - return Status::NotFound("Can't find checksum for " + src_fname); + assert(checksum_name == kUnknownFileChecksumFuncName); + assert(checksum_value == kUnknownFileChecksum); + s = Status::OK(); } } s = copy_file_cb(db_->GetName(), src_fname,