From 76e951dbb1b4e06c5649618a61865f3141e66054 Mon Sep 17 00:00:00 2001 From: sdong Date: Thu, 26 Sep 2019 16:16:28 -0700 Subject: [PATCH] Add a unit test to reproduce a corruption bug (#5851) Summary: This is a bug that occasionally shows up in the crash test, and this unit test is to reproduce it. The bug is as follows: 1. Database has multiple CFs. 2. Between one DB restart, the last log file is corrupted in the middle (not the tail) 3. During restart, DB crashes between flushes between two CFs. The DB will fail to be opened again with error "SST file is ahead of WALs" Pull Request resolved: https://github.com/facebook/rocksdb/pull/5851 Test Plan: Run the test itself. Differential Revision: D17614721 fbshipit-source-id: 1b0abce49b203a76a039e38e76bc940429975f20 --- db/db_impl/db_impl_open.cc | 2 ++ db/db_test2.cc | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 076985de1..92281df50 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1053,6 +1053,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, versions_->MarkFileNumberUsed(max_log_number + 1); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); + TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:AfterLogAndApply", + nullptr); if (!status.ok()) { // Recovery failed break; diff --git a/db/db_test2.cc b/db/db_test2.cc index 82df2a908..c50d55cb9 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -16,6 +16,7 @@ #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/wal_filter.h" +#include "test_util/fault_injection_test_env.h" namespace rocksdb { @@ -4070,6 +4071,65 @@ TEST_F(DBTest2, RowCacheSnapshot) { db_->ReleaseSnapshot(s3); } #endif // ROCKSDB_LITE + +// Disabled because the test is currently failing. 
+// When DB is reopened with multiple column families, the manifest file +// is written after the first CF is flushed, and it is written again +// after each flush. If DB crashes between the flushes, the CF that was +// already flushed will pass the latest log file, and now we require it not +// to be corrupted, which triggers a corruption report. +// We need to fix the bug and enable the test. +TEST_F(DBTest2, DISABLED_CrashInRecoveryMultipleCF) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Flush(1)); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put(1, "foo", "bar")); + // The value is large enough to be divided into two blocks. + std::string large_value(400, ' '); + ASSERT_OK(Put("foo1", large_value)); + ASSERT_OK(Put("foo2", large_value)); + Close(); + + // Corrupt the log file in the middle, so that it is not corrupted + // in the tail. + std::vector<std::string> filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (const auto& f : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) { + std::string fname = dbname_ + "/" + f; + std::string file_content; + ASSERT_OK(ReadFileToString(env_, fname, &file_content)); + file_content[400] = 'h'; + file_content[401] = 'a'; + ASSERT_OK(WriteStringToFile(env_, file_content, fname)); + break; + } + } + + // Reopen and freeze the file system after the first manifest write. 
+ FaultInjectionTestEnv fit_env(options.env); + options.env = &fit_env; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RecoverLogFiles:AfterLogAndApply", + [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + fit_env.SetFilesystemActive(true); + // If we continue using the fault injection Env, it will complain about + // something when renaming current file, which is not expected. Need to investigate why. + options.env = env_; + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} } // namespace rocksdb #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS