From 5e794b0841f72b5a192610eb1fca8ef3086540d2 Mon Sep 17 00:00:00 2001 From: Cheng Chang Date: Sat, 7 Nov 2020 21:54:55 -0800 Subject: [PATCH] Fix a recovery corner case (#7621) Summary: Consider the following sequence of events: 1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST. 2. Syncing MANIFEST failed and db crashed. 3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file. 4. Db crashed again. 5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed. It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5. After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621 Test Plan: 1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery. 2. a new unit test is added in db_basic_test to simulate step 3. Reviewed By: riversand963 Differential Revision: D24668144 Pulled By: cheng-chang fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b --- db/cuckoo_table_db_test.cc | 11 ++++- db/db_basic_test.cc | 37 +++++++++++++++++ db/db_impl/db_impl.h | 14 +++++-- db/db_impl/db_impl_files.cc | 40 +++++++++++++++++-- db/db_impl/db_impl_open.cc | 34 ++-------------- db/db_range_del_test.cc | 10 ++--- db_stress_tool/db_stress_test_base.cc | 1 - env/mock_env.cc | 2 +- .../test/java/org/rocksdb/RocksDBTest.java | 6 +-- utilities/backupable/backupable_db_test.cc | 8 ++-- 10 files changed, 111 insertions(+), 52 deletions(-) diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2aaf2c50d..7bb9478ac 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -63,6 +63,15 @@ class CuckooTableDBTest : public testing::Test { ASSERT_OK(DB::Open(opts, dbname_, &db_)); } + void DestroyAndReopen(Options* options) { + assert(options); + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + Reopen(options); + } + Status Put(const Slice& k, const Slice& v) { return db_->Put(WriteOptions(), k, v); } @@ -205,7 +214,7 @@ static std::string Uint64Key(uint64_t i) { TEST_F(CuckooTableDBTest, Uint64Comparator) { Options options = CurrentOptions(); options.comparator = test::Uint64Comparator(); - Reopen(&options); + DestroyAndReopen(&options); ASSERT_OK(Put(Uint64Key(1), "v1")); ASSERT_OK(Put(Uint64Key(2), "v2")); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 377952b2c..31200ee8a 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -2279,6 +2279,43 @@ class TableFileListener : public EventListener { }; } // namespace +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. + constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + TEST_F(DBBasicTest, RecoverWithMissingFiles) { Options options = CurrentOptions(); DestroyAndReopen(options); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 1ffff8fa6..2f48c3c26 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1215,14 +1215,22 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } + // Set DB identity file, and write DB ID to manifest if necessary. + Status SetDBId(); + // REQUIRES: db mutex held when calling this function, but the db mutex can // be released and re-acquired. Db mutex will be held when the function // returns. - // After best-efforts recovery, there may be SST files in db/cf paths that are - // not referenced in the MANIFEST. We delete these SST files. In the + // After recovery, there may be SST files in db/cf paths that are + // not referenced in the MANIFEST (e.g. + // 1. It's best effort recovery; + // 2. The VersionEdits referencing the SST files are appended to + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are + // still not synced to MANIFEST during recovery.) + // We delete these SST files. In the // meantime, we find out the largest file number present in the paths, and // bump up the version set's next_file_number_ to be 1 + largest_file_number. - Status FinishBestEffortsRecovery(); + Status DeleteUnreferencedSstFiles(); // SetDbSessionId() should be called in the constuctor DBImpl() // to ensure that db_session_id_ gets updated every time the DB is opened diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index d825c406c..f67e8f748 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -751,7 +751,43 @@ uint64_t PrecomputeMinLogNumberToKeep2PC( return min_log_number_to_keep; } -Status DBImpl::FinishBestEffortsRecovery() { +Status DBImpl::SetDBId() { + Status s; + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. + if (s.IsNotFound()) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); + } + } else { + s = SetIdentityFile(env_, dbname_, db_id_); + } + return s; +} + +Status DBImpl::DeleteUnreferencedSstFiles() { mutex_.AssertHeld(); std::vector paths; paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); @@ -807,8 +843,6 @@ Status DBImpl::FinishBestEffortsRecovery() { assert(versions_->GetColumnFamilySet()); ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); assert(default_cfd); - // Even if new_descriptor_log is false, we will still switch to a new - // MANIFEST and update CURRENT file, since this is in recovery. s = versions_->LogAndApply( default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir(), /*new_descriptor_log*/ false); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 90759e9c5..247985d00 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -479,42 +479,14 @@ Status DBImpl::Recover( // TryRecover may delete previous column_family_set_. column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - s = FinishBestEffortsRecovery(); } } if (!s.ok()) { return s; } - // Happens when immutable_db_options_.write_dbid_to_manifest is set to true - // the very first time. - if (db_id_.empty()) { - // Check for the IDENTITY file and create it if not there. - s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - // Typically Identity file is created in NewDB() and for some reason if - // it is no longer available then at this point DB ID is not in Identity - // file or Manifest. - if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } - s = GetDbIdentityFromIdentityFile(&db_id_); - if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - VersionEdit edit; - edit.SetDBId(db_id_); - Options options; - MutableCFOptions mutable_cf_options(options); - versions_->db_id_ = db_id_; - s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &edit, &mutex_, nullptr, - false); - } - } else { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetDBId(); + if (s.ok() && !read_only) { + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 403b221e6..3c36ecd1a 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -269,7 +269,7 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - Reopen(opts); + DestroyAndReopen(opts); // Write a third before snapshot, a third between snapshot and tombstone, and // a third after the tombstone. Keys older than snapshot or newer than the @@ -309,7 +309,7 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); opts.num_levels = 2; opts.statistics = CreateDBStatistics(); - Reopen(opts); + DestroyAndReopen(opts); for (int i = 0; i < kNumFiles; ++i) { if (i > 0) { @@ -603,7 +603,7 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(opts); + DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction // to bottommost level (i.e., L1). @@ -761,7 +761,7 @@ TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) @@ -794,7 +794,7 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); const Snapshot* snapshot = nullptr; // Put a snapshot before the range tombstone, verify an iterator using that diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index da1098e27..fe90a0796 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2066,7 +2066,6 @@ void StressTest::Open() { FLAGS_level_compaction_dynamic_level_bytes; options_.file_checksum_gen_factory = GetFileChecksumImpl(FLAGS_file_checksum_impl); - options_.track_and_verify_wals_in_manifest = true; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file not supported in lite mode\n"); diff --git a/env/mock_env.cc b/env/mock_env.cc index d24557e5d..44c8a01b9 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -601,7 +601,7 @@ class MockFileSystem : public FileSystem { std::string NormalizeMockPath(const std::string& path) { std::string p = NormalizePath(path); - if (p.back() == '/' && p.size() > 1) { + if (p.back() == kFilePathSeparator && p.size() > 1) { p.pop_back(); } return p; diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index fc62dc80e..46f6414c8 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1456,11 +1456,11 @@ public class RocksDBTest { try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.manifestFileSize).isEqualTo(57); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); - assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000003"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000006"); } } } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 0cb9ee3da..8d8ca95f8 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -2500,19 +2500,19 @@ TEST_F(BackupableDBTest, GarbageCollectionBeforeBackup) { OpenDBAndBackupEngine(true); backup_chroot_env_->CreateDirIfMissing(backupdir_ + "/shared"); - std::string file_five = backupdir_ + "/shared/000007.sst"; + std::string file_five = backupdir_ + "/shared/000008.sst"; std::string file_five_contents = "I'm not really a sst file"; - // this depends on the fact that 00007.sst is the first file created by the DB + // this depends on the fact that 00008.sst is the first file created by the DB ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents)); FillDB(db_.get(), 0, 100); - // backup overwrites file 000007.sst + // backup overwrites file 000008.sst ASSERT_TRUE(backup_engine_->CreateNewBackup(db_.get(), true).ok()); std::string new_file_five_contents; ASSERT_OK(ReadFileToString(backup_chroot_env_.get(), file_five, &new_file_five_contents)); - // file 000007.sst was overwritten + // file 000008.sst was overwritten ASSERT_TRUE(new_file_five_contents != file_five_contents); CloseDBAndBackupEngine();