diff --git a/db/corruption_test.cc b/db/corruption_test.cc index b694f384f..8a8a2770d 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -308,7 +308,7 @@ class CorruptionTest : public testing::Test { uint64_t new_size = old_size - bytes_to_truncate; // If bytes_to_truncate == 0, it will do full truncation. if (bytes_to_truncate == 0) { - new_size = old_size; + new_size = 0; } ASSERT_OK(test::TruncateFile(env_, path, new_size)); } @@ -1049,16 +1049,23 @@ INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest, // "Column family inconsistency: SST file contains data beyond the point of // corruption" error will be hit, causing recovery to fail. // -// After adding the fix, corrupted WALs whose numbers are larger than the -// corrupted wal and smaller than the new WAL are moved to a separate folder. -// Only after new WAL is synced, RocksDB persist a new MANIFEST with column -// families to ensure RocksDB is in consistent state. +// After adding the fix, only after new WAL is synced, RocksDB persist a new +// MANIFEST with column families to ensure RocksDB is in consistent state. // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is // synced immediately afterwards. The sequence number of the sentinel // WriteBatch will be the next sequence number immediately after the largest // sequence number recovered from previous WALs and MANIFEST because of which DB // will be in consistent state. -TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If future recovery starts from the old MANIFEST, it means the writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. +// +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, DISABLED_CrashDuringRecovery) { CloseDb(); Options options; options.track_and_verify_wals_in_manifest = @@ -1068,7 +1075,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { options.env = env_; ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; - options.max_write_buffer_number = 3; + options.max_write_buffer_number = 8; Reopen(&options); Status s; @@ -1094,13 +1101,15 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { // Write one key to test_cf. ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare")); + ASSERT_OK(db_->Flush(FlushOptions(), handles[1])); + // Write to default_cf and flush this cf several times to advance wal - // number. + // number. TEST_SwitchMemtable makes sure WALs are not synced and test can + // corrupt un-sync WAL. for (int i = 0; i < 2; ++i) { ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value")); ASSERT_OK(dbimpl->TEST_SwitchMemtable()); } - ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare")); for (auto* h : handles) { delete h; @@ -1109,12 +1118,13 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { CloseDb(); } - // 2. Corrupt second last wal file to emulate power reset which caused the DB - // to lose the un-synced WAL. + // 2. Corrupt second last un-syned wal file to emulate power reset which + // caused the DB to lose the un-synced WAL. { std::vector file_nums; GetSortedWalFiles(file_nums); size_t size = file_nums.size(); + assert(size >= 2); uint64_t log_num = file_nums[size - 2]; CorruptFileWithTruncation(FileType::kWalFile, log_num, /*bytes_to_truncate=*/8); @@ -1131,25 +1141,31 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { // Case2: If avoid_flush_during_recovery = false, all column families have // flushed their data from WAL to L0 during recovery, and none of them will // ever need to read the WALs again. + + // 4. Fault is injected to fail the recovery. { - options.avoid_flush_during_recovery = avoid_flush_during_recovery_; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) { + auto* tmp_s = reinterpret_cast(arg); + assert(tmp_s); + *tmp_s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + handles.clear(); + options.avoid_flush_during_recovery = true; s = DB::Open(options, dbname_, cf_descs, &handles, &db_); - ASSERT_OK(s); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ("IO error: Injected", s.ToString()); for (auto* h : handles) { delete h; } - handles.clear(); CloseDb(); - } - // 4. Corrupt max_wal_num to emulate second power reset which caused the - // DB to again lose the un-synced WAL. - { - std::vector file_nums; - GetSortedWalFiles(file_nums); - size_t size = file_nums.size(); - uint64_t log_num = file_nums[size - 1]; - CorruptFileWithTruncation(FileType::kWalFile, log_num); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } // 5. After second crash reopen the db with second corruption. Default family @@ -1187,16 +1203,24 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { // file contains data beyond the point of corruption" error will be hit, causing // recovery to fail. // -// After adding the fix, corrupted WALs whose numbers are larger than the -// corrupted wal and smaller than the new WAL are moved to a separate folder. -// Only after new WAL is synced, RocksDB persist a new MANIFEST with column -// families to ensure RocksDB is in consistent state. +// After adding the fix, only after new WAL is synced, RocksDB persist a new +// MANIFEST with column families to ensure RocksDB is in consistent state. // RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is // synced immediately afterwards. The sequence number of the sentinel // WriteBatch will be the next sequence number immediately after the largest // sequence number recovered from previous WALs and MANIFEST because of which DB // will be in consistent state. -TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If future recovery starts from the old MANIFEST, it means the writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. +// +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, + DISABLED_TxnDbCrashDuringRecovery) { CloseDb(); Options options; options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; @@ -1229,7 +1253,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { // advance wal number so that some column families have advanced log_number // while other don't. { - options.avoid_flush_during_recovery = avoid_flush_during_recovery_; ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles, &txn_db)); @@ -1238,6 +1261,8 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { ASSERT_OK(txn->Put(handles[1], "foo", "value")); ASSERT_OK(txn->SetName("txn0")); ASSERT_OK(txn->Prepare()); + ASSERT_OK(txn_db->Flush(FlushOptions())); + delete txn; txn = nullptr; @@ -1257,6 +1282,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { delete txn; txn = nullptr; + for (auto* h : handles) { delete h; } @@ -1270,6 +1296,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { std::vector file_nums; GetSortedWalFiles(file_nums); size_t size = file_nums.size(); + assert(size >= 2); uint64_t log_num = file_nums[size - 2]; CorruptFileWithTruncation(FileType::kWalFile, log_num, /*bytes_to_truncate=*/8); @@ -1279,24 +1306,38 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { // family has higher log number than corrupted wal number. There may be old // WAL files that it must not delete because they can contain data of // uncommitted transactions. As a result, min_log_number_to_keep won't change. - { - options.avoid_flush_during_recovery = avoid_flush_during_recovery_; - ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, - &handles, &txn_db)); + { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Open::BeforeSyncWAL", [&](void* arg) { + auto* tmp_s = reinterpret_cast(arg); + assert(tmp_s); + *tmp_s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + handles.clear(); + s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles, + &txn_db); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ("IO error: Injected", s.ToString()); for (auto* h : handles) { delete h; } - handles.clear(); - delete txn_db; + CloseDb(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } - // 4. Corrupt max_wal_num to emulate second power reset which caused the - // DB to again lose the un-synced WAL. + // 4. Corrupt max_wal_num. { std::vector file_nums; GetSortedWalFiles(file_nums); size_t size = file_nums.size(); + assert(size >= 2); uint64_t log_num = file_nums[size - 1]; CorruptFileWithTruncation(FileType::kWalFile, log_num); } @@ -1311,8 +1352,6 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { // recovered from previous WALs and MANIFEST, db will be in consistent state // and opens successfully. { - options.avoid_flush_during_recovery = false; - ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles, &txn_db)); for (auto* h : handles) { @@ -1322,6 +1361,134 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { } } +// This test is similar to +// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except it calls +// flush and corrupts Last WAL. It calls flush to sync some of the WALs and +// remaining are unsyned one of which is then corrupted to simulate crash. +// +// In case of non-TransactionDB with avoid_flush_during_recovery = true, RocksDB +// won't flush the data from WAL to L0 for all column families if possible. As a +// result, not all column families can increase their log_numbers, and +// min_log_number_to_keep won't change. +// It may prematurely persist a new MANIFEST even before we can declare the DB +// is in consistent state after recovery (this is when the new WAL is synced) +// and advances log_numbers for some column families. +// +// If there is power failure before we sync the new WAL, we will end up in +// a situation in which after persisting the MANIFEST, RocksDB will see some +// column families' log_numbers larger than the corrupted wal, and +// "Column family inconsistency: SST file contains data beyond the point of +// corruption" error will be hit, causing recovery to fail. +// +// After adding the fix, only after new WAL is synced, RocksDB persist a new +// MANIFEST with column families to ensure RocksDB is in consistent state. +// RocksDB writes an empty WriteBatch as a sentinel to the new WAL which is +// synced immediately afterwards. The sequence number of the sentinel +// WriteBatch will be the next sequence number immediately after the largest +// sequence number recovered from previous WALs and MANIFEST because of which DB +// will be in consistent state. +// If a future recovery starts from the new MANIFEST, then it means the new WAL +// is successfully synced. Due to the sentinel empty write batch at the +// beginning, kPointInTimeRecovery of WAL is guaranteed to go after this point. +// If future recovery starts from the old MANIFEST, it means the writing the new +// MANIFEST failed. It won't have the "SST ahead of WAL" error. + +// The combination of corrupting a WAL and injecting an error during subsequent +// re-open exposes the bug of prematurely persisting a new MANIFEST with +// advanced ColumnFamilyData::log_number. +TEST_P(CrashDuringRecoveryWithCorruptionTest, + DISABLED_CrashDuringRecoveryWithFlush) { + CloseDb(); + Options options; + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + options.avoid_flush_during_recovery = false; + options.env = env_; + options.create_if_missing = true; + + ASSERT_OK(DestroyDB(dbname_, options)); + Reopen(&options); + + ColumnFamilyHandle* cfh = nullptr; + const std::string test_cf_name = "test_cf"; + Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh); + ASSERT_OK(s); + delete cfh; + + CloseDb(); + + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, options); + cf_descs.emplace_back(test_cf_name, options); + std::vector handles; + + { + ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); + + // Write one key to test_cf. + ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare")); + + // Write to default_cf and flush this cf several times to advance wal + // number. + for (int i = 0; i < 2; ++i) { + ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i), "value")); + ASSERT_OK(db_->Flush(FlushOptions())); + } + + ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare")); + for (auto* h : handles) { + delete h; + } + handles.clear(); + CloseDb(); + } + + // Corrupt second last un-syned wal file to emulate power reset which + // caused the DB to lose the un-synced WAL. + { + std::vector file_nums; + GetSortedWalFiles(file_nums); + size_t size = file_nums.size(); + uint64_t log_num = file_nums[size - 1]; + CorruptFileWithTruncation(FileType::kWalFile, log_num, + /*bytes_to_truncate=*/8); + } + + // Fault is injected to fail the recovery. + { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) { + auto* tmp_s = reinterpret_cast(arg); + assert(tmp_s); + *tmp_s = Status::IOError("Injected"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + handles.clear(); + options.avoid_flush_during_recovery = true; + s = DB::Open(options, dbname_, cf_descs, &handles, &db_); + ASSERT_TRUE(s.IsIOError()); + ASSERT_EQ("IO error: Injected", s.ToString()); + for (auto* h : handles) { + delete h; + } + CloseDb(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + + // Reopen db again + { + options.avoid_flush_during_recovery = avoid_flush_during_recovery_; + ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_)); + for (auto* h : handles) { + delete h; + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 3d770bc27..f1218d1ae 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1326,6 +1326,7 @@ Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, Status s; // This gets the appear size of the wals, not including preallocated space. s = env_->GetFileSize(fname, &log.size); + TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s); if (s.ok() && truncate) { std::unique_ptr last_log; Status truncate_status = fs_->ReopenWritableFile( @@ -1825,6 +1826,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { // Need to fsync, otherwise it might get lost after a power reset. s = impl->FlushWAL(false); + TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s); if (s.ok()) { s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync); }