diff --git a/HISTORY.md b/HISTORY.md index 76b52730e..012c1cb39 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,10 @@ * Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened. * Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. * Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. +* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. + +### Behavior Changes +* Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. ## 6.20.0 (04/16/2021) ### Behavior Changes diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index a4c965766..28ad6fd01 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -2582,6 +2582,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); { InstrumentedMutexLock l(&mutex_); assert(bg_flush_scheduled_); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index d9683a802..684d70d28 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1137,11 +1137,29 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords)) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() > corrupted_wal_number) { + // One special case cause cfd->GetLogNumber() > corrupted_wal_number but + // the CF is still consistent: If a new column family is created during + // the flush and the WAL sync fails at the same time, the new CF points to + // the new WAL but the old WAL is curropted. Since the new CF is empty, it + // is still consistent. We add the check of CF sst file size to avoid the + // false positive alert. + + // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to + // the ignorance of a very rare inconsistency case caused in data + // canclation. One CF is empty due to KV deletion. But those operations + // are in the WAL. If the WAL is corrupted, the status of this CF might + // not be consistent with others. However, the consistency check will be + // bypassed due to empty CF. + // TODO: a better and complete implementation is needed to ensure strict + // consistency check in WAL recovery including hanlding the tailing + // issues. + if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - return Status::Corruption("SST file is ahead of WALs"); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); } } } diff --git a/db/db_test2.cc b/db/db_test2.cc index a7952cce1..42ec2d103 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5558,6 +5558,35 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { Status s = TryReopen(options); ASSERT_TRUE(s.IsIOError()); } + +TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:Start:1", + "PointInTimeRecoveryWithSyncFailureInCFCreation:1"}, + {"PointInTimeRecoveryWithSyncFailureInCFCreation:2", + "DBImpl::BackgroundCallFlush:Start:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1"}, Options()); + ASSERT_OK(Put("foo", "bar")); + + // Creating a CF when a flush is going on, log is synced but the + // closed log file is not synced and corrupted. + port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); + CreateColumnFamilies({"test2"}, Options()); + env_->corrupt_in_sync_ = true; + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); + flush_thread.join(); + env_->corrupt_in_sync_ = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Reopening the DB should not corrupt anything + Options options = CurrentOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ReopenWithColumnFamilies({"default", "test1", "test2"}, options); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/db_test_util.h b/db/db_test_util.h index 8dc0e3a33..eb5853b00 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -393,6 +393,10 @@ class SpecialEnv : public EnvWrapper { Status Flush() override { return base_->Flush(); } Status Sync() override { ++env_->sync_counter_; + if (env_->corrupt_in_sync_) { + Append(std::string(33000, ' ')); + return Status::IOError("Ingested Sync Failure"); + } if (env_->skip_fsync_) { return Status::OK(); } else { @@ -717,6 +721,9 @@ class SpecialEnv : public EnvWrapper { // If true, all fsync to files and directories are skipped. bool skip_fsync_ = false; + // If true, ingest the corruption to file during sync. + bool corrupt_in_sync_ = false; + std::atomic non_writeable_rate_; std::atomic new_writable_count_;