From 09a9ec3ac04418ad3426ccc94d5cb9430a9d23f9 Mon Sep 17 00:00:00 2001
From: Zhichao Cao <zhichao@fb.com>
Date: Thu, 22 Apr 2021 10:27:56 -0700
Subject: [PATCH] Fix the false positive alert of CF consistency check in WAL
 recovery (#8207)

Summary:
In current RocksDB, in recover the information form WAL, we do the consistency check for each column family when one WAL file is corrupted and PointInTimeRecovery is set. However, it will report a false positive alert on "SST file is ahead of WALs" when one of the CF current log number is greater than the corrupted WAL number (CF contains the data beyond the corrupted WAl) due to a new column family creation during flush. In this case, a new WAL is created (it is empty) during a flush. Also, due to some reason (e.g., storage issue or crash happens before SyncCloseLog is called), the old WAL is corrupted. The new CF has no data, therefore, it does not have the consistency issue.

Fix: when checking cfd->GetLogNumber() > corrupted_wal_number also check cfd->GetLiveSstFilesSize() > 0. So the CFs with no SST file data will skip the check here.

Note potential ignored inconsistency caused due to fix: empty CF can also be caused by write+delete. In this case, after flush, there is no SST files being generated. However, this CF still have the log in the WAL. When the WAL is corrupted, the DB might be inconsistent.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8207

Test Plan: added unit test, make crash_test

Reviewed By: riversand963

Differential Revision: D27898839

Pulled By: zhichao-cao

fbshipit-source-id: 931fc2d8b92dd00b4169bf84b94e712fd688a83e
---
 HISTORY.md                             |  4 ++++
 db/db_impl/db_impl_compaction_flush.cc |  2 ++
 db/db_impl/db_impl_open.cc             | 22 +++++++++++++++++--
 db/db_test2.cc                         | 29 ++++++++++++++++++++++++++
 db/db_test_util.h                      |  7 +++++++
 5 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 2e48a5a40..537af4f6f 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,6 +4,10 @@
 * Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but client returns error. The bug can cause CURRENT file to point to non-existing MANIFEST file, thus DB cannot be opened.
 * Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
 * Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
+* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted.
+
+### Behavior Changes
+* Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status.
 
 ### New Features
 * Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size.
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index f35349a58..806bc548a 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2586,6 +2586,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
 
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+  TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
   {
     InstrumentedMutexLock l(&mutex_);
     assert(bg_flush_scheduled_);
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index d9683a802..684d70d28 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1137,11 +1137,29 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
        immutable_db_options_.wal_recovery_mode ==
            WALRecoveryMode::kTolerateCorruptedTailRecords)) {
     for (auto cfd : *versions_->GetColumnFamilySet()) {
-      if (cfd->GetLogNumber() > corrupted_wal_number) {
+      // One special case cause cfd->GetLogNumber() > corrupted_wal_number but
+      // the CF is still consistent: If a new column family is created during
+      // the flush and the WAL sync fails at the same time, the new CF points to
+      // the new WAL but the old WAL is curropted. Since the new CF is empty, it
+      // is still consistent. We add the check of CF sst file size to avoid the
+      // false positive alert.
+
+      // Note that, the check of (cfd->GetLiveSstFilesSize() > 0) may leads to
+      // the ignorance of a very rare inconsistency case caused in data
+      // canclation. One CF is empty due to KV deletion. But those operations
+      // are in the WAL. If the WAL is corrupted, the status of this CF might
+      // not be consistent with others. However, the consistency check will be
+      // bypassed due to empty CF.
+      // TODO: a better and complete implementation is needed to ensure strict
+      // consistency check in WAL recovery including hanlding the tailing
+      // issues.
+      if (cfd->GetLogNumber() > corrupted_wal_number &&
+          cfd->GetLiveSstFilesSize() > 0) {
         ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                         "Column family inconsistency: SST file contains data"
                         " beyond the point of corruption.");
-        return Status::Corruption("SST file is ahead of WALs");
+        return Status::Corruption("SST file is ahead of WALs in CF " +
+                                  cfd->GetName());
       }
     }
   }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index a7952cce1..42ec2d103 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -5558,6 +5558,35 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
   Status s = TryReopen(options);
   ASSERT_TRUE(s.IsIOError());
 }
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCallFlush:Start:1",
+        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+        "DBImpl::BackgroundCallFlush:Start:2"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  CreateColumnFamilies({"test1"}, Options());
+  ASSERT_OK(Put("foo", "bar"));
+
+  // Creating a CF when a flush is going on, log is synced but the
+  // closed log file is not synced and corrupted.
+  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+  CreateColumnFamilies({"test2"}, Options());
+  env_->corrupt_in_sync_ = true;
+  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+  flush_thread.join();
+  env_->corrupt_in_sync_ = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Reopening the DB should not corrupt anything
+  Options options = CurrentOptions();
+  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
diff --git a/db/db_test_util.h b/db/db_test_util.h
index cd34bd21e..0c294f786 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -399,6 +399,10 @@ class SpecialEnv : public EnvWrapper {
       Status Flush() override { return base_->Flush(); }
       Status Sync() override {
         ++env_->sync_counter_;
+        if (env_->corrupt_in_sync_) {
+          Append(std::string(33000, ' '));
+          return Status::IOError("Ingested Sync Failure");
+        }
         if (env_->skip_fsync_) {
           return Status::OK();
         } else {
@@ -723,6 +727,9 @@ class SpecialEnv : public EnvWrapper {
   // If true, all fsync to files and directories are skipped.
   bool skip_fsync_ = false;
 
+  // If true, ingest the corruption to file during sync.
+  bool corrupt_in_sync_ = false;
+
   std::atomic<uint32_t> non_writeable_rate_;
 
   std::atomic<uint32_t> new_writable_count_;