Prevent manual flush hanging in read-only mode (#4615)

Summary: The logic to wait for stall conditions to clear before beginning a manual flush didn't take into account whether the DB was in read-only mode. In read-only mode the stall conditions would never clear since no background work is happening, so the wait would be never-ending. It's probably better to return an error to the user. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4615 Differential Revision: D12888008 Pulled By: ajkr fbshipit-source-id: 1c474b42a7ac38d9fd0d0e2340ff1d53e684d83c
2018-11-01 15:23:20 -07:00 · 2018-11-01 15:23:20 -07:00 · 5c794d94c4
commit 5c794d94c4
parent b8f68bac38
2 changed files with 46 additions and 0 deletions
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@ -220,6 +220,44 @@ TEST_F(DBFlushTest, FlushError) {
  ASSERT_NE(s, Status::OK());
 }

+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+  // Regression test for bug where manual flush hangs forever when the DB
+  // is in read-only mode. Verify it now at least returns, despite failing.
+  Options options;
+  std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+      new FaultInjectionTestEnv(env_));
+  options.env = fault_injection_env.get();
+  options.max_write_buffer_number = 2;
+  Reopen(options);
+
+  // Trigger a first flush but don't let it run
+  ASSERT_OK(db_->PauseBackgroundWork());
+  ASSERT_OK(Put("key1", "value1"));
+  FlushOptions flush_opts;
+  flush_opts.wait = false;
+  ASSERT_OK(db_->Flush(flush_opts));
+
+  // Write a key to the second memtable so we have something to flush later
+  // after the DB is in read-only mode.
+  ASSERT_OK(Put("key2", "value2"));
+
+  // Let the first flush continue, hit an error, and put the DB in read-only
+  // mode.
+  fault_injection_env->SetFilesystemActive(false);
+  ASSERT_OK(db_->ContinueBackgroundWork());
+  dbfull()->TEST_WaitForFlushMemTable();
+  uint64_t num_bg_errors;
+  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors,
+                                  &num_bg_errors));
+  ASSERT_GT(num_bg_errors, 0);
+
+  // In the bug scenario, triggering another flush would cause the second flush
+  // to hang forever. After the fix we expect it to return an error.
+  ASSERT_NOK(db_->Flush(FlushOptions()));
+
+  Close();
+}
+
 TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
--- a/db/db_impl_compaction_flush.cc
+++ b/db/db_impl_compaction_flush.cc
@ -1543,6 +1543,14 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
    WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
    do {
      if (write_stall_condition != WriteStallCondition::kNormal) {
+        // Same error handling as user writes: Don't wait if there's a
+        // background error, even if it's a soft error. We might wait here
+        // indefinitely as the pending flushes/compactions may never finish
+        // successfully, resulting in the stall condition lasting indefinitely
+        if (error_handler_.IsBGWorkStopped()) {
+          return error_handler_.GetBGError();
+        }
+
        TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
        ROCKS_LOG_INFO(immutable_db_options_.info_log,
                       "[%s] WaitUntilFlushWouldNotStallWrites"