diff --git a/HISTORY.md b/HISTORY.md index 340590d49..204ab25a9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## 6.11.3 (7/9/2020) ### Bug Fixes * Fix a bug when index_type == kTwoLevelIndexSearch in PartitionedIndexBuilder to update FlushPolicy to point to internal key partitioner when it changes from user-key mode to internal-key mode in index partition. +* Disable file deletion after MANIFEST write/sync failure until db re-open or Resume() so that subsequent re-open will not see MANIFEST referencing deleted SSTs. ## 6.11.1 (6/23/2020) ### Bug Fixes diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index aabff4fd6..46e685abd 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -721,7 +721,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); } diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 02118ef3a..91ef626a4 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -3023,6 +3023,33 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { Close(); } +TEST_F(DBBasicTest, ManifestWriteFailure) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.env = env_; + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto* s = reinterpret_cast(arg); + ASSERT_OK(*s); + // Manually overwrite return status + *s = Status::IOError(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("key", "value")); + ASSERT_NOK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 8af226c31..e9a094a28 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -23,57 +23,6 @@ namespace ROCKSDB_NAMESPACE { -Status DBImpl::DisableFileDeletions() { - InstrumentedMutexLock l(&mutex_); - ++disable_delete_obsolete_files_; - if (disable_delete_obsolete_files_ == 1) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Disabled, but already disabled. Counter: %d", - disable_delete_obsolete_files_); - } - return Status::OK(); -} - -Status DBImpl::EnableFileDeletions(bool force) { - // Job id == 0 means that this is not our background process, but rather - // user thread - JobContext job_context(0); - bool file_deletion_enabled = false; - { - InstrumentedMutexLock l(&mutex_); - if (force) { - // if force, we need to enable file deletions right away - disable_delete_obsolete_files_ = 0; - } else if (disable_delete_obsolete_files_ > 0) { - --disable_delete_obsolete_files_; - } - if (disable_delete_obsolete_files_ == 0) { - file_deletion_enabled = true; - FindObsoleteFiles(&job_context, true); - bg_cv_.SignalAll(); - } - } - if (file_deletion_enabled) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); - } - } else { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "File Deletions Enable, but not really enabled. Counter: %d", - disable_delete_obsolete_files_); - } - job_context.Clean(); - LogFlush(immutable_db_options_.info_log); - return Status::OK(); -} - -int DBImpl::IsFileDeletionsEnabled() const { - return !disable_delete_obsolete_files_; -} - Status DBImpl::GetLiveFiles(std::vector& ret, uint64_t* manifest_file_size, bool flush_memtable) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 3d0ee681c..26ee78681 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -312,8 +312,36 @@ Status DBImpl::ResumeImpl() { } // Make sure the IO Status stored in version set is set to OK. + bool file_deletion_disabled = !IsFileDeletionsEnabled(); if (s.ok()) { - versions_->SetIOStatusOK(); + IOStatus io_s = versions_->io_status(); + if (io_s.IsIOError()) { + // If resuming from IOError resulted from MANIFEST write, then assert + // that we must have already set the MANIFEST writer to nullptr during + // clean-up phase MANIFEST writing. We must have also disabled file + // deletions. + assert(!versions_->descriptor_log_); + assert(file_deletion_disabled); + // Since we are trying to recover from MANIFEST write error, we need to + // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. + // Therefore, force writing a dummy version edit because we do not know + // whether there are flush jobs with non-empty data to flush, triggering + // appends to MANIFEST. + VersionEdit edit; + auto cfh = reinterpret_cast(default_cf_handle_); + assert(cfh); + ColumnFamilyData* cfd = cfh->cfd(); + const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); + s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + directories_.GetDbDir()); + if (!s.ok()) { + io_s = versions_->io_status(); + if (!io_s.ok()) { + s = error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWrite); + } + } + } } // We cannot guarantee consistency of the WAL. So force flush Memtables of @@ -364,6 +392,13 @@ Status DBImpl::ResumeImpl() { job_context.Clean(); if (s.ok()) { + assert(versions_->io_status().ok()); + // If we reach here, we should re-enable file deletions if it was disabled + // during previous error handling. + if (file_deletion_disabled) { + // Always return ok + EnableFileDeletions(/*force=*/true); + } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); } mutex_.Lock(); @@ -4389,6 +4424,14 @@ Status DBImpl::IngestExternalFiles( #endif // !NDEBUG } } + } else if (versions_->io_status().IsIOError()) { + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + const IOStatus& io_s = versions_->io_status(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 22cf37bf6..c8cdf570f 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -356,6 +356,12 @@ class DBImpl : public DB { virtual Status Close() override; + virtual Status DisableFileDeletions() override; + + virtual Status EnableFileDeletions(bool force) override; + + virtual bool IsFileDeletionsEnabled() const; + Status GetStatsHistory( uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; @@ -363,9 +369,6 @@ class DBImpl : public DB { #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; - virtual Status DisableFileDeletions() override; - virtual Status EnableFileDeletions(bool force) override; - virtual int IsFileDeletionsEnabled() const; // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -1780,6 +1783,8 @@ class DBImpl : public DB { SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, bool* is_blob_index); + Status DisableFileDeletionsWithLock(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 4f1ca1fe3..1ab4cf523 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -210,7 +210,15 @@ Status DBImpl::FlushMemTableToOutputFile( if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kFlush + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); @@ -574,7 +582,15 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // it is not because of CF drop. if (!s.ok() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kFlush + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); @@ -2687,7 +2703,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); @@ -2745,7 +2761,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } } - versions_->SetIOStatusOK(); + versions_->SetIOStatus(IOStatus::OK()); status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); @@ -2877,7 +2893,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); + // Error while writing to MANIFEST. + // In fact, versions_->io_status() can also be the result of renaming + // CURRENT file. With current code, it's just difficult to tell. So just + // be pessimistic and try write to a new MANIFEST. + // TODO: distinguish between MANIFEST write and CURRENT renaming + auto err_reason = versions_->io_status().ok() + ? BackgroundErrorReason::kCompaction + : BackgroundErrorReason::kManifestWrite; + error_handler_.SetBGError(io_s, err_reason); } else { error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index c43772dfd..ea0d12296 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -36,6 +36,62 @@ uint64_t DBImpl::MinObsoleteSstNumberToKeep() { return std::numeric_limits::max(); } +Status DBImpl::DisableFileDeletions() { + InstrumentedMutexLock l(&mutex_); + return DisableFileDeletionsWithLock(); +} + +Status DBImpl::DisableFileDeletionsWithLock() { + mutex_.AssertHeld(); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled"); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Disabled, but already disabled. Counter: %d", + disable_delete_obsolete_files_); + } + return Status::OK(); +} + +Status DBImpl::EnableFileDeletions(bool force) { + // Job id == 0 means that this is not our background process, but rather + // user thread + JobContext job_context(0); + bool file_deletion_enabled = false; + { + InstrumentedMutexLock l(&mutex_); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + file_deletion_enabled = true; + FindObsoleteFiles(&job_context, true); + bg_cv_.SignalAll(); + } + } + if (file_deletion_enabled) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + disable_delete_obsolete_files_); + } + job_context.Clean(); + LogFlush(immutable_db_options_.info_log); + return Status::OK(); +} + +bool DBImpl::IsFileDeletionsEnabled() const { + return 0 == disable_delete_obsolete_files_; +} + // * Returns the list of live files in 'sst_live' and 'blob_live'. // If it's doing full scan: // * Returns the list of all files in the filesystem in diff --git a/db/db_test.cc b/db/db_test.cc index 080a14eea..8d7cce460 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2990,10 +2990,11 @@ class ModelDB : public DB { Status SyncWAL() override { return Status::OK(); } -#ifndef ROCKSDB_LITE Status DisableFileDeletions() override { return Status::OK(); } Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); } +#ifndef ROCKSDB_LITE + Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { return Status::OK(); diff --git a/db/error_handler.cc b/db/error_handler.cc index 3c99dce99..1f7bbd7ec 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -51,9 +51,19 @@ std::map, Status::Code::kIOError, Status::SubCode::kNoSpace, false), Status::Severity::kHardError}, + // Errors during MANIFEST write + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, }; -std::map, Status::Severity> +std::map, + Status::Severity> DefaultErrorSeverityMap = { // Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, @@ -75,11 +85,11 @@ std::map, Status::Severity {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kCorruption, false), Status::Severity::kNoError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, true), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + true), Status::Severity::kFatalError}, - {std::make_tuple(BackgroundErrorReason::kFlush, - Status::Code::kIOError, false), + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + false), Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, @@ -94,30 +104,36 @@ std::map, Status::Severity {std::make_tuple(BackgroundErrorReason::kWriteCallback, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWrite, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> DefaultReasonMap = { // Errors during BG compaction {std::make_tuple(BackgroundErrorReason::kCompaction, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kCompaction, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during BG flush {std::make_tuple(BackgroundErrorReason::kFlush, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kFlush, false), - Status::Severity::kNoError}, + Status::Severity::kNoError}, // Errors during Write {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, // Errors during Memtable update {std::make_tuple(BackgroundErrorReason::kMemTable, true), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, {std::make_tuple(BackgroundErrorReason::kMemTable, false), - Status::Severity::kFatalError}, + Status::Severity::kFatalError}, }; void ErrorHandler::CancelErrorRecovery() { @@ -247,6 +263,10 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, if (recovery_in_prog_ && recovery_error_.ok()) { recovery_error_ = bg_io_err; } + if (BackgroundErrorReason::kManifestWrite == reason) { + // Always returns ok + db_->DisableFileDeletionsWithLock(); + } Status new_bg_io_err = bg_io_err; Status s; if (bg_io_err.GetDataLoss()) { diff --git a/db/internal_stats.cc b/db/internal_stats.cc index f729ee7c7..ff4c5f46c 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -798,7 +798,7 @@ bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, Version* /*version*/) { - *value = db->IsFileDeletionsEnabled(); + *value = db->IsFileDeletionsEnabled() ? 1 : 0; return true; } diff --git a/db/memtable_list.cc b/db/memtable_list.cc index acdba0896..89de07f3d 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -470,7 +470,7 @@ Status MemTableList::TryInstallMemtableFlushResults( } // this can release and reacquire the mutex. - vset->SetIOStatusOK(); + vset->SetIOStatus(IOStatus::OK()); s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, db_directory); *io_s = vset->io_status(); diff --git a/db/version_set.cc b/db/version_set.cc index 899e7125b..8696b57f3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4001,12 +4001,16 @@ Status VersionSet::ProcessManifestWrites( } if (s.ok()) { io_s = SyncManifest(env_, db_options_, descriptor_log_->file()); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); } if (!io_s.ok()) { io_status_ = io_s; s = io_s; ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", s.ToString().c_str()); + } else if (io_status_.IsIOError()) { + io_status_ = io_s; } } @@ -4018,6 +4022,8 @@ Status VersionSet::ProcessManifestWrites( if (!io_s.ok()) { io_status_ = io_s; s = io_s; + } else if (io_status_.IsIOError()) { + io_status_ = io_s; } TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); } diff --git a/db/version_set.h b/db/version_set.h index d1766d0bf..16661e097 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1162,7 +1162,7 @@ class VersionSet { IOStatus io_status() const { return io_status_; } // Set the IO Status to OK. Called before Manifest write if needed. - void SetIOStatusOK() { io_status_ = IOStatus::OK(); } + void SetIOStatus(const IOStatus& s) { io_status_ = s; } protected: using VersionBuilderMap = diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 337827191..08609f371 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1266,8 +1266,6 @@ class DB { // updated, false if user attempted to call if with seqnum <= current value. virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; -#ifndef ROCKSDB_LITE - // Prevent file deletions. Compactions will continue to occur, // but no obsolete files will be deleted. Calling this multiple // times have the same effect as calling it once. @@ -1284,6 +1282,7 @@ class DB { // threads call EnableFileDeletions() virtual Status EnableFileDeletions(bool force = true) = 0; +#ifndef ROCKSDB_LITE // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index d1c953f0f..97570713f 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -117,6 +117,7 @@ enum class BackgroundErrorReason { kCompaction, kWriteCallback, kMemTable, + kManifestWrite, }; enum class WriteStallCondition {