Fix bug affecting GetSortedWalFiles, Backups, Checkpoint (#9208)
Summary: Saw an error like this: `Backup failed -- IO error: No such file or directory: While opening a file for sequentially reading: /dev/shm/rocksdb/rocksdb_crashtest_blackbox/004426.log: No such file or directory` Unfortunately, GetSortedWalFiles (used by Backups, Checkpoint, etc.) relies on no file deletions happening while it's operating, which means not only disabling (more) deletions, but also ensuring any pending deletions are completed. Two fixes related to this: * There was a gap in several places between decrementing pending_purge_obsolete_files_ and incrementing bg_purge_scheduled_ where the db mutex would be released and GetSortedWalFiles (and others) could get false information that no deletions are pending. * The fix to https://github.com/facebook/rocksdb/issues/8591 (disabling deletions in GetSortedWalFiles) seems incomplete because it doesn't prevent pending deletions from occurring during the operation (if deletions are not already disabled, the case that was to be fixed by the change). Pull Request resolved: https://github.com/facebook/rocksdb/pull/9208 Test Plan: existing tests (it's hard to write a test for interleavings that are now excluded - this is what stress test is for) Reviewed By: ajkr Differential Revision: D32630675 Pulled By: pdillinger fbshipit-source-id: a121e3da648de130cd24d44c524232f4eb22f178
This commit is contained in:
parent
8e43279f93
commit
57a817df76
@ -75,11 +75,6 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
|
||||
bool defer_purge =
|
||||
db_->immutable_db_options().avoid_unnecessary_blocking_io;
|
||||
db_->PurgeObsoleteFiles(job_context, defer_purge);
|
||||
if (defer_purge) {
|
||||
mutex_->Lock();
|
||||
db_->SchedulePurge();
|
||||
mutex_->Unlock();
|
||||
}
|
||||
}
|
||||
job_context.Clean();
|
||||
}
|
||||
|
@ -127,31 +127,30 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
|
||||
}
|
||||
|
||||
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
|
||||
{
|
||||
// If caller disabled deletions, this function should return files that are
|
||||
// guaranteed not to be deleted until deletions are re-enabled. We need to
|
||||
// wait for pending purges to finish since WalManager doesn't know which
|
||||
// files are going to be purged. Additional purges won't be scheduled as
|
||||
// long as deletions are disabled (so the below loop must terminate).
|
||||
// Also note that we disable deletions anyway to avoid the case where a
|
||||
// file is deleted in the middle of the scan, causing IO error.
|
||||
Status deletions_disabled = DisableFileDeletions();
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
while (disable_delete_obsolete_files_ > 0 &&
|
||||
(pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0)) {
|
||||
while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
|
||||
bg_cv_.Wait();
|
||||
}
|
||||
}
|
||||
|
||||
// Disable deletion in order to avoid the case where a file is deleted in
|
||||
// the middle of the process so IO error is returned.
|
||||
Status s = DisableFileDeletions();
|
||||
bool file_deletion_supported = !s.IsNotSupported();
|
||||
if (s.ok() || !file_deletion_supported) {
|
||||
s = wal_manager_.GetSortedWalFiles(files);
|
||||
if (file_deletion_supported) {
|
||||
Status s2 = EnableFileDeletions(false);
|
||||
if (!s2.ok() && s.ok()) {
|
||||
s = s2;
|
||||
}
|
||||
}
|
||||
Status s = wal_manager_.GetSortedWalFiles(files);
|
||||
|
||||
// DisableFileDeletions / EnableFileDeletions not supported in read-only DB
|
||||
if (deletions_disabled.ok()) {
|
||||
Status s2 = EnableFileDeletions(/*force*/ false);
|
||||
assert(s2.ok());
|
||||
s2.PermitUncheckedError();
|
||||
} else {
|
||||
assert(deletions_disabled.IsNotSupported());
|
||||
}
|
||||
|
||||
return s;
|
||||
|
@ -1546,6 +1546,8 @@ void DBImpl::BackgroundCallPurge() {
|
||||
mutex_.Lock();
|
||||
}
|
||||
|
||||
assert(bg_purge_scheduled_ > 0);
|
||||
|
||||
// Can't use iterator to go over purge_files_ because inside the loop we're
|
||||
// unlocking the mutex that protects purge_files_.
|
||||
while (!purge_files_.empty()) {
|
||||
@ -1613,17 +1615,7 @@ static void CleanupIteratorState(void* arg1, void* /*arg2*/) {
|
||||
delete state->super_version;
|
||||
}
|
||||
if (job_context.HaveSomethingToDelete()) {
|
||||
if (state->background_purge) {
|
||||
// PurgeObsoleteFiles here does not delete files. Instead, it adds the
|
||||
// files to be deleted to a job queue, and deletes it in a separate
|
||||
// background thread.
|
||||
state->db->PurgeObsoleteFiles(job_context, true /* schedule only */);
|
||||
state->mu->Lock();
|
||||
state->db->SchedulePurge();
|
||||
state->mu->Unlock();
|
||||
} else {
|
||||
state->db->PurgeObsoleteFiles(job_context);
|
||||
}
|
||||
state->db->PurgeObsoleteFiles(job_context, state->background_purge);
|
||||
}
|
||||
job_context.Clean();
|
||||
}
|
||||
|
@ -56,6 +56,8 @@ Status DBImpl::DisableFileDeletions() {
|
||||
return s;
|
||||
}
|
||||
|
||||
// FIXME: can be inconsistent with DisableFileDeletions in cases like
|
||||
// DBImplReadOnly
|
||||
Status DBImpl::DisableFileDeletionsWithLock() {
|
||||
mutex_.AssertHeld();
|
||||
++disable_delete_obsolete_files_;
|
||||
@ -642,6 +644,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
--pending_purge_obsolete_files_;
|
||||
assert(pending_purge_obsolete_files_ >= 0);
|
||||
if (schedule_only) {
|
||||
// Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
|
||||
// while holding mutex (for GetSortedWalFiles() etc.)
|
||||
SchedulePurge();
|
||||
}
|
||||
if (pending_purge_obsolete_files_ == 0) {
|
||||
bg_cv_.SignalAll();
|
||||
}
|
||||
@ -657,11 +664,6 @@ void DBImpl::DeleteObsoleteFiles() {
|
||||
if (job_context.HaveSomethingToDelete()) {
|
||||
bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
|
||||
PurgeObsoleteFiles(job_context, defer_purge);
|
||||
if (defer_purge) {
|
||||
mutex_.Lock();
|
||||
SchedulePurge();
|
||||
mutex_.Unlock();
|
||||
}
|
||||
}
|
||||
job_context.Clean();
|
||||
mutex_.Lock();
|
||||
|
Loading…
Reference in New Issue
Block a user