[wal changes 2/3] write with sync=true syncs previous unsynced wals to prevent illegal data loss
Summary: I'll just copy internal task summary here: " This sequence will cause data loss in the middle after an sync write: non-sync write key 1 flush triggered, not yet scheduled sync write key 2 system crash After rebooting, users might see key 2 but not key 1, which violates the API of sync write. This can be reproduced using unit test FaultInjectionTest::DISABLED_WriteOptionSyncTest. One way to fix it is for a sync write, if there is outstanding unsynced log files, we need to syc them too. " This diff should be considered together with the next diff D40905; in isolation this fix probably could be a little simpler. Test Plan: `make check`; added a test for that (DBTest.SyncingPreviousLogs) before noticing FaultInjectionTest.WriteOptionSyncTest (keeping both since mine asserts a bit more); both tests fail without this diff; for D40905 stacked on top of this diff, ran tests with ASAN, TSAN and valgrind Reviewers: rven, yhchiang, IslamAbdelRahman, anthony, kradhakrishnan, igor, sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D40899
This commit is contained in:
parent
06aebca592
commit
fe09a6dae3
@ -224,6 +224,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
|||||||
log_dir_synced_(false),
|
log_dir_synced_(false),
|
||||||
log_empty_(true),
|
log_empty_(true),
|
||||||
default_cf_handle_(nullptr),
|
default_cf_handle_(nullptr),
|
||||||
|
log_sync_cv_(&mutex_),
|
||||||
total_log_size_(0),
|
total_log_size_(0),
|
||||||
max_total_in_memory_state_(0),
|
max_total_in_memory_state_(0),
|
||||||
is_snapshot_supported_(true),
|
is_snapshot_supported_(true),
|
||||||
@ -367,6 +368,7 @@ DBImpl::~DBImpl() {
|
|||||||
for (auto l : logs_to_free_) {
|
for (auto l : logs_to_free_) {
|
||||||
delete l;
|
delete l;
|
||||||
}
|
}
|
||||||
|
logs_.clear();
|
||||||
|
|
||||||
// versions need to be destroyed before table_cache since it can hold
|
// versions need to be destroyed before table_cache since it can hold
|
||||||
// references to table_cache.
|
// references to table_cache.
|
||||||
@ -535,8 +537,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|||||||
versions_->GetObsoleteFiles(&job_context->sst_delete_files,
|
versions_->GetObsoleteFiles(&job_context->sst_delete_files,
|
||||||
job_context->min_pending_output);
|
job_context->min_pending_output);
|
||||||
|
|
||||||
uint64_t min_log_number = versions_->MinLogNumber();
|
|
||||||
if (!alive_log_files_.empty()) {
|
if (!alive_log_files_.empty()) {
|
||||||
|
uint64_t min_log_number = versions_->MinLogNumber();
|
||||||
// find newly obsoleted log files
|
// find newly obsoleted log files
|
||||||
while (alive_log_files_.begin()->number < min_log_number) {
|
while (alive_log_files_.begin()->number < min_log_number) {
|
||||||
auto& earliest = *alive_log_files_.begin();
|
auto& earliest = *alive_log_files_.begin();
|
||||||
@ -547,6 +549,18 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|||||||
// number < MinLogNumber().
|
// number < MinLogNumber().
|
||||||
assert(alive_log_files_.size());
|
assert(alive_log_files_.size());
|
||||||
}
|
}
|
||||||
|
while (!logs_.empty() && logs_.front().number < min_log_number) {
|
||||||
|
auto& log = logs_.front();
|
||||||
|
if (log.getting_synced) {
|
||||||
|
log_sync_cv_.Wait();
|
||||||
|
// logs_ could have changed while we were waiting.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
logs_to_free_.push_back(log.writer.release());
|
||||||
|
logs_.pop_front();
|
||||||
|
}
|
||||||
|
// Current log cannot be obsolete.
|
||||||
|
assert(!logs_.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
// We're just cleaning up for DB::Write().
|
// We're just cleaning up for DB::Write().
|
||||||
@ -597,6 +611,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Status DBImpl::SyncLog(log::Writer* log) {
|
||||||
|
assert(log);
|
||||||
|
return log->file()->Sync(db_options_.use_fsync);
|
||||||
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
|
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
|
||||||
const JobContext::CandidateFileInfo& second) {
|
const JobContext::CandidateFileInfo& second) {
|
||||||
@ -3448,11 +3467,22 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
uint64_t last_sequence = versions_->LastSequence();
|
uint64_t last_sequence = versions_->LastSequence();
|
||||||
WriteThread::Writer* last_writer = &w;
|
WriteThread::Writer* last_writer = &w;
|
||||||
autovector<WriteBatch*> write_batch_group;
|
autovector<WriteBatch*> write_batch_group;
|
||||||
|
bool need_wal_sync = !write_options.disableWAL && write_options.sync;
|
||||||
|
|
||||||
if (status.ok()) {
|
if (status.ok()) {
|
||||||
last_batch_group_size_ =
|
last_batch_group_size_ =
|
||||||
write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
|
write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
|
||||||
|
|
||||||
|
if (need_wal_sync) {
|
||||||
|
while (logs_.front().getting_synced) {
|
||||||
|
log_sync_cv_.Wait();
|
||||||
|
}
|
||||||
|
for (auto& log : logs_) {
|
||||||
|
assert(!log.getting_synced);
|
||||||
|
log.getting_synced = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Add to log and apply to memtable. We can release the lock
|
// Add to log and apply to memtable. We can release the lock
|
||||||
// during this phase since &w is currently responsible for logging
|
// during this phase since &w is currently responsible for logging
|
||||||
// and protects against concurrent loggers and concurrent writes
|
// and protects against concurrent loggers and concurrent writes
|
||||||
@ -3499,16 +3529,28 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
if (!write_options.disableWAL) {
|
if (!write_options.disableWAL) {
|
||||||
PERF_TIMER_GUARD(write_wal_time);
|
PERF_TIMER_GUARD(write_wal_time);
|
||||||
Slice log_entry = WriteBatchInternal::Contents(updates);
|
Slice log_entry = WriteBatchInternal::Contents(updates);
|
||||||
status = log_->AddRecord(log_entry);
|
status = logs_.back().writer->AddRecord(log_entry);
|
||||||
total_log_size_ += log_entry.size();
|
total_log_size_ += log_entry.size();
|
||||||
alive_log_files_.back().AddSize(log_entry.size());
|
alive_log_files_.back().AddSize(log_entry.size());
|
||||||
log_empty_ = false;
|
log_empty_ = false;
|
||||||
log_size = log_entry.size();
|
log_size = log_entry.size();
|
||||||
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
||||||
if (status.ok() && write_options.sync) {
|
if (status.ok() && need_wal_sync) {
|
||||||
RecordTick(stats_, WAL_FILE_SYNCED);
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
||||||
StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
|
StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
|
||||||
status = log_->file()->Sync(db_options_.use_fsync);
|
// It's safe to access logs_ with unlocked mutex_ here because:
|
||||||
|
// - we've set getting_synced=true for all logs,
|
||||||
|
// so other threads won't pop from logs_ while we're here,
|
||||||
|
// - only writer thread can push to logs_, and we're in
|
||||||
|
// writer thread, so no one will push to logs_,
|
||||||
|
// - as long as other threads don't modify it, it's safe to read
|
||||||
|
// from std::deque from multiple threads concurrently.
|
||||||
|
for (auto& log : logs_) {
|
||||||
|
status = SyncLog(log.writer.get());
|
||||||
|
if (!status.ok()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (status.ok() && !log_dir_synced_) {
|
if (status.ok() && !log_dir_synced_) {
|
||||||
// We only sync WAL directory the first time WAL syncing is
|
// We only sync WAL directory the first time WAL syncing is
|
||||||
// requested, so that in case users never turn on WAL sync,
|
// requested, so that in case users never turn on WAL sync,
|
||||||
@ -3557,14 +3599,27 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||||||
} else {
|
} else {
|
||||||
// Operation failed. Make sure sure mutex is held for cleanup code below.
|
// Operation failed. Make sure sure mutex is held for cleanup code below.
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (db_options_.paranoid_checks && !status.ok() &&
|
if (db_options_.paranoid_checks && !status.ok() &&
|
||||||
!status.IsBusy() && bg_error_.ok()) {
|
!status.IsBusy() && bg_error_.ok()) {
|
||||||
bg_error_ = status; // stop compaction & fail any further writes
|
bg_error_ = status; // stop compaction & fail any further writes
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
|
|
||||||
|
if (need_wal_sync) {
|
||||||
|
while (logs_.size() > 1) {
|
||||||
|
auto& log = logs_.front();
|
||||||
|
assert(log.getting_synced);
|
||||||
|
logs_to_free_.push_back(log.writer.release());
|
||||||
|
logs_.pop_front();
|
||||||
|
}
|
||||||
|
assert(logs_.back().getting_synced);
|
||||||
|
logs_.back().getting_synced = false;
|
||||||
|
log_sync_cv_.SignalAll();
|
||||||
|
}
|
||||||
|
|
||||||
write_thread_.ExitWriteThread(&w, last_writer, status);
|
write_thread_.ExitWriteThread(&w, last_writer, status);
|
||||||
|
|
||||||
mutex_.Unlock();
|
mutex_.Unlock();
|
||||||
@ -3675,9 +3730,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|||||||
if (creating_new_log) {
|
if (creating_new_log) {
|
||||||
logfile_number_ = new_log_number;
|
logfile_number_ = new_log_number;
|
||||||
assert(new_log != nullptr);
|
assert(new_log != nullptr);
|
||||||
logs_to_free_.push_back(log_.release());
|
|
||||||
log_.reset(new_log);
|
|
||||||
log_empty_ = true;
|
log_empty_ = true;
|
||||||
|
logs_.emplace_back(logfile_number_, std::unique_ptr<log::Writer>(new_log));
|
||||||
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
||||||
for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
|
for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
|
||||||
// all this is just optimization to delete logs that
|
// all this is just optimization to delete logs that
|
||||||
@ -4210,7 +4264,10 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
|||||||
impl->logfile_number_ = new_log_number;
|
impl->logfile_number_ = new_log_number;
|
||||||
unique_ptr<WritableFileWriter> file_writer(
|
unique_ptr<WritableFileWriter> file_writer(
|
||||||
new WritableFileWriter(std::move(lfile), opt_env_options));
|
new WritableFileWriter(std::move(lfile), opt_env_options));
|
||||||
impl->log_.reset(new log::Writer(std::move(file_writer)));
|
impl->logs_.emplace_back(
|
||||||
|
new_log_number,
|
||||||
|
std::unique_ptr<log::Writer>(
|
||||||
|
new log::Writer(std::move(file_writer))));
|
||||||
|
|
||||||
// set column family handles
|
// set column family handles
|
||||||
for (auto cf : column_families) {
|
for (auto cf : column_families) {
|
||||||
|
27
db/db_impl.h
27
db/db_impl.h
@ -308,6 +308,8 @@ class DBImpl : public DB {
|
|||||||
// It is not necessary to hold the mutex when invoking this method.
|
// It is not necessary to hold the mutex when invoking this method.
|
||||||
void PurgeObsoleteFiles(const JobContext& background_contet);
|
void PurgeObsoleteFiles(const JobContext& background_contet);
|
||||||
|
|
||||||
|
Status SyncLog(log::Writer* log);
|
||||||
|
|
||||||
ColumnFamilyHandle* DefaultColumnFamily() const override;
|
ColumnFamilyHandle* DefaultColumnFamily() const override;
|
||||||
|
|
||||||
const SnapshotList& snapshots() const { return snapshots_; }
|
const SnapshotList& snapshots() const { return snapshots_; }
|
||||||
@ -514,7 +516,6 @@ class DBImpl : public DB {
|
|||||||
// * whenever there is an error in background flush or compaction
|
// * whenever there is an error in background flush or compaction
|
||||||
InstrumentedCondVar bg_cv_;
|
InstrumentedCondVar bg_cv_;
|
||||||
uint64_t logfile_number_;
|
uint64_t logfile_number_;
|
||||||
unique_ptr<log::Writer> log_;
|
|
||||||
bool log_dir_synced_;
|
bool log_dir_synced_;
|
||||||
bool log_empty_;
|
bool log_empty_;
|
||||||
ColumnFamilyHandleImpl* default_cf_handle_;
|
ColumnFamilyHandleImpl* default_cf_handle_;
|
||||||
@ -522,13 +523,31 @@ class DBImpl : public DB {
|
|||||||
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
|
||||||
struct LogFileNumberSize {
|
struct LogFileNumberSize {
|
||||||
explicit LogFileNumberSize(uint64_t _number)
|
explicit LogFileNumberSize(uint64_t _number)
|
||||||
: number(_number), size(0), getting_flushed(false) {}
|
: number(_number) {}
|
||||||
void AddSize(uint64_t new_size) { size += new_size; }
|
void AddSize(uint64_t new_size) { size += new_size; }
|
||||||
uint64_t number;
|
uint64_t number;
|
||||||
uint64_t size;
|
uint64_t size = 0;
|
||||||
bool getting_flushed;
|
bool getting_flushed = false;
|
||||||
|
};
|
||||||
|
struct LogWriterNumber {
|
||||||
|
LogWriterNumber(uint64_t _number, std::unique_ptr<log::Writer> _writer)
|
||||||
|
: number(_number), writer(std::move(_writer)) {}
|
||||||
|
uint64_t number;
|
||||||
|
std::unique_ptr<log::Writer> writer;
|
||||||
|
// true for some prefix of logs_
|
||||||
|
bool getting_synced = false;
|
||||||
};
|
};
|
||||||
std::deque<LogFileNumberSize> alive_log_files_;
|
std::deque<LogFileNumberSize> alive_log_files_;
|
||||||
|
// Log files that aren't fully synced, and the current log file.
|
||||||
|
// Synchronization:
|
||||||
|
// - push_back() is done from write thread with locked mutex_,
|
||||||
|
// - pop_front() is done from any thread with locked mutex_,
|
||||||
|
// - back() and items with getting_synced=true are not popped,
|
||||||
|
// - it follows that write thread with unlocked mutex_ can safely access
|
||||||
|
// back() and items with getting_synced=true.
|
||||||
|
std::deque<LogWriterNumber> logs_;
|
||||||
|
// Signaled when getting_synced becomes false for some of the logs_.
|
||||||
|
InstrumentedCondVar log_sync_cv_;
|
||||||
uint64_t total_log_size_;
|
uint64_t total_log_size_;
|
||||||
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
||||||
// [write_buffer_size * max_write_buffer_number] over all column families
|
// [write_buffer_size * max_write_buffer_number] over all column families
|
||||||
|
@ -807,10 +807,8 @@ class SleepingBackgroundTask {
|
|||||||
bool done_with_sleep_;
|
bool done_with_sleep_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Disable the test because it is not passing.
|
|
||||||
// Previous log file is not fsynced if sync is forced after log rolling.
|
// Previous log file is not fsynced if sync is forced after log rolling.
|
||||||
// TODO(FB internal task#6730880) Fix the bug
|
TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
|
||||||
TEST_P(FaultInjectionTest, DISABLED_WriteOptionSyncTest) {
|
|
||||||
SleepingBackgroundTask sleeping_task_low;
|
SleepingBackgroundTask sleeping_task_low;
|
||||||
env_->SetBackgroundThreads(1, Env::HIGH);
|
env_->SetBackgroundThreads(1, Env::HIGH);
|
||||||
// Block the job queue to prevent flush job from running.
|
// Block the job queue to prevent flush job from running.
|
||||||
|
Loading…
Reference in New Issue
Block a user