db_stress verify with lost unsynced operations (#8966)

Summary:
When a previous run left behind historical state/trace files (implying it was run with --sync_fault_injection set), this PR uses them to restore the expected state according to the DB's recovered sequence number. That way, a tail of latest unsynced operations are permitted to be dropped, as is the case when data in page cache or certain `Env`s is lost. The point of the verification in this scenario is just to ensure there is no hole in the recovered data.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8966

Test Plan:
- ran it a while, made sure it is restoring expected values using the historical state/trace files:
```
$ rm -rf ./tmp-db/ ./exp/ && mkdir -p ./tmp-db/ ./exp/ && while ./db_stress -compression_type=none -clear_column_family_one_in=0 -expected_values_dir=./exp -sync_fault_injection=1 -destroy_db_initially=0 -db=./tmp-db -max_key=1000000 -ops_per_thread=10000 -reopen=0 -threads=32 ; do : ; done
```

Reviewed By: pdillinger

Differential Revision: D31219445

Pulled By: ajkr

fbshipit-source-id: f0e1d51fe5b35465b00565c33331190ea38ba0ad
This commit is contained in:
Andrew Kryczka 2021-12-15 12:53:32 -08:00 committed by Facebook GitHub Bot
parent 806d8916da
commit c9818b3325
7 changed files with 278 additions and 6 deletions

View File

@ -225,7 +225,7 @@ size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) {
((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult; ((rand % kRandomValueMaxFactor) + 1) * FLAGS_value_size_mult;
assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t)); assert(value_sz <= max_sz && value_sz >= sizeof(uint32_t));
(void)max_sz; (void)max_sz;
*((uint32_t*)v) = rand; PutUnaligned(reinterpret_cast<uint32_t*>(v), rand);
for (size_t i = sizeof(uint32_t); i < value_sz; i++) { for (size_t i = sizeof(uint32_t); i < value_sz; i++) {
v[i] = (char)(rand ^ i); v[i] = (char)(rand ^ i);
} }
@ -233,6 +233,13 @@ size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) {
return value_sz; // the size of the value set. return value_sz; // the size of the value set.
} }
uint32_t GetValueBase(Slice s) {
assert(s.size() >= sizeof(uint32_t));
uint32_t res;
GetUnaligned(reinterpret_cast<const uint32_t*>(s.data()), &res);
return res;
}
std::string NowNanosStr() { std::string NowNanosStr() {
uint64_t t = db_stress_env->NowNanos(); uint64_t t = db_stress_env->NowNanos();
std::string ret; std::string ret;

View File

@ -567,6 +567,7 @@ extern std::vector<int64_t> GenerateNKeys(ThreadState* thread, int num_keys,
uint64_t iteration); uint64_t iteration);
extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz); extern size_t GenerateValue(uint32_t rand, char* v, size_t max_sz);
extern uint32_t GetValueBase(Slice s);
extern StressTest* CreateCfConsistencyStressTest(); extern StressTest* CreateCfConsistencyStressTest();
extern StressTest* CreateBatchedOpsStressTest(); extern StressTest* CreateBatchedOpsStressTest();

View File

@ -818,7 +818,7 @@ DEFINE_bool(sync_fault_injection, false,
"and unsynced data in DB will lost after crash. In such a case we " "and unsynced data in DB will lost after crash. In such a case we "
"track DB changes in a trace file (\"*.trace\") in " "track DB changes in a trace file (\"*.trace\") in "
"--expected_values_dir for verifying there are no holes in the " "--expected_values_dir for verifying there are no holes in the "
"recovered data (future work)."); "recovered data.");
DEFINE_bool(best_efforts_recovery, false, DEFINE_bool(best_efforts_recovery, false,
"If true, use best efforts recovery."); "If true, use best efforts recovery.");

View File

@ -254,6 +254,10 @@ class SharedState {
return expected_state_manager_->SaveAtAndAfter(db); return expected_state_manager_->SaveAtAndAfter(db);
} }
bool HasHistory() { return expected_state_manager_->HasHistory(); }
Status Restore(DB* db) { return expected_state_manager_->Restore(db); }
// Requires external locking covering all keys in `cf`. // Requires external locking covering all keys in `cf`.
void ClearColumnFamily(int cf) { void ClearColumnFamily(int cf) {
return expected_state_manager_->ClearColumnFamily(cf); return expected_state_manager_->ClearColumnFamily(cf);

View File

@ -307,7 +307,20 @@ void StressTest::FinishInitDb(SharedState* shared) {
fprintf(stdout, "Compaction filter factory: %s\n", fprintf(stdout, "Compaction filter factory: %s\n",
compaction_filter_factory->Name()); compaction_filter_factory->Name());
} }
// TODO(ajkr): First restore if there's already a trace.
if (shared->HasHistory() && IsStateTracked()) {
// The way it works right now is, if there's any history, that means the
// previous run mutating the DB had all its operations traced, in which case
// we should always be able to `Restore()` the expected values to match the
// `db_`'s current seqno.
Status s = shared->Restore(db_);
if (!s.ok()) {
fprintf(stderr, "Error restoring historical expected values: %s\n",
s.ToString().c_str());
exit(1);
}
}
if (FLAGS_sync_fault_injection && IsStateTracked()) { if (FLAGS_sync_fault_injection && IsStateTracked()) {
Status s = shared->SaveAtAndAfter(db_); Status s = shared->SaveAtAndAfter(db_);
if (!s.ok()) { if (!s.ok()) {

View File

@ -7,8 +7,10 @@
#include "db_stress_tool/expected_state.h" #include "db_stress_tool/expected_state.h"
#include "db_stress_tool/db_stress_common.h"
#include "db_stress_tool/db_stress_shared_state.h" #include "db_stress_tool/db_stress_shared_state.h"
#include "rocksdb/trace_reader_writer.h" #include "rocksdb/trace_reader_writer.h"
#include "rocksdb/trace_record_result.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
@ -318,13 +320,220 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* /* db */) {
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
bool FileExpectedStateManager::HasHistory() {
return saved_seqno_ != kMaxSequenceNumber;
}
#ifndef ROCKSDB_LITE
// An `ExpectedStateTraceRecordHandler` applies a configurable number of
// write operation trace records to the configured expected state. It is used in
// `FileExpectedStateManager::Restore()` to sync the expected state with the
// DB's post-recovery state.
class ExpectedStateTraceRecordHandler : public TraceRecord::Handler,
public WriteBatch::Handler {
public:
ExpectedStateTraceRecordHandler(uint64_t max_write_ops, ExpectedState* state)
: max_write_ops_(max_write_ops), state_(state) {}
~ExpectedStateTraceRecordHandler() {
assert(num_write_ops_ == max_write_ops_);
}
Status Handle(const WriteQueryTraceRecord& record,
std::unique_ptr<TraceRecordResult>* /* result */) override {
if (num_write_ops_ == max_write_ops_) {
return Status::OK();
}
WriteBatch batch(record.GetWriteBatchRep().ToString());
return batch.Iterate(this);
}
// Ignore reads.
Status Handle(const GetQueryTraceRecord& /* record */,
std::unique_ptr<TraceRecordResult>* /* result */) override {
return Status::OK();
}
// Ignore reads.
Status Handle(const IteratorSeekQueryTraceRecord& /* record */,
std::unique_ptr<TraceRecordResult>* /* result */) override {
return Status::OK();
}
// Ignore reads.
Status Handle(const MultiGetQueryTraceRecord& /* record */,
std::unique_ptr<TraceRecordResult>* /* result */) override {
return Status::OK();
}
// Below are the WriteBatch::Handler overrides. We could use a separate
// object, but it's convenient and works to share state with the
// `TraceRecord::Handler`.
Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
uint64_t key_id;
if (!GetIntVal(key.ToString(), &key_id)) {
return Status::Corruption("unable to parse key", key.ToString());
}
uint32_t value_id = GetValueBase(value);
state_->Put(column_family_id, static_cast<int64_t>(key_id), value_id,
false /* pending */);
++num_write_ops_;
return Status::OK();
}
Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
uint64_t key_id;
if (!GetIntVal(key.ToString(), &key_id)) {
return Status::Corruption("unable to parse key", key.ToString());
}
state_->Delete(column_family_id, static_cast<int64_t>(key_id),
false /* pending */);
++num_write_ops_;
return Status::OK();
}
Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
return DeleteCF(column_family_id, key);
}
Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
const Slice& end_key) override {
uint64_t begin_key_id, end_key_id;
if (!GetIntVal(begin_key.ToString(), &begin_key_id)) {
return Status::Corruption("unable to parse begin key",
begin_key.ToString());
}
if (!GetIntVal(end_key.ToString(), &end_key_id)) {
return Status::Corruption("unable to parse end key", end_key.ToString());
}
state_->DeleteRange(column_family_id, static_cast<int64_t>(begin_key_id),
static_cast<int64_t>(end_key_id), false /* pending */);
++num_write_ops_;
return Status::OK();
}
Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
return PutCF(column_family_id, key, value);
}
private:
uint64_t num_write_ops_ = 0;
uint64_t max_write_ops_;
ExpectedState* state_;
};
Status FileExpectedStateManager::Restore(DB* db) {
assert(HasHistory());
SequenceNumber seqno = db->GetLatestSequenceNumber();
if (seqno < saved_seqno_) {
return Status::Corruption("DB is older than any restorable expected state");
}
std::string state_filename = ToString(saved_seqno_) + kStateFilenameSuffix;
std::string state_file_path = GetPathForFilename(state_filename);
std::string latest_file_temp_path =
GetTempPathForFilename(kLatestBasename + kStateFilenameSuffix);
std::string latest_file_path =
GetPathForFilename(kLatestBasename + kStateFilenameSuffix);
std::string trace_filename = ToString(saved_seqno_) + kTraceFilenameSuffix;
std::string trace_file_path = GetPathForFilename(trace_filename);
std::unique_ptr<TraceReader> trace_reader;
Status s = NewFileTraceReader(Env::Default(), EnvOptions(), trace_file_path,
&trace_reader);
if (s.ok()) {
// We are going to replay on top of "`seqno`.state" to create a new
// "LATEST.state". Start off by creating a tempfile so we can later make the
// new "LATEST.state" appear atomically using `RenameFile()`.
s = CopyFile(FileSystem::Default(), state_file_path, latest_file_temp_path,
0 /* size */, false /* use_fsync */);
}
{
std::unique_ptr<Replayer> replayer;
std::unique_ptr<ExpectedState> state;
std::unique_ptr<TraceRecord::Handler> handler;
if (s.ok()) {
state.reset(new FileExpectedState(latest_file_temp_path, max_key_,
num_column_families_));
s = state->Open(false /* create */);
}
if (s.ok()) {
handler.reset(new ExpectedStateTraceRecordHandler(seqno - saved_seqno_,
state.get()));
// TODO(ajkr): An API limitation requires we provide `handles` although
// they will be unused since we only use the replayer for reading records.
// Just give a default CFH for now to satisfy the requirement.
s = db->NewDefaultReplayer({db->DefaultColumnFamily()} /* handles */,
std::move(trace_reader), &replayer);
}
if (s.ok()) {
s = replayer->Prepare();
}
for (;;) {
std::unique_ptr<TraceRecord> record;
s = replayer->Next(&record);
if (!s.ok()) {
break;
}
std::unique_ptr<TraceRecordResult> res;
record->Accept(handler.get(), &res);
}
if (s.IsIncomplete()) {
// OK because `Status::Incomplete` is expected upon finishing all the
// trace records.
s = Status::OK();
}
}
if (s.ok()) {
s = FileSystem::Default()->RenameFile(latest_file_temp_path,
latest_file_path, IOOptions(),
nullptr /* dbg */);
}
if (s.ok()) {
latest_.reset(new FileExpectedState(latest_file_path, max_key_,
num_column_families_));
s = latest_->Open(false /* create */);
}
// Delete old state/trace files. We must delete the state file first.
// Otherwise, a crash-recovery immediately after deleting the trace file could
// lead to `Restore()` unable to replay to `seqno`.
if (s.ok()) {
s = Env::Default()->DeleteFile(state_file_path);
}
if (s.ok()) {
saved_seqno_ = kMaxSequenceNumber;
s = Env::Default()->DeleteFile(trace_file_path);
}
return s;
}
#else // ROCKSDB_LITE
Status FileExpectedStateManager::Restore(DB* /* db */) {
return Status::NotSupported();
}
#endif // ROCKSDB_LITE
Status FileExpectedStateManager::Clean() { Status FileExpectedStateManager::Clean() {
std::vector<std::string> expected_state_dir_children; std::vector<std::string> expected_state_dir_children;
Status s = Env::Default()->GetChildren(expected_state_dir_path_, Status s = Env::Default()->GetChildren(expected_state_dir_path_,
&expected_state_dir_children); &expected_state_dir_children);
// An incomplete `Open()` or incomplete `SaveAtAndAfter()` could have left // An incomplete `Open()` or incomplete `SaveAtAndAfter()` could have left
// behind invalid temporary files. An incomplete `SaveAtAndAfter()` could have // behind invalid temporary files. An incomplete `SaveAtAndAfter()` could have
// also left behind stale state/trace files. // also left behind stale state/trace files. An incomplete `Restore()` could
// have left behind stale trace files.
for (size_t i = 0; s.ok() && i < expected_state_dir_children.size(); ++i) { for (size_t i = 0; s.ok() && i < expected_state_dir_children.size(); ++i) {
const auto& filename = expected_state_dir_children[i]; const auto& filename = expected_state_dir_children[i];
if (filename.rfind(kTempFilenamePrefix, 0 /* pos */) == 0 && if (filename.rfind(kTempFilenamePrefix, 0 /* pos */) == 0 &&
@ -349,7 +558,6 @@ Status FileExpectedStateManager::Clean() {
ParseUint64(filename.substr( ParseUint64(filename.substr(
0, filename.size() - kTraceFilenameSuffix.size())) < 0, filename.size() - kTraceFilenameSuffix.size())) <
saved_seqno_) { saved_seqno_) {
assert(saved_seqno_ != kMaxSequenceNumber);
// Delete stale trace files. // Delete stale trace files.
s = Env::Default()->DeleteFile(GetPathForFilename(filename)); s = Env::Default()->DeleteFile(GetPathForFilename(filename));
} }

View File

@ -133,13 +133,30 @@ class ExpectedStateManager {
virtual Status Open() = 0; virtual Status Open() = 0;
// Saves expected values for the current state of `db` and begins tracking // Saves expected values for the current state of `db` and begins tracking
// changes. // changes. Following a successful `SaveAtAndAfter()`, `Restore()` can be
// called on the same DB, as long as its state does not roll back to before
// its current state.
// //
// Requires external locking preventing concurrent execution with any other // Requires external locking preventing concurrent execution with any other
// member function. Furthermore, `db` must not be mutated while this function // member function. Furthermore, `db` must not be mutated while this function
// is executing. // is executing.
virtual Status SaveAtAndAfter(DB* db) = 0; virtual Status SaveAtAndAfter(DB* db) = 0;
// Returns true if at least one state of historical expected values can be
// restored.
//
// Requires external locking preventing concurrent execution with any other
// member function.
virtual bool HasHistory() = 0;
// Restores expected values according to the current state of `db`. See
// `SaveAtAndAfter()` for conditions where this can be called.
//
// Requires external locking preventing concurrent execution with any other
// member function. Furthermore, `db` must not be mutated while this function
// is executing.
virtual Status Restore(DB* db) = 0;
// Requires external locking covering all keys in `cf`. // Requires external locking covering all keys in `cf`.
void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
@ -204,8 +221,21 @@ class FileExpectedStateManager : public ExpectedStateManager {
// //
// This implementation makes a copy of "LATEST.state" into // This implementation makes a copy of "LATEST.state" into
// "<current seqno>.state", and starts a trace in "<current seqno>.trace". // "<current seqno>.state", and starts a trace in "<current seqno>.trace".
// Due to using external files, a following `Restore()` can happen even
// from a different process.
Status SaveAtAndAfter(DB* db) override; Status SaveAtAndAfter(DB* db) override;
// See `ExpectedStateManager::HasHistory()` API doc.
bool HasHistory() override;
// See `ExpectedStateManager::Restore()` API doc.
//
// Say `db->GetLatestSequenceNumber()` was `a` last time `SaveAtAndAfter()`
// was called and now it is `b`. Then this function replays `b - a` write
// operations from "`a`.trace" onto "`a`.state", and then copies the resulting
// file into "LATEST.state".
Status Restore(DB* db) override;
private: private:
// Requires external locking preventing concurrent execution with any other // Requires external locking preventing concurrent execution with any other
// member function. // member function.
@ -238,6 +268,15 @@ class AnonExpectedStateManager : public ExpectedStateManager {
return Status::NotSupported(); return Status::NotSupported();
} }
// See `ExpectedStateManager::HasHistory()` API doc.
bool HasHistory() override { return false; }
// See `ExpectedStateManager::Restore()` API doc.
//
// This implementation returns `Status::NotSupported` since we do not
// currently have a need to keep history of expected state within a process.
Status Restore(DB* /* db */) override { return Status::NotSupported(); }
// Requires external locking preventing concurrent execution with any other // Requires external locking preventing concurrent execution with any other
// member function. // member function.
Status Open() override; Status Open() override;