Test correctness with WAL disabled in non-txn blackbox crash tests (#9338)
Summary: Recently we added the ability to verify some prefix of operations are recovered (AKA no "hole" in the recovered data) (https://github.com/facebook/rocksdb/issues/8966). Besides testing unsynced data loss scenarios, it is also useful to test WAL disabled use cases, where unflushed writes are expected to be lost. Note RocksDB only offers the prefix-recovery guarantee to WAL-disabled use cases that use atomic flush, so crash test always enables atomic flush when WAL is disabled. To verify WAL-disabled crash-recovery correctness globally, i.e., also in whitebox and blackbox transaction tests, it is possible but requires further changes. I added TODOs in db_crashtest.py. Depends on https://github.com/facebook/rocksdb/issues/9305. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9338 Test Plan: Running all crash tests and many instances of blackbox. Sandcastle links are in Phabricator diff test plan. Reviewed By: riversand963 Differential Revision: D33345333 Pulled By: ajkr fbshipit-source-id: f56dd7d2e5a78d59301bf4fc3fedb980eb31e0ce
This commit is contained in:
parent
b860a42158
commit
6892f19b11
@ -312,7 +312,7 @@ void StressTest::FinishInitDb(SharedState* shared) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (FLAGS_sync_fault_injection && IsStateTracked()) {
|
if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) {
|
||||||
Status s = shared->SaveAtAndAfter(db_);
|
Status s = shared->SaveAtAndAfter(db_);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "Error enabling history tracing: %s\n",
|
fprintf(stderr, "Error enabling history tracing: %s\n",
|
||||||
@ -1449,7 +1449,20 @@ Status StressTest::TestBackupRestore(
|
|||||||
test_opts.file_sizes = thread->rand.OneIn(2) == 0;
|
test_opts.file_sizes = thread->rand.OneIn(2) == 0;
|
||||||
TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts);
|
TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts);
|
||||||
}
|
}
|
||||||
s = backup_engine->CreateNewBackup(db_);
|
CreateBackupOptions create_opts;
|
||||||
|
if (FLAGS_disable_wal) {
|
||||||
|
// The verification can only work when latest value of `key` is backed up,
|
||||||
|
// which requires flushing in case of WAL disabled.
|
||||||
|
//
|
||||||
|
// Note this triggers a flush with a key lock held. Meanwhile, operations
|
||||||
|
// like flush/compaction may attempt to grab key locks like in
|
||||||
|
// `DbStressCompactionFilter`. The philosophy around preventing deadlock
|
||||||
|
// is the background operation key lock acquisition only tries but does
|
||||||
|
// not wait for the lock. So here in the foreground it is OK to hold the
|
||||||
|
// lock and wait on a background operation (flush).
|
||||||
|
create_opts.flush_before_backup = true;
|
||||||
|
}
|
||||||
|
s = backup_engine->CreateNewBackup(create_opts, db_);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
from = "BackupEngine::CreateNewBackup";
|
from = "BackupEngine::CreateNewBackup";
|
||||||
}
|
}
|
||||||
@ -1902,6 +1915,9 @@ void StressTest::TestCompactFiles(ThreadState* thread,
|
|||||||
|
|
||||||
Status StressTest::TestFlush(const std::vector<int>& rand_column_families) {
|
Status StressTest::TestFlush(const std::vector<int>& rand_column_families) {
|
||||||
FlushOptions flush_opts;
|
FlushOptions flush_opts;
|
||||||
|
if (FLAGS_atomic_flush) {
|
||||||
|
return db_->Flush(flush_opts, column_families_);
|
||||||
|
}
|
||||||
std::vector<ColumnFamilyHandle*> cfhs;
|
std::vector<ColumnFamilyHandle*> cfhs;
|
||||||
std::for_each(rand_column_families.begin(), rand_column_families.end(),
|
std::for_each(rand_column_families.begin(), rand_column_families.end(),
|
||||||
[this, &cfhs](int k) { cfhs.push_back(column_families_[k]); });
|
[this, &cfhs](int k) { cfhs.push_back(column_families_[k]); });
|
||||||
@ -2827,7 +2843,7 @@ void StressTest::Reopen(ThreadState* thread) {
|
|||||||
clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
|
clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
|
||||||
Open();
|
Open();
|
||||||
|
|
||||||
if (FLAGS_sync_fault_injection && IsStateTracked()) {
|
if ((FLAGS_sync_fault_injection || FLAGS_disable_wal) && IsStateTracked()) {
|
||||||
Status s = thread->shared->SaveAtAndAfter(db_);
|
Status s = thread->shared->SaveAtAndAfter(db_);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "Error enabling history tracing: %s\n",
|
fprintf(stderr, "Error enabling history tracing: %s\n",
|
||||||
|
@ -91,7 +91,6 @@ default_params = {
|
|||||||
"progress_reports": 0,
|
"progress_reports": 0,
|
||||||
"readpercent": 45,
|
"readpercent": 45,
|
||||||
"recycle_log_file_num": lambda: random.randint(0, 1),
|
"recycle_log_file_num": lambda: random.randint(0, 1),
|
||||||
"reopen": 20,
|
|
||||||
"snapshot_hold_ops": 100000,
|
"snapshot_hold_ops": 100000,
|
||||||
"sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]),
|
"sst_file_manager_bytes_per_sec": lambda: random.choice([0, 104857600]),
|
||||||
"sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]),
|
"sst_file_manager_bytes_per_truncate": lambda: random.choice([0, 1048576]),
|
||||||
@ -207,20 +206,28 @@ def is_direct_io_supported(dbname):
|
|||||||
|
|
||||||
|
|
||||||
blackbox_default_params = {
|
blackbox_default_params = {
|
||||||
|
"disable_wal": lambda: random.choice([0, 0, 0, 1]),
|
||||||
# total time for this script to test db_stress
|
# total time for this script to test db_stress
|
||||||
"duration": 6000,
|
"duration": 6000,
|
||||||
# time for one db_stress instance to run
|
# time for one db_stress instance to run
|
||||||
"interval": 120,
|
"interval": 120,
|
||||||
# since we will be killing anyway, use large value for ops_per_thread
|
# since we will be killing anyway, use large value for ops_per_thread
|
||||||
"ops_per_thread": 100000000,
|
"ops_per_thread": 100000000,
|
||||||
|
"reopen": 0,
|
||||||
"set_options_one_in": 10000,
|
"set_options_one_in": 10000,
|
||||||
}
|
}
|
||||||
|
|
||||||
whitebox_default_params = {
|
whitebox_default_params = {
|
||||||
|
# TODO: enable this once we figure out how to adjust kill odds for WAL-
|
||||||
|
# disabled runs, and either (1) separate full `db_stress` runs out of
|
||||||
|
# whitebox crash or (2) support verification at end of `db_stress` runs
|
||||||
|
# that ran with WAL disabled.
|
||||||
|
"disable_wal": 0,
|
||||||
"duration": 10000,
|
"duration": 10000,
|
||||||
"log2_keys_per_lock": 10,
|
"log2_keys_per_lock": 10,
|
||||||
"ops_per_thread": 200000,
|
"ops_per_thread": 200000,
|
||||||
"random_kill_odd": 888887,
|
"random_kill_odd": 888887,
|
||||||
|
"reopen": 20,
|
||||||
}
|
}
|
||||||
|
|
||||||
simple_default_params = {
|
simple_default_params = {
|
||||||
@ -269,6 +276,8 @@ txn_params = {
|
|||||||
# Avoid lambda to set it once for the entire test
|
# Avoid lambda to set it once for the entire test
|
||||||
"txn_write_policy": random.randint(0, 2),
|
"txn_write_policy": random.randint(0, 2),
|
||||||
"unordered_write": random.randint(0, 1),
|
"unordered_write": random.randint(0, 1),
|
||||||
|
# TODO: there is such a thing as transactions with WAL disabled. We should
|
||||||
|
# cover that case.
|
||||||
"disable_wal": 0,
|
"disable_wal": 0,
|
||||||
# OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
|
# OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
|
||||||
"checkpoint_one_in": 0,
|
"checkpoint_one_in": 0,
|
||||||
@ -352,6 +361,10 @@ def finalize_and_sanitize(src_params):
|
|||||||
dest_params["allow_concurrent_memtable_write"] = 1
|
dest_params["allow_concurrent_memtable_write"] = 1
|
||||||
if dest_params.get("disable_wal", 0) == 1:
|
if dest_params.get("disable_wal", 0) == 1:
|
||||||
dest_params["atomic_flush"] = 1
|
dest_params["atomic_flush"] = 1
|
||||||
|
# The `DbStressCompactionFilter` can apply memtable updates to SST
|
||||||
|
# files, which would be problematic without WAL since such updates are
|
||||||
|
# expected to be lost in crash recoveries.
|
||||||
|
dest_params["enable_compaction_filter"] = 0
|
||||||
dest_params["sync"] = 0
|
dest_params["sync"] = 0
|
||||||
dest_params["write_fault_one_in"] = 0
|
dest_params["write_fault_one_in"] = 0
|
||||||
if dest_params.get("open_files", 1) != -1:
|
if dest_params.get("open_files", 1) != -1:
|
||||||
|
Loading…
Reference in New Issue
Block a user