Fix, enable, and enhance backup/restore in db_stress (#7348)
Summary: Although added to db_stress, testing of backup/restore was never integrated into the crash test, originally out of concern about performance. I've enabled it now and, to address the performance concern, testing backup/restore is always skipped once the DB exceeds a certain size threshold, default 100MB. This should provide sufficient opportunity for testing BackupEngine without bogging down everything else with heavier and heavier operations. Also fixed backup/restore in db_stress by making sure PurgeOldBackups can remove manifest files, which are normally kept around for db_stress. Added more coverage of backup options, and up to three backups being saved in one backup directory (in some cases). Pull Request resolved: https://github.com/facebook/rocksdb/pull/7348 Test Plan: ran 'make blackbox_crash_test' for a while, with heightened probability of taking backups (1/10k). Also confirmed with some debug output that the code is being covered, TestBackupRestore only takes a few seconds to complete when triggered, and even at 1/10k and ~50MB database, there is at most roughly one thread testing backups at any time. Reviewed By: ajkr Differential Revision: D23510835 Pulled By: pdillinger fbshipit-source-id: b6b8735591808141f81f10773ac31634cf03b6c0
This commit is contained in:
parent
5746767387
commit
499c9448d0
@ -174,6 +174,7 @@ DECLARE_bool(use_txn);
|
||||
DECLARE_uint64(txn_write_policy);
|
||||
DECLARE_bool(unordered_write);
|
||||
DECLARE_int32(backup_one_in);
|
||||
DECLARE_uint64(backup_max_size);
|
||||
DECLARE_int32(checkpoint_one_in);
|
||||
DECLARE_int32(ingest_external_file_one_in);
|
||||
DECLARE_int32(ingest_external_file_width);
|
||||
|
@ -20,10 +20,12 @@ class DbStressEnvWrapper : public EnvWrapper {
|
||||
// We determine whether it is a manifest file by searching a string,
|
||||
// so that there will be false positive if the directory path contains the
|
||||
// keyword but it is unlikely.
|
||||
// Checkpoint directory needs to be exempted.
|
||||
// Checkpoint, backup, and restore directories need to be exempted.
|
||||
if (!if_preserve_all_manifests ||
|
||||
f.find("MANIFEST-") == std::string::npos ||
|
||||
f.find("checkpoint") != std::string::npos) {
|
||||
f.find("checkpoint") != std::string::npos ||
|
||||
f.find(".backup") != std::string::npos ||
|
||||
f.find(".restore") != std::string::npos) {
|
||||
return target()->DeleteFile(f);
|
||||
}
|
||||
return Status::OK();
|
||||
|
@ -479,6 +479,10 @@ DEFINE_int32(backup_one_in, 0,
|
||||
"every N operations on average. 0 indicates CreateNewBackup() "
|
||||
"is disabled.");
|
||||
|
||||
DEFINE_uint64(backup_max_size, 100 * 1024 * 1024,
|
||||
"If non-zero, skip checking backup/restore when DB size in "
|
||||
"bytes exceeds this setting.");
|
||||
|
||||
DEFINE_int32(checkpoint_one_in, 0,
|
||||
"If non-zero, then CreateCheckpoint() will be called once for "
|
||||
"every N operations on average. 0 indicates CreateCheckpoint() "
|
||||
|
@ -667,10 +667,23 @@ void StressTest::OperateDb(ThreadState* thread) {
|
||||
}
|
||||
|
||||
if (thread->rand.OneInOpt(FLAGS_backup_one_in)) {
|
||||
Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
|
||||
if (!s.ok()) {
|
||||
VerificationAbort(shared, "Backup/restore gave inconsistent state",
|
||||
s);
|
||||
// Beyond a certain DB size threshold, this test becomes heavier than
|
||||
// it's worth.
|
||||
uint64_t total_size = 0;
|
||||
if (FLAGS_backup_max_size > 0) {
|
||||
std::vector<FileAttributes> files;
|
||||
db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files);
|
||||
for (auto& file : files) {
|
||||
total_size += file.size_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
if (total_size <= FLAGS_backup_max_size) {
|
||||
Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
|
||||
if (!s.ok()) {
|
||||
VerificationAbort(shared, "Backup/restore gave inconsistent state",
|
||||
s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1211,6 +1224,30 @@ Status StressTest::TestBackupRestore(
|
||||
std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
|
||||
std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
|
||||
BackupableDBOptions backup_opts(backup_dir);
|
||||
// For debugging, get info_log from live options
|
||||
backup_opts.info_log = db_->GetDBOptions().info_log.get();
|
||||
assert(backup_opts.info_log);
|
||||
if (thread->rand.OneIn(2)) {
|
||||
backup_opts.file_checksum_gen_factory = options_.file_checksum_gen_factory;
|
||||
}
|
||||
if (thread->rand.OneIn(10)) {
|
||||
backup_opts.share_table_files = false;
|
||||
} else {
|
||||
backup_opts.share_table_files = true;
|
||||
if (thread->rand.OneIn(5)) {
|
||||
backup_opts.share_files_with_checksum = false;
|
||||
} else {
|
||||
backup_opts.share_files_with_checksum = true;
|
||||
if (thread->rand.OneIn(2)) {
|
||||
// old
|
||||
backup_opts.share_files_with_checksum_naming = kChecksumAndFileSize;
|
||||
} else {
|
||||
// new
|
||||
backup_opts.share_files_with_checksum_naming =
|
||||
kOptionalChecksumAndDbSessionId;
|
||||
}
|
||||
}
|
||||
}
|
||||
BackupEngine* backup_engine = nullptr;
|
||||
Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
|
||||
if (s.ok()) {
|
||||
@ -1221,12 +1258,31 @@ Status StressTest::TestBackupRestore(
|
||||
backup_engine = nullptr;
|
||||
s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
|
||||
}
|
||||
std::vector<BackupInfo> backup_info;
|
||||
if (s.ok()) {
|
||||
s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */,
|
||||
restore_dir /* wal_dir */);
|
||||
backup_engine->GetBackupInfo(&backup_info);
|
||||
if (backup_info.empty()) {
|
||||
s = Status::NotFound("no backups found");
|
||||
}
|
||||
}
|
||||
if (s.ok() && thread->rand.OneIn(2)) {
|
||||
s = backup_engine->VerifyBackup(
|
||||
backup_info.front().backup_id,
|
||||
thread->rand.OneIn(2) /* verify_with_checksum */);
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */);
|
||||
int count = static_cast<int>(backup_info.size());
|
||||
s = backup_engine->RestoreDBFromBackup(
|
||||
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
|
||||
restore_dir /* db_dir */, restore_dir /* wal_dir */);
|
||||
}
|
||||
if (s.ok()) {
|
||||
uint32_t to_keep = 0;
|
||||
if (thread->tid == 0) {
|
||||
// allow one thread to keep up to 2 backups
|
||||
to_keep = thread->rand.Uniform(3);
|
||||
}
|
||||
s = backup_engine->PurgeOldBackups(to_keep);
|
||||
}
|
||||
DB* restored_db = nullptr;
|
||||
std::vector<ColumnFamilyHandle*> restored_cf_handles;
|
||||
|
@ -29,6 +29,9 @@ expected_values_file = tempfile.NamedTemporaryFile()
|
||||
|
||||
default_params = {
|
||||
"acquire_snapshot_one_in": 10000,
|
||||
"backup_max_size": 100 * 1024 * 1024,
|
||||
# Consider larger number when backups considered more stable
|
||||
"backup_one_in": 100000,
|
||||
"block_size": 16384,
|
||||
"bloom_bits": lambda: random.choice([random.randint(0,19),
|
||||
random.lognormvariate(2.3, 1.3)]),
|
||||
|
Loading…
Reference in New Issue
Block a user