Fix, enable, and enhance backup/restore in db_stress (#7348)

Summary:
Although added to db_stress, testing of backup/restore
was never integrated into the crash test, originally concerned about
performance. I've enabled it now and to address the peformance concern,
testing backup/restore is always skipped once the db exceeds a certain
size threshold, default 100MB. This should provide sufficient
opportunity for testing BackupEngine without bogging down everything
else with heavier and heavier operations.

Also fixed backup/restore in db_stress by making sure PurgeOldBackups
can remove manifest files, which are normally kept around for db_stress.

Added more coverage of backup options, and up to three backups being
saved in one backup directory (in some cases).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7348

Test Plan:
ran 'make blackbox_crash_test' for a while, with heightened
probabilitly of taking backups (1/10k). Also confirmed with some debug
output that the code is being covered, TestBackupRestore only takes
a few seconds to complete when triggered, and even at 1/10k and ~50MB
database, there's <,~ 1 thread testing backups at any time.

Reviewed By: ajkr

Differential Revision: D23510835

Pulled By: pdillinger

fbshipit-source-id: b6b8735591808141f81f10773ac31634cf03b6c0
This commit is contained in:
Peter Dillinger 2020-09-03 20:11:45 -07:00 committed by Facebook GitHub Bot
parent 5746767387
commit 499c9448d0
5 changed files with 75 additions and 9 deletions

View File

@ -174,6 +174,7 @@ DECLARE_bool(use_txn);
DECLARE_uint64(txn_write_policy);
DECLARE_bool(unordered_write);
DECLARE_int32(backup_one_in);
DECLARE_uint64(backup_max_size);
DECLARE_int32(checkpoint_one_in);
DECLARE_int32(ingest_external_file_one_in);
DECLARE_int32(ingest_external_file_width);

View File

@ -20,10 +20,12 @@ class DbStressEnvWrapper : public EnvWrapper {
// We determine whether it is a manifest file by searching a strong,
// so that there will be false positive if the directory path contains the
// keyword but it is unlikely.
// Checkpoint directory needs to be exempted.
// Checkpoint, backup, and restore directories needs to be exempted.
if (!if_preserve_all_manifests ||
f.find("MANIFEST-") == std::string::npos ||
f.find("checkpoint") != std::string::npos) {
f.find("checkpoint") != std::string::npos ||
f.find(".backup") != std::string::npos ||
f.find(".restore") != std::string::npos) {
return target()->DeleteFile(f);
}
return Status::OK();

View File

@ -479,6 +479,10 @@ DEFINE_int32(backup_one_in, 0,
"every N operations on average. 0 indicates CreateNewBackup() "
"is disabled.");
DEFINE_uint64(backup_max_size, 100 * 1024 * 1024,
"If non-zero, skip checking backup/restore when DB size in "
"bytes exceeds this setting.");
DEFINE_int32(checkpoint_one_in, 0,
"If non-zero, then CreateCheckpoint() will be called once for "
"every N operations on average. 0 indicates CreateCheckpoint() "

View File

@ -667,10 +667,23 @@ void StressTest::OperateDb(ThreadState* thread) {
}
if (thread->rand.OneInOpt(FLAGS_backup_one_in)) {
Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
if (!s.ok()) {
VerificationAbort(shared, "Backup/restore gave inconsistent state",
s);
// Beyond a certain DB size threshold, this test becomes heavier than
// it's worth.
uint64_t total_size = 0;
if (FLAGS_backup_max_size > 0) {
std::vector<FileAttributes> files;
db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files);
for (auto& file : files) {
total_size += file.size_bytes;
}
}
if (total_size <= FLAGS_backup_max_size) {
Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
if (!s.ok()) {
VerificationAbort(shared, "Backup/restore gave inconsistent state",
s);
}
}
}
@ -1211,6 +1224,30 @@ Status StressTest::TestBackupRestore(
std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
BackupableDBOptions backup_opts(backup_dir);
// For debugging, get info_log from live options
backup_opts.info_log = db_->GetDBOptions().info_log.get();
assert(backup_opts.info_log);
if (thread->rand.OneIn(2)) {
backup_opts.file_checksum_gen_factory = options_.file_checksum_gen_factory;
}
if (thread->rand.OneIn(10)) {
backup_opts.share_table_files = false;
} else {
backup_opts.share_table_files = true;
if (thread->rand.OneIn(5)) {
backup_opts.share_files_with_checksum = false;
} else {
backup_opts.share_files_with_checksum = true;
if (thread->rand.OneIn(2)) {
// old
backup_opts.share_files_with_checksum_naming = kChecksumAndFileSize;
} else {
// new
backup_opts.share_files_with_checksum_naming =
kOptionalChecksumAndDbSessionId;
}
}
}
BackupEngine* backup_engine = nullptr;
Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
if (s.ok()) {
@ -1221,12 +1258,31 @@ Status StressTest::TestBackupRestore(
backup_engine = nullptr;
s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
}
std::vector<BackupInfo> backup_info;
if (s.ok()) {
s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */,
restore_dir /* wal_dir */);
backup_engine->GetBackupInfo(&backup_info);
if (backup_info.empty()) {
s = Status::NotFound("no backups found");
}
}
if (s.ok() && thread->rand.OneIn(2)) {
s = backup_engine->VerifyBackup(
backup_info.front().backup_id,
thread->rand.OneIn(2) /* verify_with_checksum */);
}
if (s.ok()) {
s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */);
int count = static_cast<int>(backup_info.size());
s = backup_engine->RestoreDBFromBackup(
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
restore_dir /* db_dir */, restore_dir /* wal_dir */);
}
if (s.ok()) {
uint32_t to_keep = 0;
if (thread->tid == 0) {
// allow one thread to keep up to 2 backups
to_keep = thread->rand.Uniform(3);
}
s = backup_engine->PurgeOldBackups(to_keep);
}
DB* restored_db = nullptr;
std::vector<ColumnFamilyHandle*> restored_cf_handles;

View File

@ -29,6 +29,9 @@ expected_values_file = tempfile.NamedTemporaryFile()
default_params = {
"acquire_snapshot_one_in": 10000,
"backup_max_size": 100 * 1024 * 1024,
# Consider larger number when backups considered more stable
"backup_one_in": 100000,
"block_size": 16384,
"bloom_bits": lambda: random.choice([random.randint(0,19),
random.lognormvariate(2.3, 1.3)]),