Fix, enable, and enhance backup/restore in db_stress (#7348)

Summary: Although added to db_stress, testing of backup/restore was never integrated into the crash test, originally concerned about performance. I've enabled it now and to address the peformance concern, testing backup/restore is always skipped once the db exceeds a certain size threshold, default 100MB. This should provide sufficient opportunity for testing BackupEngine without bogging down everything else with heavier and heavier operations. Also fixed backup/restore in db_stress by making sure PurgeOldBackups can remove manifest files, which are normally kept around for db_stress. Added more coverage of backup options, and up to three backups being saved in one backup directory (in some cases). Pull Request resolved: https://github.com/facebook/rocksdb/pull/7348 Test Plan: ran 'make blackbox_crash_test' for a while, with heightened probabilitly of taking backups (1/10k). Also confirmed with some debug output that the code is being covered, TestBackupRestore only takes a few seconds to complete when triggered, and even at 1/10k and ~50MB database, there's <,~ 1 thread testing backups at any time. Reviewed By: ajkr Differential Revision: D23510835 Pulled By: pdillinger fbshipit-source-id: b6b8735591808141f81f10773ac31634cf03b6c0
2020-09-03 20:11:45 -07:00 · 2020-09-03 20:11:45 -07:00 · 499c9448d0
commit 499c9448d0
parent 5746767387
5 changed files with 75 additions and 9 deletions
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@ -174,6 +174,7 @@ DECLARE_bool(use_txn);
 DECLARE_uint64(txn_write_policy);
 DECLARE_bool(unordered_write);
 DECLARE_int32(backup_one_in);
+DECLARE_uint64(backup_max_size);
 DECLARE_int32(checkpoint_one_in);
 DECLARE_int32(ingest_external_file_one_in);
 DECLARE_int32(ingest_external_file_width);
--- a/db_stress_tool/db_stress_env_wrapper.h
+++ b/db_stress_tool/db_stress_env_wrapper.h
@ -20,10 +20,12 @@ class DbStressEnvWrapper : public EnvWrapper {
    // We determine whether it is a manifest file by searching a strong,
    // so that there will be false positive if the directory path contains the
    // keyword but it is unlikely.
-    // Checkpoint directory needs to be exempted.
+    // Checkpoint, backup, and restore directories needs to be exempted.
    if (!if_preserve_all_manifests ||
        f.find("MANIFEST-") == std::string::npos ||
-        f.find("checkpoint") != std::string::npos) {
+        f.find("checkpoint") != std::string::npos ||
+        f.find(".backup") != std::string::npos ||
+        f.find(".restore") != std::string::npos) {
      return target()->DeleteFile(f);
    }
    return Status::OK();
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@ -479,6 +479,10 @@ DEFINE_int32(backup_one_in, 0,
             "every N operations on average.  0 indicates CreateNewBackup() "
             "is disabled.");

+DEFINE_uint64(backup_max_size, 100 * 1024 * 1024,
+              "If non-zero, skip checking backup/restore when DB size in "
+              "bytes exceeds this setting.");
+
 DEFINE_int32(checkpoint_one_in, 0,
             "If non-zero, then CreateCheckpoint() will be called once for "
             "every N operations on average.  0 indicates CreateCheckpoint() "
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@ -667,10 +667,23 @@ void StressTest::OperateDb(ThreadState* thread) {
      }

      if (thread->rand.OneInOpt(FLAGS_backup_one_in)) {
-        Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
-        if (!s.ok()) {
-          VerificationAbort(shared, "Backup/restore gave inconsistent state",
-                            s);
+        // Beyond a certain DB size threshold, this test becomes heavier than
+        // it's worth.
+        uint64_t total_size = 0;
+        if (FLAGS_backup_max_size > 0) {
+          std::vector<FileAttributes> files;
+          db_stress_env->GetChildrenFileAttributes(FLAGS_db, &files);
+          for (auto& file : files) {
+            total_size += file.size_bytes;
+          }
+        }
+
+        if (total_size <= FLAGS_backup_max_size) {
+          Status s = TestBackupRestore(thread, rand_column_families, rand_keys);
+          if (!s.ok()) {
+            VerificationAbort(shared, "Backup/restore gave inconsistent state",
+                              s);
+          }
        }
      }

@ -1211,6 +1224,30 @@ Status StressTest::TestBackupRestore(
  std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
  std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
  BackupableDBOptions backup_opts(backup_dir);
+  // For debugging, get info_log from live options
+  backup_opts.info_log = db_->GetDBOptions().info_log.get();
+  assert(backup_opts.info_log);
+  if (thread->rand.OneIn(2)) {
+    backup_opts.file_checksum_gen_factory = options_.file_checksum_gen_factory;
+  }
+  if (thread->rand.OneIn(10)) {
+    backup_opts.share_table_files = false;
+  } else {
+    backup_opts.share_table_files = true;
+    if (thread->rand.OneIn(5)) {
+      backup_opts.share_files_with_checksum = false;
+    } else {
+      backup_opts.share_files_with_checksum = true;
+      if (thread->rand.OneIn(2)) {
+        // old
+        backup_opts.share_files_with_checksum_naming = kChecksumAndFileSize;
+      } else {
+        // new
+        backup_opts.share_files_with_checksum_naming =
+            kOptionalChecksumAndDbSessionId;
+      }
+    }
+  }
  BackupEngine* backup_engine = nullptr;
  Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
  if (s.ok()) {
@ -1221,12 +1258,31 @@ Status StressTest::TestBackupRestore(
    backup_engine = nullptr;
    s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
  }
+  std::vector<BackupInfo> backup_info;
  if (s.ok()) {
-    s = backup_engine->RestoreDBFromLatestBackup(restore_dir /* db_dir */,
-                                                 restore_dir /* wal_dir */);
+    backup_engine->GetBackupInfo(&backup_info);
+    if (backup_info.empty()) {
+      s = Status::NotFound("no backups found");
+    }
+  }
+  if (s.ok() && thread->rand.OneIn(2)) {
+    s = backup_engine->VerifyBackup(
+        backup_info.front().backup_id,
+        thread->rand.OneIn(2) /* verify_with_checksum */);
  }
  if (s.ok()) {
-    s = backup_engine->PurgeOldBackups(0 /* num_backups_to_keep */);
+    int count = static_cast<int>(backup_info.size());
+    s = backup_engine->RestoreDBFromBackup(
+        RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
+        restore_dir /* db_dir */, restore_dir /* wal_dir */);
+  }
+  if (s.ok()) {
+    uint32_t to_keep = 0;
+    if (thread->tid == 0) {
+      // allow one thread to keep up to 2 backups
+      to_keep = thread->rand.Uniform(3);
+    }
+    s = backup_engine->PurgeOldBackups(to_keep);
  }
  DB* restored_db = nullptr;
  std::vector<ColumnFamilyHandle*> restored_cf_handles;
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@ -29,6 +29,9 @@ expected_values_file = tempfile.NamedTemporaryFile()

 default_params = {
    "acquire_snapshot_one_in": 10000,
+    "backup_max_size": 100 * 1024 * 1024,
+    # Consider larger number when backups considered more stable
+    "backup_one_in": 100000,
    "block_size": 16384,
    "bloom_bits": lambda: random.choice([random.randint(0,19),
                                         random.lognormvariate(2.3, 1.3)]),