Fix backup/restore in stress/crash test (#7357)
Summary: (1) Skip check on specific key if restoring an old backup (small minority of cases) because it can fail in those cases. (2) Remove an old assertion about number of column families and number of keys passed in, which is broken by atomic flush (cf_consistency) test. Like other code (for better or worse) assume a single key and iterate over column families. (3) Apply mock_direct_io to NewSequentialFile so that db_stress backup works on /dev/shm. Also add more context to output in case of backup/restore db_stress failure. Also a minor fix to BackupEngine to report first failure status in creating new backup, and drop another clue about the potential source of a "Backup failed" status. Reverts "Disable backup/restore stress test (https://github.com/facebook/rocksdb/issues/7350)" Pull Request resolved: https://github.com/facebook/rocksdb/pull/7357 Test Plan: Using backup_one_in=10000, "USE_CLANG=1 make crash_test_with_atomic_flush" for 30+ minutes "USE_CLANG=1 make blackbox_crash_test" for 30+ minutes And with use_direct_reads with TEST_TMPDIR=/dev/shm/rocksdb Reviewed By: riversand963 Differential Revision: D23567244 Pulled By: pdillinger fbshipit-source-id: e77171c2e8394d173917e36898c02dead1c40b77
This commit is contained in:
parent
423d051124
commit
4e258d3e63
@ -1217,10 +1217,6 @@ void StressTest::TestCompactFiles(ThreadState* /* thread */,
|
||||
Status StressTest::TestBackupRestore(
|
||||
ThreadState* thread, const std::vector<int>& rand_column_families,
|
||||
const std::vector<int64_t>& rand_keys) {
|
||||
// Note the column families chosen by `rand_column_families` cannot be
|
||||
// dropped while the locks for `rand_keys` are held. So we should not have
|
||||
// to worry about accessing those column families throughout this function.
|
||||
assert(rand_column_families.size() == rand_keys.size());
|
||||
std::string backup_dir = FLAGS_db + "/.backup" + ToString(thread->tid);
|
||||
std::string restore_dir = FLAGS_db + "/.restore" + ToString(thread->tid);
|
||||
BackupableDBOptions backup_opts(backup_dir);
|
||||
@ -1249,32 +1245,60 @@ Status StressTest::TestBackupRestore(
|
||||
}
|
||||
}
|
||||
BackupEngine* backup_engine = nullptr;
|
||||
std::string from = "a backup/restore operation";
|
||||
Status s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::Open";
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = backup_engine->CreateNewBackup(db_);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::CreateNewBackup";
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
delete backup_engine;
|
||||
backup_engine = nullptr;
|
||||
s = BackupEngine::Open(db_stress_env, backup_opts, &backup_engine);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::Open (again)";
|
||||
}
|
||||
}
|
||||
std::vector<BackupInfo> backup_info;
|
||||
if (s.ok()) {
|
||||
backup_engine->GetBackupInfo(&backup_info);
|
||||
if (backup_info.empty()) {
|
||||
s = Status::NotFound("no backups found");
|
||||
from = "BackupEngine::GetBackupInfo";
|
||||
}
|
||||
}
|
||||
if (s.ok() && thread->rand.OneIn(2)) {
|
||||
s = backup_engine->VerifyBackup(
|
||||
backup_info.front().backup_id,
|
||||
thread->rand.OneIn(2) /* verify_with_checksum */);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::VerifyBackup";
|
||||
}
|
||||
}
|
||||
bool from_latest = false;
|
||||
if (s.ok()) {
|
||||
int count = static_cast<int>(backup_info.size());
|
||||
s = backup_engine->RestoreDBFromBackup(
|
||||
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
|
||||
restore_dir /* db_dir */, restore_dir /* wal_dir */);
|
||||
if (count > 1) {
|
||||
s = backup_engine->RestoreDBFromBackup(
|
||||
RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id,
|
||||
restore_dir /* db_dir */, restore_dir /* wal_dir */);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::RestoreDBFromBackup";
|
||||
}
|
||||
} else {
|
||||
from_latest = true;
|
||||
s = backup_engine->RestoreDBFromLatestBackup(RestoreOptions(),
|
||||
restore_dir /* db_dir */,
|
||||
restore_dir /* wal_dir */);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::RestoreDBFromLatestBackup";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
uint32_t to_keep = 0;
|
||||
@ -1283,6 +1307,9 @@ Status StressTest::TestBackupRestore(
|
||||
to_keep = thread->rand.Uniform(3);
|
||||
}
|
||||
s = backup_engine->PurgeOldBackups(to_keep);
|
||||
if (!s.ok()) {
|
||||
from = "BackupEngine::PurgeOldBackups";
|
||||
}
|
||||
}
|
||||
DB* restored_db = nullptr;
|
||||
std::vector<ColumnFamilyHandle*> restored_cf_handles;
|
||||
@ -1300,11 +1327,19 @@ Status StressTest::TestBackupRestore(
|
||||
}
|
||||
s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors,
|
||||
&restored_cf_handles, &restored_db);
|
||||
if (!s.ok()) {
|
||||
from = "DB::Open in backup/restore";
|
||||
}
|
||||
}
|
||||
// for simplicity, currently only verifies existence/non-existence of a few
|
||||
// keys
|
||||
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
|
||||
std::string key_str = Key(rand_keys[i]);
|
||||
// Note the column families chosen by `rand_column_families` cannot be
|
||||
// dropped while the locks for `rand_keys` are held. So we should not have
|
||||
// to worry about accessing those column families throughout this function.
|
||||
//
|
||||
// For simplicity, currently only verifies existence/non-existence of a
|
||||
// single key
|
||||
for (size_t i = 0; from_latest && s.ok() && i < rand_column_families.size();
|
||||
++i) {
|
||||
std::string key_str = Key(rand_keys[0]);
|
||||
Slice key = key_str;
|
||||
std::string restored_value;
|
||||
Status get_status = restored_db->Get(
|
||||
@ -1321,6 +1356,9 @@ Status StressTest::TestBackupRestore(
|
||||
}
|
||||
} else {
|
||||
s = get_status;
|
||||
if (!s.ok()) {
|
||||
from = "DB::Get in backup/restore";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (backup_engine != nullptr) {
|
||||
@ -1335,7 +1373,7 @@ Status StressTest::TestBackupRestore(
|
||||
restored_db = nullptr;
|
||||
}
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "A backup/restore operation failed with: %s\n",
|
||||
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
|
||||
s.ToString().c_str());
|
||||
}
|
||||
return s;
|
||||
|
1
env/fs_posix.cc
vendored
1
env/fs_posix.cc
vendored
@ -159,6 +159,7 @@ class PosixFileSystem : public FileSystem {
|
||||
#endif // !ROCKSDB_LITE
|
||||
#if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
|
||||
flags |= O_DIRECT;
|
||||
TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -83,6 +83,11 @@ void SetupSyncPointsToMockDirectIO() {
|
||||
int* val = static_cast<int*>(arg);
|
||||
*val &= ~O_DIRECT;
|
||||
});
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"NewSequentialFile:O_DIRECT", [&](void* arg) {
|
||||
int* val = static_cast<int*>(arg);
|
||||
*val &= ~O_DIRECT;
|
||||
});
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
||||
#endif
|
||||
}
|
||||
|
@ -29,6 +29,9 @@ expected_values_file = tempfile.NamedTemporaryFile()
|
||||
|
||||
default_params = {
|
||||
"acquire_snapshot_one_in": 10000,
|
||||
"backup_max_size": 100 * 1024 * 1024,
|
||||
# Consider larger number when backups considered more stable
|
||||
"backup_one_in": 100000,
|
||||
"block_size": 16384,
|
||||
"bloom_bits": lambda: random.choice([random.randint(0,19),
|
||||
random.lognormvariate(2.3, 1.3)]),
|
||||
|
@ -1182,7 +1182,9 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
|
||||
new_backup->SetSequenceNumber(sequence_number);
|
||||
}
|
||||
}
|
||||
ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish.");
|
||||
ROCKS_LOG_INFO(options_.info_log,
|
||||
"add files for backup done (%s), wait finish.",
|
||||
s.ok() ? "OK" : "not OK");
|
||||
Status item_status;
|
||||
for (auto& item : backup_items_to_finish) {
|
||||
item.result.wait();
|
||||
@ -1198,7 +1200,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
|
||||
result.custom_checksum_hex, result.checksum_func_name, result.db_id,
|
||||
result.db_session_id));
|
||||
}
|
||||
if (!item_status.ok()) {
|
||||
if (s.ok() && !item_status.ok()) {
|
||||
s = item_status;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user