Fix cf_consistency_stress for backup/restore, harmonize (#7373)
Summary: We can only check key on restored backup if in a stress test configuration locking the key. (Fixes mismatch seen in backup/restore with atomic flush.) TestCheckpoint used a very ugly solution to the same problem: copy-paste dozens of lines of code with some changes and removals. I removed the unnecessary implementation and made the existing one simply adaptive, like TestBackupRestore. Also made TestBackupRestore clean up dead backup/restore directories on success. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7373 Test Plan: blackbox_crash_test_with_atomic_flush for a while, blackbox_crash_test for a while, with backup and checkpoint 1 in 5k and only 1k max_keys to stress this area Reviewed By: ajkr Differential Revision: D23629057 Pulled By: pdillinger fbshipit-source-id: d7fe7e2be75aaf3cf974be9540a7c5c5de8b371b
This commit is contained in:
parent
92639b93a6
commit
c4e2066dbd
@ -283,96 +283,6 @@ class CfConsistencyStressTest : public StressTest {
|
||||
return column_families_[thread->rand.Next() % column_families_.size()];
|
||||
}
|
||||
|
||||
#ifdef ROCKSDB_LITE
|
||||
Status TestCheckpoint(ThreadState* /* thread */,
|
||||
const std::vector<int>& /* rand_column_families */,
|
||||
const std::vector<int64_t>& /* rand_keys */) override {
|
||||
assert(false);
|
||||
fprintf(stderr,
|
||||
"RocksDB lite does not support "
|
||||
"TestCheckpoint\n");
|
||||
std::terminate();
|
||||
}
|
||||
#else
|
||||
Status TestCheckpoint(ThreadState* thread,
|
||||
const std::vector<int>& /* rand_column_families */,
|
||||
const std::vector<int64_t>& /* rand_keys */) override {
|
||||
std::string checkpoint_dir =
|
||||
FLAGS_db + "/.checkpoint" + ToString(thread->tid);
|
||||
|
||||
// We need to clear DB including manifest files, so make a copy
|
||||
Options opt_copy = options_;
|
||||
opt_copy.env = db_stress_env->target();
|
||||
DestroyDB(checkpoint_dir, opt_copy);
|
||||
|
||||
if (db_stress_env->FileExists(checkpoint_dir).ok()) {
|
||||
// If the directory might still exist, try to delete the files one by one.
|
||||
// Likely a trash file is still there.
|
||||
Status my_s = DestroyDir(db_stress_env, checkpoint_dir);
|
||||
if (!my_s.ok()) {
|
||||
fprintf(stderr, "Fail to destory directory before checkpoint: %s",
|
||||
my_s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
Checkpoint* checkpoint = nullptr;
|
||||
Status s = Checkpoint::Create(db_, &checkpoint);
|
||||
if (s.ok()) {
|
||||
s = checkpoint->CreateCheckpoint(checkpoint_dir);
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "Fail to create checkpoint to %s\n",
|
||||
checkpoint_dir.c_str());
|
||||
std::vector<std::string> files;
|
||||
Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files);
|
||||
if (my_s.ok()) {
|
||||
for (const auto& f : files) {
|
||||
fprintf(stderr, " %s\n", f.c_str());
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Fail to get files under the directory to %s\n",
|
||||
my_s.ToString().c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<ColumnFamilyHandle*> cf_handles;
|
||||
DB* checkpoint_db = nullptr;
|
||||
if (s.ok()) {
|
||||
delete checkpoint;
|
||||
checkpoint = nullptr;
|
||||
Options options(options_);
|
||||
options.listeners.clear();
|
||||
std::vector<ColumnFamilyDescriptor> cf_descs;
|
||||
// TODO(ajkr): `column_family_names_` is not safe to access here when
|
||||
// `clear_column_family_one_in != 0`. But we can't easily switch to
|
||||
// `ListColumnFamilies` to get names because it won't necessarily give
|
||||
// the same order as `column_family_names_`.
|
||||
if (FLAGS_clear_column_family_one_in == 0) {
|
||||
for (const auto& name : column_family_names_) {
|
||||
cf_descs.emplace_back(name, ColumnFamilyOptions(options));
|
||||
}
|
||||
s = DB::OpenForReadOnly(DBOptions(options), checkpoint_dir, cf_descs,
|
||||
&cf_handles, &checkpoint_db);
|
||||
}
|
||||
}
|
||||
if (checkpoint_db != nullptr) {
|
||||
for (auto cfh : cf_handles) {
|
||||
delete cfh;
|
||||
}
|
||||
cf_handles.clear();
|
||||
delete checkpoint_db;
|
||||
checkpoint_db = nullptr;
|
||||
}
|
||||
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "A checkpoint operation failed with: %s\n",
|
||||
s.ToString().c_str());
|
||||
} else {
|
||||
DestroyDB(checkpoint_dir, opt_copy);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
#endif // !ROCKSDB_LITE
|
||||
|
||||
void VerifyDb(ThreadState* thread) const override {
|
||||
ReadOptions options(FLAGS_verify_checksum, true);
|
||||
// We must set total_order_seek to true because we are doing a SeekToFirst
|
||||
|
@ -1280,6 +1280,7 @@ Status StressTest::TestBackupRestore(
|
||||
from = "BackupEngine::VerifyBackup";
|
||||
}
|
||||
}
|
||||
const bool allow_persistent = thread->tid == 0; // not too many
|
||||
bool from_latest = false;
|
||||
if (s.ok()) {
|
||||
int count = static_cast<int>(backup_info.size());
|
||||
@ -1302,7 +1303,7 @@ Status StressTest::TestBackupRestore(
|
||||
}
|
||||
if (s.ok()) {
|
||||
uint32_t to_keep = 0;
|
||||
if (thread->tid == 0) {
|
||||
if (allow_persistent) {
|
||||
// allow one thread to keep up to 2 backups
|
||||
to_keep = thread->rand.Uniform(3);
|
||||
}
|
||||
@ -1338,8 +1339,7 @@ Status StressTest::TestBackupRestore(
|
||||
//
|
||||
// For simplicity, currently only verifies existence/non-existence of a
|
||||
// single key
|
||||
for (size_t i = 0;
|
||||
from_latest && restored_db && s.ok() && i < rand_column_families.size();
|
||||
for (size_t i = 0; restored_db && s.ok() && i < rand_column_families.size();
|
||||
++i) {
|
||||
std::string key_str = Key(rand_keys[0]);
|
||||
Slice key = key_str;
|
||||
@ -1349,11 +1349,11 @@ Status StressTest::TestBackupRestore(
|
||||
&restored_value);
|
||||
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
|
||||
if (get_status.ok()) {
|
||||
if (!exists) {
|
||||
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
|
||||
s = Status::Corruption("key exists in restore but not in original db");
|
||||
}
|
||||
} else if (get_status.IsNotFound()) {
|
||||
if (exists) {
|
||||
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
|
||||
s = Status::Corruption("key exists in original db but not in restore");
|
||||
}
|
||||
} else {
|
||||
@ -1374,6 +1374,21 @@ Status StressTest::TestBackupRestore(
|
||||
delete restored_db;
|
||||
restored_db = nullptr;
|
||||
}
|
||||
if (s.ok()) {
|
||||
// Preserve directories on failure, or allowed persistent backup
|
||||
if (!allow_persistent) {
|
||||
s = DestroyDir(db_stress_env, backup_dir);
|
||||
if (!s.ok()) {
|
||||
from = "Destroy backup dir";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = DestroyDir(db_stress_env, restore_dir);
|
||||
if (!s.ok()) {
|
||||
from = "Destroy restore dir";
|
||||
}
|
||||
}
|
||||
if (!s.ok()) {
|
||||
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
|
||||
s.ToString().c_str());
|
||||
@ -1426,10 +1441,6 @@ Status StressTest::TestApproximateSize(
|
||||
Status StressTest::TestCheckpoint(ThreadState* thread,
|
||||
const std::vector<int>& rand_column_families,
|
||||
const std::vector<int64_t>& rand_keys) {
|
||||
// Note the column families chosen by `rand_column_families` cannot be
|
||||
// dropped while the locks for `rand_keys` are held. So we should not have
|
||||
// to worry about accessing those column families throughout this function.
|
||||
assert(rand_column_families.size() == rand_keys.size());
|
||||
std::string checkpoint_dir =
|
||||
FLAGS_db + "/.checkpoint" + ToString(thread->tid);
|
||||
Options tmp_opts(options_);
|
||||
@ -1479,6 +1490,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
|
||||
// `clear_column_family_one_in != 0`. But we can't easily switch to
|
||||
// `ListColumnFamilies` to get names because it won't necessarily give
|
||||
// the same order as `column_family_names_`.
|
||||
assert(FLAGS_clear_column_family_one_in == 0);
|
||||
if (FLAGS_clear_column_family_one_in == 0) {
|
||||
for (const auto& name : column_family_names_) {
|
||||
cf_descs.emplace_back(name, ColumnFamilyOptions(options));
|
||||
@ -1488,21 +1500,24 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
|
||||
}
|
||||
}
|
||||
if (checkpoint_db != nullptr) {
|
||||
// Note the column families chosen by `rand_column_families` cannot be
|
||||
// dropped while the locks for `rand_keys` are held. So we should not have
|
||||
// to worry about accessing those column families throughout this function.
|
||||
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
|
||||
std::string key_str = Key(rand_keys[i]);
|
||||
std::string key_str = Key(rand_keys[0]);
|
||||
Slice key = key_str;
|
||||
std::string value;
|
||||
Status get_status = checkpoint_db->Get(
|
||||
ReadOptions(), cf_handles[rand_column_families[i]], key, &value);
|
||||
bool exists =
|
||||
thread->shared->Exists(rand_column_families[i], rand_keys[i]);
|
||||
thread->shared->Exists(rand_column_families[i], rand_keys[0]);
|
||||
if (get_status.ok()) {
|
||||
if (!exists) {
|
||||
if (!exists && ShouldAcquireMutexOnKey()) {
|
||||
s = Status::Corruption(
|
||||
"key exists in checkpoint but not in original db");
|
||||
}
|
||||
} else if (get_status.IsNotFound()) {
|
||||
if (exists) {
|
||||
if (exists && ShouldAcquireMutexOnKey()) {
|
||||
s = Status::Corruption(
|
||||
"key exists in original db but not in checkpoint");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user