Fix cf_consistency_stress for backup/restore, harmonize (#7373)
Summary: We can only check key on restored backup if in a stress test configuration locking the key. (Fixes mismatch seen in backup/restore with atomic flush.) TestCheckpoint used a very ugly solution to the same problem: copy-paste dozens of lines of code with some changes and removals. I removed the unnecessary implementation and made the existing one simply adaptive, like TestBackupRestore. Also made TestBackupRestore clean up dead backup/restore directories on success. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7373 Test Plan: blackbox_crash_test_with_atomic_flush for a while, blackbox_crash_test for a while, with backup and checkpoint 1 in 5k and only 1k max_keys to stress this area Reviewed By: ajkr Differential Revision: D23629057 Pulled By: pdillinger fbshipit-source-id: d7fe7e2be75aaf3cf974be9540a7c5c5de8b371b
This commit is contained in:
parent
92639b93a6
commit
c4e2066dbd
@ -283,96 +283,6 @@ class CfConsistencyStressTest : public StressTest {
|
|||||||
return column_families_[thread->rand.Next() % column_families_.size()];
|
return column_families_[thread->rand.Next() % column_families_.size()];
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ROCKSDB_LITE
|
|
||||||
Status TestCheckpoint(ThreadState* /* thread */,
|
|
||||||
const std::vector<int>& /* rand_column_families */,
|
|
||||||
const std::vector<int64_t>& /* rand_keys */) override {
|
|
||||||
assert(false);
|
|
||||||
fprintf(stderr,
|
|
||||||
"RocksDB lite does not support "
|
|
||||||
"TestCheckpoint\n");
|
|
||||||
std::terminate();
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
Status TestCheckpoint(ThreadState* thread,
|
|
||||||
const std::vector<int>& /* rand_column_families */,
|
|
||||||
const std::vector<int64_t>& /* rand_keys */) override {
|
|
||||||
std::string checkpoint_dir =
|
|
||||||
FLAGS_db + "/.checkpoint" + ToString(thread->tid);
|
|
||||||
|
|
||||||
// We need to clear DB including manifest files, so make a copy
|
|
||||||
Options opt_copy = options_;
|
|
||||||
opt_copy.env = db_stress_env->target();
|
|
||||||
DestroyDB(checkpoint_dir, opt_copy);
|
|
||||||
|
|
||||||
if (db_stress_env->FileExists(checkpoint_dir).ok()) {
|
|
||||||
// If the directory might still exist, try to delete the files one by one.
|
|
||||||
// Likely a trash file is still there.
|
|
||||||
Status my_s = DestroyDir(db_stress_env, checkpoint_dir);
|
|
||||||
if (!my_s.ok()) {
|
|
||||||
fprintf(stderr, "Fail to destory directory before checkpoint: %s",
|
|
||||||
my_s.ToString().c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Checkpoint* checkpoint = nullptr;
|
|
||||||
Status s = Checkpoint::Create(db_, &checkpoint);
|
|
||||||
if (s.ok()) {
|
|
||||||
s = checkpoint->CreateCheckpoint(checkpoint_dir);
|
|
||||||
if (!s.ok()) {
|
|
||||||
fprintf(stderr, "Fail to create checkpoint to %s\n",
|
|
||||||
checkpoint_dir.c_str());
|
|
||||||
std::vector<std::string> files;
|
|
||||||
Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files);
|
|
||||||
if (my_s.ok()) {
|
|
||||||
for (const auto& f : files) {
|
|
||||||
fprintf(stderr, " %s\n", f.c_str());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "Fail to get files under the directory to %s\n",
|
|
||||||
my_s.ToString().c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::vector<ColumnFamilyHandle*> cf_handles;
|
|
||||||
DB* checkpoint_db = nullptr;
|
|
||||||
if (s.ok()) {
|
|
||||||
delete checkpoint;
|
|
||||||
checkpoint = nullptr;
|
|
||||||
Options options(options_);
|
|
||||||
options.listeners.clear();
|
|
||||||
std::vector<ColumnFamilyDescriptor> cf_descs;
|
|
||||||
// TODO(ajkr): `column_family_names_` is not safe to access here when
|
|
||||||
// `clear_column_family_one_in != 0`. But we can't easily switch to
|
|
||||||
// `ListColumnFamilies` to get names because it won't necessarily give
|
|
||||||
// the same order as `column_family_names_`.
|
|
||||||
if (FLAGS_clear_column_family_one_in == 0) {
|
|
||||||
for (const auto& name : column_family_names_) {
|
|
||||||
cf_descs.emplace_back(name, ColumnFamilyOptions(options));
|
|
||||||
}
|
|
||||||
s = DB::OpenForReadOnly(DBOptions(options), checkpoint_dir, cf_descs,
|
|
||||||
&cf_handles, &checkpoint_db);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (checkpoint_db != nullptr) {
|
|
||||||
for (auto cfh : cf_handles) {
|
|
||||||
delete cfh;
|
|
||||||
}
|
|
||||||
cf_handles.clear();
|
|
||||||
delete checkpoint_db;
|
|
||||||
checkpoint_db = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!s.ok()) {
|
|
||||||
fprintf(stderr, "A checkpoint operation failed with: %s\n",
|
|
||||||
s.ToString().c_str());
|
|
||||||
} else {
|
|
||||||
DestroyDB(checkpoint_dir, opt_copy);
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
#endif // !ROCKSDB_LITE
|
|
||||||
|
|
||||||
void VerifyDb(ThreadState* thread) const override {
|
void VerifyDb(ThreadState* thread) const override {
|
||||||
ReadOptions options(FLAGS_verify_checksum, true);
|
ReadOptions options(FLAGS_verify_checksum, true);
|
||||||
// We must set total_order_seek to true because we are doing a SeekToFirst
|
// We must set total_order_seek to true because we are doing a SeekToFirst
|
||||||
|
@ -1280,6 +1280,7 @@ Status StressTest::TestBackupRestore(
|
|||||||
from = "BackupEngine::VerifyBackup";
|
from = "BackupEngine::VerifyBackup";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
const bool allow_persistent = thread->tid == 0; // not too many
|
||||||
bool from_latest = false;
|
bool from_latest = false;
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
int count = static_cast<int>(backup_info.size());
|
int count = static_cast<int>(backup_info.size());
|
||||||
@ -1302,7 +1303,7 @@ Status StressTest::TestBackupRestore(
|
|||||||
}
|
}
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
uint32_t to_keep = 0;
|
uint32_t to_keep = 0;
|
||||||
if (thread->tid == 0) {
|
if (allow_persistent) {
|
||||||
// allow one thread to keep up to 2 backups
|
// allow one thread to keep up to 2 backups
|
||||||
to_keep = thread->rand.Uniform(3);
|
to_keep = thread->rand.Uniform(3);
|
||||||
}
|
}
|
||||||
@ -1338,8 +1339,7 @@ Status StressTest::TestBackupRestore(
|
|||||||
//
|
//
|
||||||
// For simplicity, currently only verifies existence/non-existence of a
|
// For simplicity, currently only verifies existence/non-existence of a
|
||||||
// single key
|
// single key
|
||||||
for (size_t i = 0;
|
for (size_t i = 0; restored_db && s.ok() && i < rand_column_families.size();
|
||||||
from_latest && restored_db && s.ok() && i < rand_column_families.size();
|
|
||||||
++i) {
|
++i) {
|
||||||
std::string key_str = Key(rand_keys[0]);
|
std::string key_str = Key(rand_keys[0]);
|
||||||
Slice key = key_str;
|
Slice key = key_str;
|
||||||
@ -1349,11 +1349,11 @@ Status StressTest::TestBackupRestore(
|
|||||||
&restored_value);
|
&restored_value);
|
||||||
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
|
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
|
||||||
if (get_status.ok()) {
|
if (get_status.ok()) {
|
||||||
if (!exists) {
|
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
|
||||||
s = Status::Corruption("key exists in restore but not in original db");
|
s = Status::Corruption("key exists in restore but not in original db");
|
||||||
}
|
}
|
||||||
} else if (get_status.IsNotFound()) {
|
} else if (get_status.IsNotFound()) {
|
||||||
if (exists) {
|
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
|
||||||
s = Status::Corruption("key exists in original db but not in restore");
|
s = Status::Corruption("key exists in original db but not in restore");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -1374,6 +1374,21 @@ Status StressTest::TestBackupRestore(
|
|||||||
delete restored_db;
|
delete restored_db;
|
||||||
restored_db = nullptr;
|
restored_db = nullptr;
|
||||||
}
|
}
|
||||||
|
if (s.ok()) {
|
||||||
|
// Preserve directories on failure, or allowed persistent backup
|
||||||
|
if (!allow_persistent) {
|
||||||
|
s = DestroyDir(db_stress_env, backup_dir);
|
||||||
|
if (!s.ok()) {
|
||||||
|
from = "Destroy backup dir";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (s.ok()) {
|
||||||
|
s = DestroyDir(db_stress_env, restore_dir);
|
||||||
|
if (!s.ok()) {
|
||||||
|
from = "Destroy restore dir";
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
|
fprintf(stderr, "Failure in %s with: %s\n", from.c_str(),
|
||||||
s.ToString().c_str());
|
s.ToString().c_str());
|
||||||
@ -1426,10 +1441,6 @@ Status StressTest::TestApproximateSize(
|
|||||||
Status StressTest::TestCheckpoint(ThreadState* thread,
|
Status StressTest::TestCheckpoint(ThreadState* thread,
|
||||||
const std::vector<int>& rand_column_families,
|
const std::vector<int>& rand_column_families,
|
||||||
const std::vector<int64_t>& rand_keys) {
|
const std::vector<int64_t>& rand_keys) {
|
||||||
// Note the column families chosen by `rand_column_families` cannot be
|
|
||||||
// dropped while the locks for `rand_keys` are held. So we should not have
|
|
||||||
// to worry about accessing those column families throughout this function.
|
|
||||||
assert(rand_column_families.size() == rand_keys.size());
|
|
||||||
std::string checkpoint_dir =
|
std::string checkpoint_dir =
|
||||||
FLAGS_db + "/.checkpoint" + ToString(thread->tid);
|
FLAGS_db + "/.checkpoint" + ToString(thread->tid);
|
||||||
Options tmp_opts(options_);
|
Options tmp_opts(options_);
|
||||||
@ -1479,6 +1490,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
|
|||||||
// `clear_column_family_one_in != 0`. But we can't easily switch to
|
// `clear_column_family_one_in != 0`. But we can't easily switch to
|
||||||
// `ListColumnFamilies` to get names because it won't necessarily give
|
// `ListColumnFamilies` to get names because it won't necessarily give
|
||||||
// the same order as `column_family_names_`.
|
// the same order as `column_family_names_`.
|
||||||
|
assert(FLAGS_clear_column_family_one_in == 0);
|
||||||
if (FLAGS_clear_column_family_one_in == 0) {
|
if (FLAGS_clear_column_family_one_in == 0) {
|
||||||
for (const auto& name : column_family_names_) {
|
for (const auto& name : column_family_names_) {
|
||||||
cf_descs.emplace_back(name, ColumnFamilyOptions(options));
|
cf_descs.emplace_back(name, ColumnFamilyOptions(options));
|
||||||
@ -1488,21 +1500,24 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (checkpoint_db != nullptr) {
|
if (checkpoint_db != nullptr) {
|
||||||
|
// Note the column families chosen by `rand_column_families` cannot be
|
||||||
|
// dropped while the locks for `rand_keys` are held. So we should not have
|
||||||
|
// to worry about accessing those column families throughout this function.
|
||||||
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
|
for (size_t i = 0; s.ok() && i < rand_column_families.size(); ++i) {
|
||||||
std::string key_str = Key(rand_keys[i]);
|
std::string key_str = Key(rand_keys[0]);
|
||||||
Slice key = key_str;
|
Slice key = key_str;
|
||||||
std::string value;
|
std::string value;
|
||||||
Status get_status = checkpoint_db->Get(
|
Status get_status = checkpoint_db->Get(
|
||||||
ReadOptions(), cf_handles[rand_column_families[i]], key, &value);
|
ReadOptions(), cf_handles[rand_column_families[i]], key, &value);
|
||||||
bool exists =
|
bool exists =
|
||||||
thread->shared->Exists(rand_column_families[i], rand_keys[i]);
|
thread->shared->Exists(rand_column_families[i], rand_keys[0]);
|
||||||
if (get_status.ok()) {
|
if (get_status.ok()) {
|
||||||
if (!exists) {
|
if (!exists && ShouldAcquireMutexOnKey()) {
|
||||||
s = Status::Corruption(
|
s = Status::Corruption(
|
||||||
"key exists in checkpoint but not in original db");
|
"key exists in checkpoint but not in original db");
|
||||||
}
|
}
|
||||||
} else if (get_status.IsNotFound()) {
|
} else if (get_status.IsNotFound()) {
|
||||||
if (exists) {
|
if (exists && ShouldAcquireMutexOnKey()) {
|
||||||
s = Status::Corruption(
|
s = Status::Corruption(
|
||||||
"key exists in original db but not in checkpoint");
|
"key exists in original db but not in checkpoint");
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user