Inject fatal write failures to db_stress when DB is running (#8479)
Summary: Add the injest_error_severity flag to control whether the injected write error is a retryable IO error or a fatal (unrecoverable) error. A flag in the stress test (is_db_stopped_) records the first fatal error: once a fatal error is returned, the flag is set and the DB is treated as stopped (but not corrupted). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8479 Test Plan: run ./db_stress --reopen=0 --read_fault_one_in=1000 --write_fault_one_in=5 --disable_wal=true --write_buffer_size=3000000 -writepercent=5 -readpercent=50 --injest_error_severity=2 --column_families=1, and run make check. Reviewed By: anand1976 Differential Revision: D29524271 Pulled By: zhichao-cao fbshipit-source-id: 1aa9fb9b5655b0adba6f5ad12005ca8c074c795b
This commit is contained in:
parent
41d32152ce
commit
a95a776d75
@ -827,5 +827,9 @@ DEFINE_string(secondary_cache_uri, "",
|
|||||||
DEFINE_int32(open_write_fault_one_in, 0,
|
DEFINE_int32(open_write_fault_one_in, 0,
|
||||||
"On non-zero, enables fault injection on file write "
|
"On non-zero, enables fault injection on file write "
|
||||||
"during DB reopen.");
|
"during DB reopen.");
|
||||||
|
DEFINE_int32(injest_error_severity, 1,
|
||||||
|
"The severity of the injested IO Error. 1 is soft error (e.g. "
|
||||||
|
"retryable error), 2 is fatal error, and the default is "
|
||||||
|
"retryable error.");
|
||||||
|
|
||||||
#endif // GFLAGS
|
#endif // GFLAGS
|
||||||
|
@ -33,6 +33,8 @@ DECLARE_int32(write_fault_one_in);
|
|||||||
DECLARE_int32(open_metadata_write_fault_one_in);
|
DECLARE_int32(open_metadata_write_fault_one_in);
|
||||||
DECLARE_int32(open_write_fault_one_in);
|
DECLARE_int32(open_write_fault_one_in);
|
||||||
|
|
||||||
|
DECLARE_int32(injest_error_severity);
|
||||||
|
|
||||||
namespace ROCKSDB_NAMESPACE {
|
namespace ROCKSDB_NAMESPACE {
|
||||||
class StressTest;
|
class StressTest;
|
||||||
|
|
||||||
|
@ -62,7 +62,8 @@ StressTest::StressTest()
|
|||||||
new_column_family_name_(1),
|
new_column_family_name_(1),
|
||||||
num_times_reopened_(0),
|
num_times_reopened_(0),
|
||||||
db_preload_finished_(false),
|
db_preload_finished_(false),
|
||||||
cmp_db_(nullptr) {
|
cmp_db_(nullptr),
|
||||||
|
is_db_stopped_(false) {
|
||||||
if (FLAGS_destroy_db_initially) {
|
if (FLAGS_destroy_db_initially) {
|
||||||
std::vector<std::string> files;
|
std::vector<std::string> files;
|
||||||
db_stress_env->GetChildren(FLAGS_db, &files);
|
db_stress_env->GetChildren(FLAGS_db, &files);
|
||||||
@ -614,8 +615,15 @@ void StressTest::OperateDb(ThreadState* thread) {
|
|||||||
FLAGS_read_fault_one_in);
|
FLAGS_read_fault_one_in);
|
||||||
}
|
}
|
||||||
if (FLAGS_write_fault_one_in) {
|
if (FLAGS_write_fault_one_in) {
|
||||||
IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
|
IOStatus error_msg;
|
||||||
error_msg.SetRetryable(true);
|
if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
|
||||||
|
error_msg = IOStatus::IOError("Retryable IO Error");
|
||||||
|
error_msg.SetRetryable(true);
|
||||||
|
} else if (FLAGS_injest_error_severity == 2) {
|
||||||
|
// Ingest the fatal error
|
||||||
|
error_msg = IOStatus::IOError("Fatal IO Error");
|
||||||
|
error_msg.SetDataLoss(true);
|
||||||
|
}
|
||||||
std::vector<FileType> types = {FileType::kTableFile,
|
std::vector<FileType> types = {FileType::kTableFile,
|
||||||
FileType::kDescriptorFile,
|
FileType::kDescriptorFile,
|
||||||
FileType::kCurrentFile};
|
FileType::kCurrentFile};
|
||||||
|
@ -237,6 +237,7 @@ class StressTest {
|
|||||||
// Fields used for continuous verification from another thread
|
// Fields used for continuous verification from another thread
|
||||||
DB* cmp_db_;
|
DB* cmp_db_;
|
||||||
std::vector<ColumnFamilyHandle*> cmp_cfhs_;
|
std::vector<ColumnFamilyHandle*> cmp_cfhs_;
|
||||||
|
bool is_db_stopped_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
@ -552,8 +552,18 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||||||
}
|
}
|
||||||
shared->Put(rand_column_family, rand_key, value_base, false /* pending */);
|
shared->Put(rand_column_family, rand_key, value_base, false /* pending */);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
if (FLAGS_injest_error_severity >= 2) {
|
||||||
std::terminate();
|
if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
|
||||||
|
is_db_stopped_ = true;
|
||||||
|
} else if (!is_db_stopped_ ||
|
||||||
|
s.severity() < Status::Severity::kFatalError) {
|
||||||
|
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
thread->stats.AddBytesForWrites(1, sz);
|
thread->stats.AddBytesForWrites(1, sz);
|
||||||
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
|
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
|
||||||
@ -615,8 +625,19 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||||||
shared->Delete(rand_column_family, rand_key, false /* pending */);
|
shared->Delete(rand_column_family, rand_key, false /* pending */);
|
||||||
thread->stats.AddDeletes(1);
|
thread->stats.AddDeletes(1);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
if (FLAGS_injest_error_severity >= 2) {
|
||||||
std::terminate();
|
if (!is_db_stopped_ &&
|
||||||
|
s.severity() >= Status::Severity::kFatalError) {
|
||||||
|
is_db_stopped_ = true;
|
||||||
|
} else if (!is_db_stopped_ ||
|
||||||
|
s.severity() < Status::Severity::kFatalError) {
|
||||||
|
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
shared->SingleDelete(rand_column_family, rand_key, true /* pending */);
|
shared->SingleDelete(rand_column_family, rand_key, true /* pending */);
|
||||||
@ -637,8 +658,19 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||||||
shared->SingleDelete(rand_column_family, rand_key, false /* pending */);
|
shared->SingleDelete(rand_column_family, rand_key, false /* pending */);
|
||||||
thread->stats.AddSingleDeletes(1);
|
thread->stats.AddSingleDeletes(1);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
|
if (FLAGS_injest_error_severity >= 2) {
|
||||||
std::terminate();
|
if (!is_db_stopped_ &&
|
||||||
|
s.severity() >= Status::Severity::kFatalError) {
|
||||||
|
is_db_stopped_ = true;
|
||||||
|
} else if (!is_db_stopped_ ||
|
||||||
|
s.severity() < Status::Severity::kFatalError) {
|
||||||
|
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
@ -684,8 +716,18 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||||||
Slice end_key = end_keystr;
|
Slice end_key = end_keystr;
|
||||||
Status s = db_->DeleteRange(write_opts, cfh, key, end_key);
|
Status s = db_->DeleteRange(write_opts, cfh, key, end_key);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
|
if (FLAGS_injest_error_severity >= 2) {
|
||||||
std::terminate();
|
if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
|
||||||
|
is_db_stopped_ = true;
|
||||||
|
} else if (!is_db_stopped_ ||
|
||||||
|
s.severity() < Status::Severity::kFatalError) {
|
||||||
|
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
|
||||||
|
std::terminate();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
int covered = shared->DeleteRange(rand_column_family, rand_key,
|
int covered = shared->DeleteRange(rand_column_family, rand_key,
|
||||||
rand_key + FLAGS_range_deletion_width,
|
rand_key + FLAGS_range_deletion_width,
|
||||||
|
Loading…
Reference in New Issue
Block a user