Inject fatal write failures to db_stress when DB is running (#8479)

Summary:
Add the injest_error_severity flag to control whether the injected write error is a retryable IO error or a fatal (unrecoverable) error. When a fatal error occurs, a flag is set to indicate that the DB has been stopped (but not corrupted).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8479

Test Plan: run ./db_stress --reopen=0 --read_fault_one_in=1000 --write_fault_one_in=5 --disable_wal=true --write_buffer_size=3000000 -writepercent=5 -readpercent=50 --injest_error_severity=2 --column_families=1, then run make check

Reviewed By: anand1976

Differential Revision: D29524271

Pulled By: zhichao-cao

fbshipit-source-id: 1aa9fb9b5655b0adba6f5ad12005ca8c074c795b
This commit is contained in:
Zhichao Cao 2021-07-01 14:15:49 -07:00 committed by Facebook GitHub Bot
parent 41d32152ce
commit a95a776d75
5 changed files with 68 additions and 11 deletions

View File

@ -827,5 +827,9 @@ DEFINE_string(secondary_cache_uri, "",
DEFINE_int32(open_write_fault_one_in, 0, DEFINE_int32(open_write_fault_one_in, 0,
"On non-zero, enables fault injection on file write " "On non-zero, enables fault injection on file write "
"during DB reopen."); "during DB reopen.");
// Selects the kind of IO error injected on the write path when
// --write_fault_one_in is non-zero: 2 injects a fatal IO error (marked as data
// loss); every other value injects a retryable IO error.
DEFINE_int32(injest_error_severity, 1,
             "The severity of the injected IO Error. 1 is soft error (e.g. "
             "retryable error), 2 is fatal error. The default is 1, and any "
             "other value is also treated as a retryable error.");
#endif // GFLAGS #endif // GFLAGS

View File

@ -33,6 +33,8 @@ DECLARE_int32(write_fault_one_in);
DECLARE_int32(open_metadata_write_fault_one_in); DECLARE_int32(open_metadata_write_fault_one_in);
DECLARE_int32(open_write_fault_one_in); DECLARE_int32(open_write_fault_one_in);
DECLARE_int32(injest_error_severity);
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
class StressTest; class StressTest;

View File

@ -62,7 +62,8 @@ StressTest::StressTest()
new_column_family_name_(1), new_column_family_name_(1),
num_times_reopened_(0), num_times_reopened_(0),
db_preload_finished_(false), db_preload_finished_(false),
cmp_db_(nullptr) { cmp_db_(nullptr),
is_db_stopped_(false) {
if (FLAGS_destroy_db_initially) { if (FLAGS_destroy_db_initially) {
std::vector<std::string> files; std::vector<std::string> files;
db_stress_env->GetChildren(FLAGS_db, &files); db_stress_env->GetChildren(FLAGS_db, &files);
@ -614,8 +615,15 @@ void StressTest::OperateDb(ThreadState* thread) {
FLAGS_read_fault_one_in); FLAGS_read_fault_one_in);
} }
if (FLAGS_write_fault_one_in) { if (FLAGS_write_fault_one_in) {
IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); IOStatus error_msg;
error_msg.SetRetryable(true); if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) {
error_msg = IOStatus::IOError("Retryable IO Error");
error_msg.SetRetryable(true);
} else if (FLAGS_injest_error_severity == 2) {
// Ingest the fatal error
error_msg = IOStatus::IOError("Fatal IO Error");
error_msg.SetDataLoss(true);
}
std::vector<FileType> types = {FileType::kTableFile, std::vector<FileType> types = {FileType::kTableFile,
FileType::kDescriptorFile, FileType::kDescriptorFile,
FileType::kCurrentFile}; FileType::kCurrentFile};

View File

@ -237,6 +237,7 @@ class StressTest {
// Fields used for continuous verification from another thread // Fields used for continuous verification from another thread
DB* cmp_db_; DB* cmp_db_;
std::vector<ColumnFamilyHandle*> cmp_cfhs_; std::vector<ColumnFamilyHandle*> cmp_cfhs_;
bool is_db_stopped_;
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

View File

@ -552,8 +552,18 @@ class NonBatchedOpsStressTest : public StressTest {
} }
shared->Put(rand_column_family, rand_key, value_base, false /* pending */); shared->Put(rand_column_family, rand_key, value_base, false /* pending */);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); if (FLAGS_injest_error_severity >= 2) {
std::terminate(); if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
is_db_stopped_ = true;
} else if (!is_db_stopped_ ||
s.severity() < Status::Severity::kFatalError) {
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
std::terminate();
}
} else {
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
std::terminate();
}
} }
thread->stats.AddBytesForWrites(1, sz); thread->stats.AddBytesForWrites(1, sz);
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value, PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
@ -615,8 +625,19 @@ class NonBatchedOpsStressTest : public StressTest {
shared->Delete(rand_column_family, rand_key, false /* pending */); shared->Delete(rand_column_family, rand_key, false /* pending */);
thread->stats.AddDeletes(1); thread->stats.AddDeletes(1);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); if (FLAGS_injest_error_severity >= 2) {
std::terminate(); if (!is_db_stopped_ &&
s.severity() >= Status::Severity::kFatalError) {
is_db_stopped_ = true;
} else if (!is_db_stopped_ ||
s.severity() < Status::Severity::kFatalError) {
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
std::terminate();
}
} else {
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
std::terminate();
}
} }
} else { } else {
shared->SingleDelete(rand_column_family, rand_key, true /* pending */); shared->SingleDelete(rand_column_family, rand_key, true /* pending */);
@ -637,8 +658,19 @@ class NonBatchedOpsStressTest : public StressTest {
shared->SingleDelete(rand_column_family, rand_key, false /* pending */); shared->SingleDelete(rand_column_family, rand_key, false /* pending */);
thread->stats.AddSingleDeletes(1); thread->stats.AddSingleDeletes(1);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); if (FLAGS_injest_error_severity >= 2) {
std::terminate(); if (!is_db_stopped_ &&
s.severity() >= Status::Severity::kFatalError) {
is_db_stopped_ = true;
} else if (!is_db_stopped_ ||
s.severity() < Status::Severity::kFatalError) {
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
std::terminate();
}
} else {
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
std::terminate();
}
} }
} }
return s; return s;
@ -684,8 +716,18 @@ class NonBatchedOpsStressTest : public StressTest {
Slice end_key = end_keystr; Slice end_key = end_keystr;
Status s = db_->DeleteRange(write_opts, cfh, key, end_key); Status s = db_->DeleteRange(write_opts, cfh, key, end_key);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); if (FLAGS_injest_error_severity >= 2) {
std::terminate(); if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
is_db_stopped_ = true;
} else if (!is_db_stopped_ ||
s.severity() < Status::Severity::kFatalError) {
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
std::terminate();
}
} else {
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
std::terminate();
}
} }
int covered = shared->DeleteRange(rand_column_family, rand_key, int covered = shared->DeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width, rand_key + FLAGS_range_deletion_width,