From 04b3524ad00cb162020f5e56ae04d77ed1812bc9 Mon Sep 17 00:00:00 2001 From: Zhichao Cao Date: Thu, 17 Dec 2020 11:51:04 -0800 Subject: [PATCH] Inject the random write error to stress test (#7653) Summary: Inject the random write error to stress test, it requires set reopen=0 and disable_wal=true. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7653 Test Plan: pass db_stress and python3 db_crashtest.py blackbox Reviewed By: ajkr Differential Revision: D25354132 Pulled By: zhichao-cao fbshipit-source-id: 44721104eecb416e27f65f854912c40e301dd669 --- db_stress_tool/db_stress_common.cc | 4 +-- db_stress_tool/db_stress_common.h | 2 +- db_stress_tool/db_stress_gflags.cc | 3 ++ db_stress_tool/db_stress_shared_state.h | 1 + db_stress_tool/db_stress_test_base.cc | 17 +++++++++-- db_stress_tool/db_stress_tool.cc | 26 ++++++++++++++-- tools/db_crashtest.py | 1 + utilities/fault_injection_fs.cc | 31 ++++++++++++++++++- utilities/fault_injection_fs.h | 40 +++++++++++++++++++++++-- 9 files changed, 115 insertions(+), 10 deletions(-) diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index 351c28e0c..62030f185 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -16,10 +16,10 @@ #include "util/file_checksum_helper.h" #include "util/xxhash.h" -ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env = nullptr; +ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; #ifndef NDEBUG // If non-null, injects read error at a rate specified by the -// read_fault_one_in flag +// read_fault_one_in or write_fault_one_in flag std::shared_ptr fault_fs_guard; #endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 40a1e653c..4c48d73c9 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -252,7 +252,7 @@ const int kRandomValueMaxFactor = 3; const int kValueMaxLen = 100; 
// wrapped posix or hdfs environment -extern ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env; +extern ROCKSDB_NAMESPACE::Env* db_stress_env; #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { class FaultInjectionTestFS; diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 155c9cc74..5046987f6 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -757,4 +757,7 @@ DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); +DEFINE_int32(write_fault_one_in, 0, + "On non-zero, enables fault injection on write"); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index 86310f82f..c21a6153e 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -29,6 +29,7 @@ DECLARE_bool(test_batches_snapshots); DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(continuous_verification_interval); DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); namespace ROCKSDB_NAMESPACE { class StressTest; diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 94082bd71..1bbab388f 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -15,6 +15,7 @@ #include "db_stress_tool/db_stress_table_properties_collector.h" #include "rocksdb/convenience.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/types.h" #include "util/cast_util.h" #include "utilities/fault_injection_fs.h" @@ -525,6 +526,16 @@ void StressTest::OperateDb(ThreadState* thread) { fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), FLAGS_read_fault_one_in); } + if (FLAGS_write_fault_one_in) { + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + std::vector types; + 
types.push_back(FileType::kTableFile); + types.push_back(FileType::kDescriptorFile); + types.push_back(FileType::kCurrentFile); + fault_fs_guard->SetRandomWriteError( + thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, types); + } #endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { @@ -618,7 +629,8 @@ void StressTest::OperateDb(ThreadState* thread) { #ifndef ROCKSDB_LITE // Verify GetLiveFiles with a 1 in N chance. - if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in)) { + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && + !FLAGS_write_fault_one_in) { Status status = VerifyGetLiveFiles(); if (!status.ok()) { VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); @@ -1460,7 +1472,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread, FLAGS_db + "/.checkpoint" + ToString(thread->tid); Options tmp_opts(options_); tmp_opts.listeners.clear(); - tmp_opts.env = db_stress_env->target(); + tmp_opts.env = db_stress_env; DestroyDB(checkpoint_dir, tmp_opts); @@ -1952,6 +1964,7 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Use dynamic level : %d\n", static_cast(FLAGS_level_compaction_dynamic_level_bytes)); fprintf(stdout, "Read fault one in : %d\n", FLAGS_read_fault_one_in); + fprintf(stdout, "Write fault one in : %d\n", FLAGS_write_fault_one_in); fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); fprintf(stdout, "Best efforts recovery : %d\n", static_cast(FLAGS_best_efforts_recovery)); diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 2126a1436..d8488fa90 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -97,20 +97,42 @@ int db_stress_tool(int argc, char** argv) { } #ifndef NDEBUG - if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection) { + if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || + FLAGS_write_fault_one_in) { FaultInjectionTestFS* fs = new 
FaultInjectionTestFS(raw_env->GetFileSystem()); fault_fs_guard.reset(fs); - fault_fs_guard->SetFilesystemDirectWritable(true); + if (FLAGS_write_fault_one_in) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } else { + fault_fs_guard->SetFilesystemDirectWritable(true); + } fault_env_guard = std::make_shared(raw_env, fault_fs_guard); raw_env = fault_env_guard.get(); } + if (FLAGS_write_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); + SyncPoint::GetInstance()->EnableProcessing(); + } #endif env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); +#ifndef NDEBUG + if (FLAGS_write_fault_one_in) { + // In the write injection case, we need to use the FS interface and return + // the IOStatus with different errors and flags. Therefore, + // DbStressEnvWrapper, which would swallow the FS implementations, cannot + // be used. We should directly use the raw_env, which is the + // CompositeEnvWrapper of env and fault_fs. 
+ db_stress_env = raw_env; + } +#endif + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 722593caf..caff49790 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -299,6 +299,7 @@ def finalize_and_sanitize(src_params): if dest_params.get("disable_wal", 0) == 1: dest_params["atomic_flush"] = 1 dest_params["sync"] = 0 + dest_params["write_fault_one_in"] = 0 if dest_params.get("open_files", 1) != -1: # Compaction TTL and periodic compactions are only compatible # with open_files = -1 diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 5d4f94e0c..a678291ee 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -99,7 +99,8 @@ IOStatus TestFSWritableFile::Append(const Slice& data, const IOOptions&, state_.buffer_.append(data.data(), data.size()); state_.pos_ += data.size(); fs_->WritableFileAppended(state_); - return IOStatus::OK(); + IOStatus io_s = fs_->InjectWriteError(state_.filename_); + return io_s; } IOStatus TestFSWritableFile::Close(const IOOptions& options, @@ -536,6 +537,34 @@ IOStatus FaultInjectionTestFS::InjectError(ErrorOperation op, return IOStatus::OK(); } +IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) { + MutexLock l(&mutex_); + if (!enable_write_error_injection_ || !write_error_one_in_) { + return IOStatus::OK(); + } + bool allowed_type = false; + + uint64_t number; + FileType cur_type = kTempFile; + std::size_t found = file_name.find_last_of("/"); + std::string file = file_name.substr(found); + bool ret = ParseFileName(file, &number, &cur_type); + if (ret) { + for (const auto& type : write_error_allowed_types_) { + if (cur_type == type) { + allowed_type = true; + } + } + } + + if (allowed_type) { + if (write_error_rand_.OneIn(write_error_one_in_)) { + return GetError(); + } + } + return 
IOStatus::OK(); +} + void FaultInjectionTestFS::PrintFaultBacktrace() { #if defined(OS_LINUX) ErrorContext* ctx = diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index 737ce2379..fb18edbd6 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -172,8 +172,9 @@ class FaultInjectionTestFS : public FileSystemWrapper { : FileSystemWrapper(base), filesystem_active_(true), filesystem_writable_(false), - thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)) { - } + thread_local_error_(new ThreadLocalPtr(DeleteThreadLocalErrorContext)), + enable_write_error_injection_(false), + write_error_rand_(0) {} virtual ~FaultInjectionTestFS() { error_.PermitUncheckedError(); } const char* Name() const override { return "FaultInjectionTestFS"; } @@ -316,6 +317,27 @@ class FaultInjectionTestFS : public FileSystemWrapper { delete ctx; } + // This sets the parameters for the write error injection. + // seed is the seed for the random number generator, and one_in determines + // the probability of injecting an error (i.e., an error is injected with + // 1/one_in probability). For write errors, we can specify the error we + // want to inject. types decides the file types we want to inject the + // error into (e.g., WAL files, SST files); it is empty by default. + void SetRandomWriteError(uint32_t seed, int one_in, IOStatus error, + const std::vector& types) { + MutexLock l(&mutex_); + Random tmp_rand(seed); + error.PermitUncheckedError(); + error_ = error; + write_error_rand_ = tmp_rand; + write_error_one_in_ = one_in; + write_error_allowed_types_ = types; + } + + // Inject a write error with randomized parameters and the predefined + // error type. The write error is injected only for the allowed file types. + IOStatus InjectWriteError(const std::string& file_name); + // Inject an error. 
For a READ operation, a status of IOError(), a // corruption in the contents of scratch, or truncation of slice // are the types of error with equal probability. For OPEN, @@ -343,6 +365,16 @@ class FaultInjectionTestFS : public FileSystemWrapper { } } + void EnableWriteErrorInjection() { + MutexLock l(&mutex_); + enable_write_error_injection_ = true; + } + + void DisableWriteErrorInjection() { + MutexLock l(&mutex_); + enable_write_error_injection_ = false; + } + void DisableErrorInjection() { ErrorContext* ctx = static_cast(thread_local_error_->Get()); @@ -396,6 +428,10 @@ class FaultInjectionTestFS : public FileSystemWrapper { }; std::unique_ptr thread_local_error_; + bool enable_write_error_injection_; + Random write_error_rand_; + int write_error_one_in_; + std::vector write_error_allowed_types_; }; } // namespace ROCKSDB_NAMESPACE