Improve fault injection to MultiRead (#8937)
Summary: Several improvements to fault injection for MultiRead: 1. Fix a bug in the stress test that causes a false positive when both the MultiRead() return status and an individual read request have a failure injected. 2. Add two more types of faults that should be handled: empty read results and checksum mismatches. 3. Add a message indicating which type of fault was injected. 4. Increase the failure rate. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8937 Reviewed By: anand1976 Differential Revision: D31085930 fbshipit-source-id: 3a04994a3cadebf9a64d25e1fe12b14b7a272fba
This commit is contained in:
parent
fcce1f2c7a
commit
9320067703
@ -349,7 +349,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
||||
// Grab mutex so multiple threads don't try to print the
|
||||
// stack trace at the same time
|
||||
MutexLock l(thread->shared->GetMutex());
|
||||
fprintf(stderr, "Didn't get expected error from MultiGet\n");
|
||||
fprintf(stderr, "Didn't get expected error from MultiGet. \n");
|
||||
fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", num_keys,
|
||||
error_count, stat_nok);
|
||||
fprintf(stderr, "Callstack that injected the fault\n");
|
||||
fault_fs_guard->PrintFaultBacktrace();
|
||||
std::terminate();
|
||||
|
@ -140,7 +140,7 @@ default_params = {
|
||||
"continuous_verification_interval" : 0,
|
||||
"max_key_len": 3,
|
||||
"key_len_percent_dist": "1,30,69",
|
||||
"read_fault_one_in": lambda: random.choice([0, 1000]),
|
||||
"read_fault_one_in": lambda: random.choice([0, 32, 1000]),
|
||||
"open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
|
||||
"open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
|
||||
"open_read_fault_one_in": lambda: random.choice([0, 0, 32]),
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "util/coding.h"
|
||||
#include "util/crc32c.h"
|
||||
#include "util/random.h"
|
||||
#include "util/string_util.h"
|
||||
#include "util/xxhash.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
@ -340,7 +341,7 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n,
|
||||
if (s.ok()) {
|
||||
s = fs_->InjectThreadSpecificReadError(
|
||||
FaultInjectionTestFS::ErrorOperation::kRead, result, use_direct_io(),
|
||||
scratch);
|
||||
scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr);
|
||||
}
|
||||
if (s.ok() && fs_->ShouldInjectRandomReadError()) {
|
||||
return IOStatus::IOError("Injected read error");
|
||||
@ -355,19 +356,25 @@ IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
|
||||
return fs_->GetError();
|
||||
}
|
||||
IOStatus s = target_->MultiRead(reqs, num_reqs, options, dbg);
|
||||
bool injected_error = false;
|
||||
for (size_t i = 0; i < num_reqs; i++) {
|
||||
if (!reqs[i].status.ok()) {
|
||||
// Already seeing an error.
|
||||
break;
|
||||
}
|
||||
bool this_injected_error;
|
||||
reqs[i].status = fs_->InjectThreadSpecificReadError(
|
||||
FaultInjectionTestFS::ErrorOperation::kRead, &reqs[i].result,
|
||||
use_direct_io(), reqs[i].scratch);
|
||||
FaultInjectionTestFS::ErrorOperation::kMultiReadSingleReq,
|
||||
&(reqs[i].result), use_direct_io(), reqs[i].scratch,
|
||||
/*need_count_increase=*/true,
|
||||
/*fault_injected=*/&this_injected_error);
|
||||
injected_error |= this_injected_error;
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = fs_->InjectThreadSpecificReadError(
|
||||
FaultInjectionTestFS::ErrorOperation::kRead, nullptr, use_direct_io(),
|
||||
nullptr);
|
||||
FaultInjectionTestFS::ErrorOperation::kMultiRead, nullptr,
|
||||
use_direct_io(), nullptr, /*need_count_increase=*/!injected_error,
|
||||
/*fault_injected=*/nullptr);
|
||||
}
|
||||
if (s.ok() && fs_->ShouldInjectRandomReadError()) {
|
||||
return IOStatus::IOError("Injected read error");
|
||||
@ -550,7 +557,9 @@ IOStatus FaultInjectionTestFS::NewRandomAccessFile(
|
||||
return IOStatus::IOError("Injected error when open random access file");
|
||||
}
|
||||
IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr,
|
||||
false, nullptr);
|
||||
false, nullptr,
|
||||
/*need_count_increase=*/true,
|
||||
/*fault_injected=*/nullptr);
|
||||
if (io_s.ok()) {
|
||||
io_s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
|
||||
}
|
||||
@ -759,8 +768,11 @@ void FaultInjectionTestFS::UntrackFile(const std::string& f) {
|
||||
}
|
||||
|
||||
IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
|
||||
ErrorOperation /*op*/, Slice* /*result*/, bool /*direct_io*/,
|
||||
char* /*scratch*/) {
|
||||
ErrorOperation op, Slice* result, bool direct_io, char* /*scratch*/,
|
||||
bool need_count_increase, bool* fault_injected) {
|
||||
bool dummy_bool;
|
||||
bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool;
|
||||
ret_fault_injected = false;
|
||||
ErrorContext* ctx =
|
||||
static_cast<ErrorContext*>(thread_local_error_->Get());
|
||||
if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) {
|
||||
@ -768,12 +780,47 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError(
|
||||
}
|
||||
|
||||
if (ctx->rand.OneIn(ctx->one_in)) {
|
||||
ctx->count++;
|
||||
if (ctx->count == 0) {
|
||||
ctx->message = "";
|
||||
}
|
||||
if (need_count_increase) {
|
||||
ctx->count++;
|
||||
}
|
||||
if (ctx->callstack) {
|
||||
free(ctx->callstack);
|
||||
}
|
||||
ctx->callstack = port::SaveStack(&ctx->frames);
|
||||
return IOStatus::IOError();
|
||||
|
||||
if (op != ErrorOperation::kMultiReadSingleReq) {
|
||||
// Likely non-per read status code for MultiRead
|
||||
ctx->message += "error; ";
|
||||
ret_fault_injected = true;
|
||||
return IOStatus::IOError();
|
||||
} else if (Random::GetTLSInstance()->OneIn(8)) {
|
||||
assert(result);
|
||||
// With a small chance, leave the status OK but make the
|
||||
// result empty, which the caller's checks are supposed to catch.
|
||||
*result = Slice();
|
||||
ctx->message += "inject empty result; ";
|
||||
ret_fault_injected = true;
|
||||
} else if (!direct_io && Random::GetTLSInstance()->OneIn(7)) {
|
||||
assert(result);
|
||||
// With direct I/O, many extra bytes might be read so corrupting
|
||||
// one byte might not cause checksum mismatch. Skip checksum
|
||||
// corruption injection.
|
||||
// With a small chance, leave the status OK but corrupt the
|
||||
// result in a way that checksum checking is supposed to fail.
|
||||
// Corrupt the last byte, which is supposed to be a checksum byte
|
||||
// It would work for CRC. Not 100% sure for xxhash and will adjust
|
||||
// if it is not the case.
|
||||
const_cast<char*>(result->data())[result->size() - 1]++;
|
||||
ctx->message += "corrupt last byte; ";
|
||||
ret_fault_injected = true;
|
||||
} else {
|
||||
ctx->message += "error result multiget single; ";
|
||||
ret_fault_injected = true;
|
||||
return IOStatus::IOError();
|
||||
}
|
||||
}
|
||||
return IOStatus::OK();
|
||||
}
|
||||
@ -835,6 +882,7 @@ void FaultInjectionTestFS::PrintFaultBacktrace() {
|
||||
return;
|
||||
}
|
||||
fprintf(stderr, "Injected error type = %d\n", ctx->type);
|
||||
fprintf(stderr, "Message: %s\n", ctx->message.c_str());
|
||||
port::PrintAndFreeStack(ctx->callstack, ctx->frames);
|
||||
ctx->callstack = nullptr;
|
||||
#endif
|
||||
|
@ -370,6 +370,8 @@ class FaultInjectionTestFS : public FileSystemWrapper {
|
||||
// Specify what the operation is, so we can inject the right type of error
|
||||
enum ErrorOperation : char {
|
||||
kRead = 0,
|
||||
kMultiReadSingleReq = 1,
|
||||
kMultiRead = 2,
|
||||
kOpen,
|
||||
};
|
||||
|
||||
@ -440,8 +442,12 @@ class FaultInjectionTestFS : public FileSystemWrapper {
|
||||
// corruption in the contents of scratch, or truncation of slice
|
||||
// are the types of error with equal probability. For OPEN,
|
||||
// it's always an IOError.
|
||||
// fault_injected returns whether a fault is injected. It is needed
|
||||
// because some faults are injected while the returned IOStatus is OK.
|
||||
IOStatus InjectThreadSpecificReadError(ErrorOperation op, Slice* slice,
|
||||
bool direct_io, char* scratch);
|
||||
bool direct_io, char* scratch,
|
||||
bool need_count_increase,
|
||||
bool* fault_injected);
|
||||
|
||||
// Get the count of how many times we injected since the previous call
|
||||
int GetAndResetErrorCount() {
|
||||
@ -525,6 +531,7 @@ class FaultInjectionTestFS : public FileSystemWrapper {
|
||||
int count;
|
||||
bool enable_error_injection;
|
||||
void* callstack;
|
||||
std::string message;
|
||||
int frames;
|
||||
ErrorType type;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user