[Kill randomly at various points in source code for testing]

Summary:
This is the initial version. A few ways in which it could
be extended in the future are:
(a) Killing from more places in source code
(b) Hashing the call stack and using that hash in determining whether to
    crash. This is to avoid crashing more often at source lines that are
    executed more often.
(c) Raising exceptions or returning errors instead of killing

Test Plan:
This whole thing is for testing.

Here is part of output:

python2.7 tools/db_crashtest2.py -d 600
Running db_stress

db_stress retncode -15 output LevelDB version     : 1.5
Number of threads   : 32
Ops per thread      : 10000000
Read percentage     : 50
Write-buffer-size   : 4194304
Delete percentage   : 30
Max key             : 1000
Ratio #ops/#keys    : 320000
Num times DB reopens: 0
Batches/snapshots   : 1
Purge redundant %   : 50
Num keys per lock   : 4
Compression         : snappy
------------------------------------------------
No lock creation because test_batches_snapshots set
2013/04/26-17:55:17  Starting database operations
Created bg thread 0x7fc1f07ff700
... finished 60000 ops
Running db_stress

db_stress retncode -15 output LevelDB version     : 1.5
Number of threads   : 32
Ops per thread      : 10000000
Read percentage     : 50
Write-buffer-size   : 4194304
Delete percentage   : 30
Max key             : 1000
Ratio #ops/#keys    : 320000
Num times DB reopens: 0
Batches/snapshots   : 1
Purge redundant %   : 50
Num keys per lock   : 4
Compression         : snappy
------------------------------------------------
Created bg thread 0x7ff0137ff700
No lock creation because test_batches_snapshots set
2013/04/26-17:56:15  Starting database operations
... finished 90000 ops

Revert Plan: OK

Task ID: #2252691

Reviewers: dhruba, emayanke

Reviewed By: emayanke

CC: leveldb, haobo

Differential Revision: https://reviews.facebook.net/D10581
This commit is contained in:
Vamsi Ponnekanti 2013-04-04 23:49:43 -07:00
parent 87d0af15d8
commit 760dd4750f
4 changed files with 195 additions and 5 deletions

108
tools/db_crashtest2.py Normal file
View File

@ -0,0 +1,108 @@
#! /usr/bin/env python
import os
import sys
import time
import shlex
import getopt
import logging
import tempfile
import subprocess
# This python script runs db_stress multiple times with kill_random_test
# that causes leveldb to crash at various points in code.
# It also has test-batches-snapshot ON so that basic atomic/consistency
# checks can be performed.
#
def main(argv):
    """Run db_stress repeatedly, alternating kill-injection and clean runs.

    Options (all values are integers):
      -h  print usage and exit
      -d  total number of seconds to keep launching db_stress
      -t  number of db_stress threads
      -k  kill odds, passed to db_stress as --kill_random_test
      -o  ops per thread (multiplied by 100 on kill runs)
      -b  write buffer size in bytes

    Exits with status 2 on a bad command line, when the db_stress build
    fails, or when db_stress output contains "error" or "fail".

    print(...) is always called with a single argument so the script behaves
    the same under the Python 2 print statement and the Python 3 function.
    """
    usage = ("db_crashtest2.py -d <duration_test> -t <#threads> "
             "-k <kills with prob 1/k> -o <ops_per_thread> "
             "-b <write_buffer_size>\n")

    # Parse the command line first so that -h and bad options do not
    # trigger a build.
    try:
        opts, _ = getopt.getopt(argv, "hd:t:k:o:b:")
    except getopt.GetoptError as err:
        print(str(err))
        print(usage)
        sys.exit(2)

    # default values, will be overridden by cmdline args
    kill_random_test = 97  # kill with probability 1/97 by default
    duration = 6000  # total time for this script to test db_stress
    threads = 32
    ops_per_thread = 200000
    write_buf_size = 4 * 1024 * 1024

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        try:
            if opt == '-d':
                duration = int(arg)
            elif opt == '-t':
                threads = int(arg)
            elif opt == '-k':
                kill_random_test = int(arg)
            elif opt == '-o':
                ops_per_thread = int(arg)
            elif opt == '-b':
                write_buf_size = int(arg)
            else:
                print("unrecognized option " + str(opt) + "\n")
                print(usage)
                sys.exit(2)
        except ValueError:
            print("option " + str(opt) + " expects an integer, got " +
                  repr(arg) + "\n")
            print(usage)
            sys.exit(2)

    # Without a successful build every run below would exercise a stale or
    # missing binary, so stop here instead of looping for `duration` seconds.
    if os.system("make -C ~/rocksdb db_stress") != 0:
        print("could not build db_stress\n")
        sys.exit(2)

    exit_time = time.time() + duration
    dirpath = tempfile.mkdtemp()

    # kill in every alternate run. toggle tracks which run we are doing.
    toggle = True

    while time.time() < exit_time:
        print("Running db_stress \n")

        if toggle:
            # since we are going to kill anyway, use more ops per thread
            new_ops_per_thread = 100 * ops_per_thread
            killoption = '--kill_random_test=' + str(kill_random_test)
        else:
            new_ops_per_thread = ops_per_thread
            killoption = ''
        toggle = not toggle

        # NOTE(review): the leading '0' in --ops_per_thread / --threads is
        # carried over unchanged from the original command line; presumably
        # harmless for a decimal parser -- confirm before removing.
        cmd_parts = [
            '~/rocksdb/db_stress',
            '--test_batches_snapshots=1',
            '--ops_per_thread=0' + str(new_ops_per_thread),
            '--threads=0' + str(threads),
            '--write_buffer_size=' + str(write_buf_size),
            '--destroy_db_initially=0',
            killoption,
            '--reopen=0',
            '--readpercent=50',
            '--db=' + dirpath,
            '--max_key=10000',
        ]
        # The shell is needed to expand '~'; empty parts (no kill option)
        # are dropped.
        cmd = ' '.join(part for part in cmd_parts if part)

        try:
            output = subprocess.check_output(cmd, stderr=subprocess.STDOUT,
                                             shell=True)
            returncode = 0
        except subprocess.CalledProcessError as e:
            output = e.output
            returncode = e.returncode

        if returncode == 0 and killoption != '':
            logging.warning("WARNING: db_stress did not kill itself\n")

        msg = "db_stress retncode {0} output {1}".format(returncode, output)
        logging.info(msg)

        # Scan the output even when db_stress exited cleanly: it keeps
        # running after printing consistency errors, so the exit status
        # alone is not enough to detect them.
        msglower = msg.lower()
        test_failed = ('error' in msglower) or ('fail' in msglower)

        if returncode != 0 or test_failed:
            print(msg)
        if test_failed:
            print("TEST FAILED!!!\n")
            sys.exit(2)

        if returncode != 0:
            time.sleep(1)  # time to stabilize after a kill
# Run the crash test only when executed as a script, forwarding the
# command-line arguments (without the program name) to main().
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

View File

@ -122,6 +122,10 @@ static bool FLAGS_disable_data_sync = false;
// If true, issue fsync instead of fdatasync
static bool FLAGS_use_fsync = false;
// If non-zero, kill at various points in source code with probability 1/this
static int FLAGS_kill_random_test = 0;
extern int leveldb_kill_odds;
// If true, do not write WAL for write.
static bool FLAGS_disable_wal = false;
@ -698,7 +702,7 @@ class StressTest {
char expected_prefix = (keys[i])[0];
char actual_prefix = (values[i])[0];
if (actual_prefix != expected_prefix) {
fprintf(stderr, "expected prefix = %c actual = %c\n",
fprintf(stderr, "error expected prefix = %c actual = %c\n",
expected_prefix, actual_prefix);
}
(values[i])[0] = ' '; // blank out the differing character
@ -710,7 +714,7 @@ class StressTest {
// Now that we retrieved all values, check that they all match
for (int i = 1; i < 10; i++) {
if (values[i] != values[0]) {
fprintf(stderr, "inconsistent values for key %s: %s, %s\n",
fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
key.ToString().c_str(), values[0].c_str(),
values[i].c_str());
// we continue after error rather than exiting so that we can
@ -931,6 +935,7 @@ class StressTest {
options.env = FLAGS_env;
options.disableDataSync = FLAGS_disable_data_sync;
options.use_fsync = FLAGS_use_fsync;
leveldb_kill_odds = FLAGS_kill_random_test;
options.target_file_size_base = FLAGS_target_file_size_base;
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
@ -1093,6 +1098,9 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--use_fsync=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_fsync = n;
} else if (sscanf(argv[i], "--kill_random_test=%d%c", &n, &junk) == 1 &&
(n >= 0)) {
FLAGS_kill_random_test = n;
} else if (sscanf(argv[i], "--disable_wal=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_disable_wal = n;

View File

@ -32,6 +32,8 @@
#include "util/coding.h"
#include "util/logging.h"
#include "util/posix_logger.h"
#include "util/random.h"
#include <signal.h>
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
@ -48,8 +50,13 @@ bool useFsReadAhead = 1; // allow filesystem to do readaheads
bool useMmapRead = 0; // do not use mmaps for reading files
bool useMmapWrite = 1; // use mmaps for appending to files
// This is only set from db_stress.cc and for testing only.
// If non-zero, kill at various points in source code with probability 1/this
int leveldb_kill_odds = 0;
namespace leveldb {
namespace {
// list of pathnames that are locked
@ -60,6 +67,39 @@ static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
#ifdef NDEBUG
// Crash injection is compiled out of release builds. The argument is not
// evaluated; the expansion still requires the trailing ';' so call sites read
// the same in both build modes.
#define TEST_KILL_RANDOM(kill_odds) do { } while (0)
#else
// Kill the process with probability 1/odds for testing.
//
// odds     must be > 0 (asserted).
// srcfile  source file of the kill point, used only in the message.
// srcline  source line of the kill point, used only in the message.
//
// When the kill is chosen, "Crashing at <file>:<line>" is written to stdout
// and flushed, then SIGTERM is sent to the current process.
static void TestKillRandom(int odds, const std::string& srcfile,
                           int srcline) {
  assert(odds > 0);

  // NOTE(review): the generator is re-created from the wall-clock second on
  // every call. Assuming Random is deterministic for a given seed, all calls
  // made with the same odds within one second reach the same decision, so the
  // kill rate is effectively per second rather than per call. Confirm this is
  // intended; a persistent generator would apply 1/odds per call and the
  // default odds used by the crash-test script would likely need retuning.
  const time_t curtime = time(nullptr);
  Random r(static_cast<uint32_t>(curtime));

  // presumably OneIn(odds) is true once in `odds` draws -- see util/random.h
  const bool crash = r.OneIn(odds);
  if (crash) {
    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
    fflush(stdout);
    kill(getpid(), SIGTERM);
  }
}

// To avoid crashing always at some frequently executed codepaths (during
// kill random test), call sites multiply the odds by one of these factors.
#define REDUCE_ODDS 2
#define REDUCE_ODDS2 4

// Crash-injection point: does nothing unless kill_odds evaluates to a
// positive value. The argument is parenthesised and evaluated exactly once,
// and the do/while(0) wrapper makes the expansion a single statement that is
// safe inside unbraced if/else.
#define TEST_KILL_RANDOM(kill_odds)                            \
  do {                                                         \
    const int test_kill_odds_ = (kill_odds);                   \
    if (test_kill_odds_ > 0) {                                 \
      TestKillRandom(test_kill_odds_, __FILE__, __LINE__);     \
    }                                                          \
  } while (0)
#endif
class PosixSequentialFile: public SequentialFile {
private:
std::string filename_;
@ -232,6 +272,7 @@ class PosixMmapFile : public WritableFile {
bool UnmapCurrentRegion() {
bool result = true;
TEST_KILL_RANDOM(leveldb_kill_odds);
if (base_ != nullptr) {
if (last_sync_ < limit_) {
// Defer syncing this data until next Sync() call, if any
@ -257,18 +298,22 @@ class PosixMmapFile : public WritableFile {
Status MapNewRegion() {
assert(base_ == nullptr);
TEST_KILL_RANDOM(leveldb_kill_odds);
int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
}
TEST_KILL_RANDOM(leveldb_kill_odds);
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
fd_, file_offset_);
if (ptr == MAP_FAILED) {
return Status::IOError("MMap failed on " + filename_);
}
TEST_KILL_RANDOM(leveldb_kill_odds);
base_ = reinterpret_cast<char*>(ptr);
limit_ = base_ + map_size_;
dst_ = base_;
@ -303,6 +348,7 @@ class PosixMmapFile : public WritableFile {
virtual Status Append(const Slice& data) {
const char* src = data.data();
size_t left = data.size();
TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS);
PrepareWrite(GetFileSize(), left);
while (left > 0) {
assert(base_ <= dst_);
@ -314,6 +360,7 @@ class PosixMmapFile : public WritableFile {
if (!s.ok()) {
return s;
}
TEST_KILL_RANDOM(leveldb_kill_odds);
}
}
@ -323,12 +370,16 @@ class PosixMmapFile : public WritableFile {
src += n;
left -= n;
}
TEST_KILL_RANDOM(leveldb_kill_odds);
return Status::OK();
}
virtual Status Close() {
Status s;
size_t unused = limit_ - dst_;
TEST_KILL_RANDOM(leveldb_kill_odds);
if (!UnmapCurrentRegion()) {
s = IOError(filename_, errno);
} else if (unused > 0) {
@ -338,6 +389,8 @@ class PosixMmapFile : public WritableFile {
}
}
TEST_KILL_RANDOM(leveldb_kill_odds);
if (close(fd_) < 0) {
if (s.ok()) {
s = IOError(filename_, errno);
@ -351,6 +404,7 @@ class PosixMmapFile : public WritableFile {
}
// Flush performs no I/O in this override: apart from the crash-injection
// point below it only reports success.
virtual Status Flush() {
// Kill point for the random-kill test; expands to nothing when NDEBUG is
// defined and is inactive while leveldb_kill_odds is 0.
TEST_KILL_RANDOM(leveldb_kill_odds);
return Status::OK();
}
@ -359,10 +413,12 @@ class PosixMmapFile : public WritableFile {
if (pending_sync_) {
// Some unmapped data was not synced
TEST_KILL_RANDOM(leveldb_kill_odds);
pending_sync_ = false;
if (fdatasync(fd_) < 0) {
s = IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS);
}
if (dst_ > last_sync_) {
@ -371,9 +427,11 @@ class PosixMmapFile : public WritableFile {
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
last_sync_ = dst_;
TEST_KILL_RANDOM(leveldb_kill_odds);
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
s = IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds);
}
return s;
@ -385,10 +443,12 @@ class PosixMmapFile : public WritableFile {
virtual Status Fsync() {
if (pending_sync_) {
// Some unmapped data was not synced
TEST_KILL_RANDOM(leveldb_kill_odds);
pending_sync_ = false;
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds);
}
// This invocation to Sync will not issue the call to
// fdatasync because pending_sync_ has already been cleared.
@ -407,6 +467,7 @@ class PosixMmapFile : public WritableFile {
#ifdef OS_LINUX
virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(leveldb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
return Status::OK();
} else {
@ -455,6 +516,8 @@ class PosixWritableFile : public WritableFile {
pending_sync_ = true;
pending_fsync_ = true;
TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);
PrepareWrite(GetFileSize(), left);
// if there is no space in the cache, then flush
if (cursize_ + left > capacity_) {
@ -481,6 +544,8 @@ class PosixWritableFile : public WritableFile {
if (done < 0) {
return IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds);
left -= done;
src += done;
}
@ -494,6 +559,9 @@ class PosixWritableFile : public WritableFile {
s = Flush(); // flush cache to OS
if (!s.ok()) {
}
TEST_KILL_RANDOM(leveldb_kill_odds);
if (close(fd_) < 0) {
if (s.ok()) {
s = IOError(filename_, errno);
@ -505,6 +573,7 @@ class PosixWritableFile : public WritableFile {
// write out the cached data to the OS cache
virtual Status Flush() {
TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);
size_t left = cursize_;
char* src = buf_.get();
while (left != 0) {
@ -512,6 +581,7 @@ class PosixWritableFile : public WritableFile {
if (done < 0) {
return IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds * REDUCE_ODDS2);
left -= done;
src += done;
}
@ -520,17 +590,21 @@ class PosixWritableFile : public WritableFile {
}
// Calls fdatasync() on fd_ when pending_sync_ is set.
// Returns an IOError built from errno if fdatasync() fails (pending_sync_ is
// then left set); otherwise clears pending_sync_ and returns OK.
virtual Status Sync() {
// Kill point before the sync; expands to nothing when NDEBUG is defined.
TEST_KILL_RANDOM(leveldb_kill_odds);
// fdatasync() is only attempted when pending_sync_ is true (short-circuit).
if (pending_sync_ && fdatasync(fd_) < 0) {
return IOError(filename_, errno);
}
// Kill point after the sync attempt but before the pending flag is cleared.
TEST_KILL_RANDOM(leveldb_kill_odds);
pending_sync_ = false;
return Status::OK();
}
virtual Status Fsync() {
TEST_KILL_RANDOM(leveldb_kill_odds);
if (pending_fsync_ && fsync(fd_) < 0) {
return IOError(filename_, errno);
}
TEST_KILL_RANDOM(leveldb_kill_odds);
pending_fsync_ = false;
pending_sync_ = false;
return Status::OK();
@ -542,6 +616,7 @@ class PosixWritableFile : public WritableFile {
#ifdef OS_LINUX
virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(leveldb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
return Status::OK();
} else {

View File

@ -21,8 +21,7 @@ class StorageOptions : public EnvOptions {
readahead_compactions_(opt.allow_readahead_compactions),
use_mmap_reads_(opt.allow_mmap_reads),
use_mmap_writes_(opt.allow_mmap_writes),
set_fd_cloexec_(opt.is_fd_close_on_exec)
{
set_fd_cloexec_(opt.is_fd_close_on_exec) {
}
// copy constructor with readaheads set to readahead_compactions_