#7916298: merge tools/db_crashtest2.py into tools/db_crashtest.py

Summary:
merge tools/db_crashtest2.py into tools/db_crashtest.py

python tools/db_crashtest.py -h  # show help message, ALL parameters can be overwrite by arguments

Example usages:
python tools/db_crashtest.py blackbox  # run blackbox with default parameters
python tools/db_crashtest.py blackbox --simple
python tools/db_crashtest.py whitebox  # run whitebox with default parameters
python tools/db_crashtest.py whitebox --simple

all default parameters are identical to previous version.

Test Plan: `make crash_test` and make sure it can run with expected parameters pased to db_stress.

Reviewers: igor, rven, anthony, IslamAbdelRahman, yhchiang, sdong

Reviewed By: sdong

Subscribers: dhruba

Differential Revision: https://reviews.facebook.net/D48567
This commit is contained in:
Shusen Liu 2015-10-19 11:43:14 -07:00
parent ec1f8354a9
commit 4575de5b9e
3 changed files with 305 additions and 396 deletions

View File

@ -568,12 +568,12 @@ ldb_tests: ldb
crash_test: whitebox_crash_test blackbox_crash_test
blackbox_crash_test: db_stress
python -u tools/db_crashtest.py -s
python -u tools/db_crashtest.py
python -u tools/db_crashtest.py --simple blackbox
python -u tools/db_crashtest.py blackbox
whitebox_crash_test: db_stress
python -u tools/db_crashtest2.py -s
python -u tools/db_crashtest2.py
python -u tools/db_crashtest.py --simple whitebox
python -u tools/db_crashtest.py whitebox
asan_check:
$(MAKE) clean

View File

@ -4,161 +4,183 @@ import re
import sys
import time
import random
import getopt
import logging
import tempfile
import subprocess
import shutil
import argparse
# params overwrite priority:
# for default:
# default_params < blackbox|whitebox_default_params < args
# for simple:
# simple_default_params < blackbox|whitebox_simple_default_params < args
default_params = {
"block_size": 16384,
"cache_size": 1048576,
"delpercent": 5,
"destroy_db_initially": 0,
"disable_data_sync": 0,
"disable_wal": 0,
"filter_deletes": lambda: random.randint(0, 1),
"iterpercent": 10,
"max_background_compactions": 20,
"max_bytes_for_level_base": 10485760,
"max_key": 100000000,
"max_write_buffer_number": 3,
"memtablerep": "prefix_hash",
"mmap_read": lambda: random.randint(0, 1),
"open_files": 500000,
"prefix_size": 7,
"prefixpercent": 5,
"progress_reports": 0,
"readpercent": 45,
"reopen": 20,
"sync": 0,
"target_file_size_base": 2097152,
"target_file_size_multiplier": 2,
"threads": 32,
"verify_checksum": 1,
"write_buffer_size": 4 * 1024 * 1024,
"writepercent": 35,
}
def get_dbname(test_name):
test_tmpdir = os.environ.get("TEST_TMPDIR")
if test_tmpdir is None or test_tmpdir == "":
dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_' + test_name)
else:
dbname = test_tmpdir + "/rocksdb_crashtest_" + test_name
shutil.rmtree(dbname, True)
return dbname
blackbox_default_params = {
'db': lambda: get_dbname('blackbox'),
# total time for this script to test db_stress
"duration": 6000,
# time for one db_stress instance to run
"interval": 120,
# since we will be killing anyway, use large value for ops_per_thread
"ops_per_thread": 100000000,
"set_options_one_in": 10000,
"test_batches_snapshots": 1,
}
whitebox_default_params = {
'db': lambda: get_dbname('whitebox'),
"duration": 10000,
"log2_keys_per_lock": 10,
"nooverwritepercent": 1,
"ops_per_thread": 200000,
"test_batches_snapshots": lambda: random.randint(0, 1),
"write_buffer_size": 4 * 1024 * 1024,
}
simple_default_params = {
"block_size": 16384,
"cache_size": 1048576,
"column_families": 1,
"delpercent": 5,
"destroy_db_initially": 0,
"disable_data_sync": 0,
"disable_wal": 0,
"filter_deletes": lambda: random.randint(0, 1),
"iterpercent": 10,
"max_background_compactions": 1,
"max_bytes_for_level_base": 67108864,
"max_key": 100000000,
"max_write_buffer_number": 3,
"memtablerep": "skip_list",
"mmap_read": lambda: random.randint(0, 1),
"prefix_size": 0,
"prefixpercent": 0,
"progress_reports": 0,
"readpercent": 50,
"reopen": 20,
"sync": 0,
"target_file_size_base": 16777216,
"target_file_size_multiplier": 1,
"test_batches_snapshots": 0,
"threads": 32,
"verify_checksum": 1,
"write_buffer_size": 32 * 1024 * 1024,
"writepercent": 35,
}
blackbox_simple_default_params = {
'db': lambda: get_dbname('blackbox'),
"duration": 6000,
"interval": 120,
"open_files": -1,
"ops_per_thread": 100000000,
"set_options_one_in": 0,
"test_batches_snapshots": 0,
}
whitebox_simple_default_params = {
'db': lambda: get_dbname('whitebox'),
"duration": 10000,
"log2_keys_per_lock": 10,
"nooverwritepercent": 1,
"open_files": 500000,
"ops_per_thread": 200000,
"write_buffer_size": 32 * 1024 * 1024,
}
def gen_cmd_params(args):
params = {}
if args.simple:
params.update(simple_default_params)
if args.test_type == 'blackbox':
params.update(blackbox_simple_default_params)
if args.test_type == 'whitebox':
params.update(whitebox_simple_default_params)
if not args.simple:
params.update(default_params)
if args.test_type == 'blackbox':
params.update(blackbox_default_params)
if args.test_type == 'whitebox':
params.update(whitebox_default_params)
for k, v in vars(args).items():
if v is not None:
params[k] = v
return params
def gen_cmd(params):
cmd = './db_stress ' + ' '.join(
'--{0}={1}'.format(k, v() if callable(v) else v)
for k, v in params.items()
if k not in set(['test_type', 'simple', 'duration', 'interval'])
and v is not None)
return cmd
# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args):
cmd_params = gen_cmd_params(args)
def main(argv):
try:
opts, args = getopt.getopt(argv, "hsd:t:i:o:b:")
except getopt.GetoptError:
print("db_crashtest.py -d <duration_test> -t <#threads> "
"-i <interval for one run> -o <ops_per_thread> "
"-b <write_buffer_size> [-s (simple mode)]\n")
sys.exit(2)
exit_time = time.time() + cmd_params['duration']
# default values, will be overridden by cmdline args
interval = 120 # time for one db_stress instance to run
duration = 6000 # total time for this script to test db_stress
threads = 32
# since we will be killing anyway, use large value for ops_per_thread
ops_per_thread = 100000000
write_buf_size = 4 * 1024 * 1024
simple_mode = False
write_buf_size_set = False
for opt, arg in opts:
if opt == '-h':
print("db_crashtest.py -d <duration_test>"
" -t <#threads> -i <interval for one run>"
" -o <ops_per_thread> -b <write_buffer_size>"
" [-s (simple mode)]\n")
sys.exit()
elif opt == '-s':
simple_mode = True
if not write_buf_size_set:
write_buf_size = 32 * 1024 * 1024
elif opt == "-d":
duration = int(arg)
elif opt == "-t":
threads = int(arg)
elif opt == "-i":
interval = int(arg)
elif opt == "-o":
ops_per_thread = int(arg)
elif opt == "-b":
write_buf_size = int(arg)
write_buf_size_set = True
else:
print("db_crashtest.py -d <duration_test>"
" -t <#threads> -i <interval for one run>"
" -o <ops_per_thread> -b <write_buffer_size>\n")
sys.exit(2)
exit_time = time.time() + duration
print("Running blackbox-crash-test with \ninterval_between_crash="
+ str(interval) + "\ntotal-duration=" + str(duration)
+ "\nthreads=" + str(threads) + "\nops_per_thread="
+ str(ops_per_thread) + "\nwrite_buffer_size="
+ str(write_buf_size) + "\n")
test_tmpdir = os.environ.get("TEST_TMPDIR")
if test_tmpdir is None or test_tmpdir == "":
dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
else:
dbname = test_tmpdir + "/rocksdb_crashtest"
shutil.rmtree(dbname, True)
print("Running blackbox-crash-test with \n"
+ "interval_between_crash=" + str(cmd_params['interval']) + "\n"
+ "total-duration=" + str(cmd_params['duration']) + "\n"
+ "threads=" + str(cmd_params['threads']) + "\n"
+ "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
+ "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")
while time.time() < exit_time:
run_had_errors = False
killtime = time.time() + interval
killtime = time.time() + cmd_params['interval']
if simple_mode:
cmd = re.sub('\s+', ' ', """
./db_stress
--column_families=1
--test_batches_snapshots=0
--ops_per_thread=%s
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
--reopen=20
--readpercent=50
--prefixpercent=0
--writepercent=35
--delpercent=5
--iterpercent=10
--db=%s
--max_key=100000000
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=-1
--verify_checksum=1
--sync=0
--progress_reports=0
--disable_wal=0
--disable_data_sync=0
--target_file_size_base=16777216
--target_file_size_multiplier=1
--max_write_buffer_number=3
--max_background_compactions=1
--max_bytes_for_level_base=67108864
--filter_deletes=%s
--memtablerep=skip_list
--prefix_size=0
--set_options_one_in=0
""" % (ops_per_thread,
threads,
write_buf_size,
dbname,
random.randint(0, 1),
random.randint(0, 1)))
else:
cmd = re.sub('\s+', ' ', """
./db_stress
--test_batches_snapshots=1
--ops_per_thread=%s
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
--reopen=20
--readpercent=45
--prefixpercent=5
--writepercent=35
--delpercent=5
--iterpercent=10
--db=%s
--max_key=100000000
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=500000
--verify_checksum=1
--sync=0
--progress_reports=0
--disable_wal=0
--disable_data_sync=0
--target_file_size_base=2097152
--target_file_size_multiplier=2
--max_write_buffer_number=3
--max_background_compactions=20
--max_bytes_for_level_base=10485760
--filter_deletes=%s
--memtablerep=prefix_hash
--prefix_size=7
--set_options_one_in=10000
""" % (ops_per_thread,
threads,
write_buf_size,
dbname,
random.randint(0, 1),
random.randint(0, 1)))
cmd = gen_cmd(cmd_params)
child = subprocess.Popen([cmd],
stderr=subprocess.PIPE, shell=True)
@ -199,5 +221,140 @@ def main(argv):
# we need to clean up after ourselves -- only do this on test success
shutil.rmtree(dbname, True)
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args):
cmd_params = gen_cmd_params(args)
cur_time = time.time()
exit_time = cur_time + cmd_params['duration']
half_time = cur_time + cmd_params['duration'] / 2
print("Running whitebox-crash-test with \n"
+ "total-duration=" + str(cmd_params['duration']) + "\n"
+ "threads=" + str(cmd_params['threads']) + "\n"
+ "ops_per_thread=" + str(cmd_params['ops_per_thread']) + "\n"
+ "write_buffer_size=" + str(cmd_params['write_buffer_size']) + "\n")
total_check_mode = 4
check_mode = 0
kill_random_test = 97
kill_mode = 0
while time.time() < exit_time:
if check_mode == 0:
additional_opts = {
# use large ops per thread since we will kill it anyway
"ops_per_thread": 100 * cmd_params['ops_per_thread'],
}
# run with kill_random_test
if kill_mode == 0:
additional_opts.update({
"kill_random_test": kill_random_test,
})
elif kill_mode == 1:
additional_opts.update({
"kill_random_test": (kill_random_test / 3 + 1),
"kill_prefix_blacklist": "WritableFileWriter::Append,"
+ "WritableFileWriter::WriteBuffered",
})
# Run kill mode 0 and 1 by turn.
kill_mode = (kill_mode + 1) % 2
elif check_mode == 1:
# normal run with universal compaction mode
additional_opts = {
"kill_random_test": None,
"ops_per_thread": cmd_params['ops_per_thread'],
"compaction_style": 1,
}
elif check_mode == 2:
# normal run with FIFO compaction mode
# ops_per_thread is divided by 5 because FIFO compaction
# style is quite a bit slower on reads with lot of files
additional_opts = {
"kill_random_test": None,
"ops_per_thread": cmd_params['ops_per_thread'] / 5,
"compaction_style": 2,
}
else:
# normal run
additional_opts = additional_opts = {
"kill_random_test": None,
"ops_per_thread": cmd_params['ops_per_thread'],
}
cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()))
print "Running:" + cmd + "\n"
popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
stdoutdata, stderrdata = popen.communicate()
retncode = popen.returncode
msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
check_mode, additional_opts['kill_random_test'], retncode))
print msg
print stdoutdata
expected = False
if additional_opts['kill_random_test'] is None and (retncode == 0):
# we expect zero retncode if no kill option
expected = True
elif additional_opts['kill_random_test'] is not None and retncode < 0:
# we expect negative retncode if kill option was given
expected = True
if not expected:
print "TEST FAILED. See kill option and exit code above!!!\n"
sys.exit(1)
stdoutdata = stdoutdata.lower()
errorcount = (stdoutdata.count('error') -
stdoutdata.count('got errors 0 times'))
print "#times error occurred in output is " + str(errorcount) + "\n"
if (errorcount > 0):
print "TEST FAILED. Output has 'error'!!!\n"
sys.exit(2)
if (stdoutdata.find('fail') >= 0):
print "TEST FAILED. Output has 'fail'!!!\n"
sys.exit(2)
# First half of the duration, keep doing kill test. For the next half,
# try different modes.
if time.time() > half_time:
# we need to clean up after ourselves -- only do this on test
# success
shutil.rmtree(dbname, True)
check_mode = (check_mode + 1) % total_check_mode
time.sleep(1) # time to stabilize after a kill
def main():
parser = argparse.ArgumentParser(description="This script runs and kills \
db_stress multiple times")
parser.add_argument("test_type", choices=["blackbox", "whitebox"])
parser.add_argument("--simple", action="store_true")
all_params = dict(default_params.items()
+ blackbox_default_params.items()
+ whitebox_default_params.items()
+ simple_default_params.items()
+ blackbox_simple_default_params.items()
+ whitebox_simple_default_params.items())
for k, v in all_params.items():
parser.add_argument("--" + k, type=type(v() if callable(v) else v))
args = parser.parse_args()
if args.test_type == 'blackbox':
blackbox_crash_main(args)
if args.test_type == 'whitebox':
whitebox_crash_main(args)
if __name__ == '__main__':
main()

View File

@ -1,248 +0,0 @@
#! /usr/bin/env python
import os
import re
import sys
import time
import random
import getopt
import logging
import tempfile
import subprocess
import shutil
# This python script runs db_stress multiple times. Some runs with
# kill_random_test that causes rocksdb to crash at various points in code.
def main(argv):
try:
opts, args = getopt.getopt(argv, "hsd:t:k:o:b:")
except getopt.GetoptError:
print str(getopt.GetoptError)
print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> "\
"-b <write_buffer_size> [-s (simple mode)]\n"
sys.exit(2)
# default values, will be overridden by cmdline args
kill_random_test = 97 # kill with probability 1/97 by default
duration = 10000 # total time for this script to test db_stress
threads = 32
ops_per_thread = 200000
write_buf_size = 4 * 1024 * 1024
simple_mode = False
write_buf_size_set = False
for opt, arg in opts:
if opt == '-h':
print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> " \
"-b <write_buffer_size> [-s (simple mode)]\n"
sys.exit()
elif opt == '-s':
simple_mode = True
if not write_buf_size_set:
write_buf_size = 32 * 1024 * 1024
elif opt == "-d":
duration = int(arg)
elif opt == "-t":
threads = int(arg)
elif opt == "-k":
kill_random_test = int(arg)
elif opt == "-o":
ops_per_thread = int(arg)
elif opt == "-b":
write_buf_size = int(arg)
write_buf_size_set = True
else:
print "unrecognized option " + str(opt) + "\n"
print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> " \
"-b <write_buffer_size>\n"
sys.exit(2)
cur_time = time.time()
exit_time = cur_time + duration
half_time = cur_time + duration / 2
print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+ "\nthreads=" + str(threads) + "\nops_per_thread=" \
+ str(ops_per_thread) + "\nwrite_buffer_size=" \
+ str(write_buf_size) + "\n"
total_check_mode = 4
check_mode = 0
kill_mode = 0
test_tmpdir = os.environ.get("TEST_TMPDIR")
if test_tmpdir is None or test_tmpdir == "":
dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest2_')
else:
dbname = test_tmpdir + "/rocksdb_crashtest2"
shutil.rmtree(dbname, True)
while time.time() < exit_time:
killoption = ""
if check_mode == 0:
# run with kill_random_test
if kill_mode == 0:
killoption = " --kill_random_test=" + str(kill_random_test)
elif kill_mode == 1:
# Remove kill point for normal reads and reduce kill odds
# by 3, so that it still runs about one minutes in average
# before hitting a crash point.
killoption = " --kill_random_test=" + \
str(kill_random_test / 3 + 1)
killoption += \
" --kill_prefix_blacklist=WritableFileWriter::Append," \
"WritableFileWriter::WriteBuffered"
# Run kill mode 0 and 1 by turn.
kill_mode = (kill_mode + 1) % 2
# use large ops per thread since we will kill it anyway
additional_opts = "--ops_per_thread=" + \
str(100 * ops_per_thread) + killoption
elif check_mode == 1:
# normal run with universal compaction mode
additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
" --compaction_style=1"
elif check_mode == 2:
# normal run with FIFO compaction mode
# ops_per_thread is divided by 5 because FIFO compaction
# style is quite a bit slower on reads with lot of files
additional_opts = "--ops_per_thread=" + str(ops_per_thread / 5) + \
" --compaction_style=2"
else:
# normal run
additional_opts = "--ops_per_thread=" + str(ops_per_thread)
if simple_mode:
cmd = re.sub('\s+', ' ', """
./db_stress
--column_families=1
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
--reopen=20
--prefixpercent=0
--readpercent=50
--writepercent=35
--delpercent=5
--iterpercent=10
--db=%s
--max_key=100000000
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=500000
--verify_checksum=1
--sync=0
--progress_reports=0
--disable_wal=0
--disable_data_sync=0
--target_file_size_base=16777216
--target_file_size_multiplier=1
--max_write_buffer_number=3
--max_background_compactions=1
--max_bytes_for_level_base=67108864
--filter_deletes=%s
--memtablerep=skip_list
--prefix_size=0
--nooverwritepercent=1
--log2_keys_per_lock=10
%s
""" % (threads,
write_buf_size,
dbname,
random.randint(0, 1),
random.randint(0, 1),
additional_opts))
else:
cmd = re.sub('\s+', ' ', """
./db_stress
--test_batches_snapshots=%s
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
--reopen=20
--readpercent=45
--prefixpercent=5
--writepercent=35
--delpercent=5
--iterpercent=10
--db=%s
--max_key=100000000
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=500000
--verify_checksum=1
--sync=0
--progress_reports=0
--disable_wal=0
--disable_data_sync=0
--target_file_size_base=2097152
--target_file_size_multiplier=2
--max_write_buffer_number=3
--max_background_compactions=20
--max_bytes_for_level_base=10485760
--filter_deletes=%s
--memtablerep=prefix_hash
--prefix_size=7
--nooverwritepercent=1
--log2_keys_per_lock=10
%s
""" % (random.randint(0, 1),
threads,
write_buf_size,
dbname,
random.randint(0, 1),
random.randint(0, 1),
additional_opts))
print "Running:" + cmd + "\n"
popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
stdoutdata, stderrdata = popen.communicate()
retncode = popen.returncode
msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
check_mode, killoption, retncode))
print msg
print stdoutdata
expected = False
if (killoption == '') and (retncode == 0):
# we expect zero retncode if no kill option
expected = True
elif killoption != '' and retncode < 0:
# we expect negative retncode if kill option was given
expected = True
if not expected:
print "TEST FAILED. See kill option and exit code above!!!\n"
sys.exit(1)
stdoutdata = stdoutdata.lower()
errorcount = (stdoutdata.count('error') -
stdoutdata.count('got errors 0 times'))
print "#times error occurred in output is " + str(errorcount) + "\n"
if (errorcount > 0):
print "TEST FAILED. Output has 'error'!!!\n"
sys.exit(2)
if (stdoutdata.find('fail') >= 0):
print "TEST FAILED. Output has 'fail'!!!\n"
sys.exit(2)
# First half of the duration, keep doing kill test. For the next half,
# try different modes.
if time.time() > half_time:
# we need to clean up after ourselves -- only do this on test
# success
shutil.rmtree(dbname, True)
check_mode = (check_mode + 1) % total_check_mode
time.sleep(1) # time to stabilize after a kill
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))