Add universal compaction to db_stress nightly build

Summary:
Most of the code changes in this diff are cleanup/rewrite. The logic changes are:

(1) add universal compaction to db_crashtest2.py
(2) randomly set --test_batches_snapshots to 0 or 1 in db_crashtest2.py. The old code always used 1.
(3) use a different tmp directory as the db directory for each run. I saw some intermittent errors in my local tests; using a different tmp directory for each run seems to solve the issue.

Test Plan: Ran "make crashtest" multiple times. Also ran "make all check".

Reviewers: emayanke, dhruba, haobo

Reviewed By: emayanke

Differential Revision: https://reviews.facebook.net/D12369
This commit is contained in:
Xing Jin 2013-08-20 17:37:49 -07:00
parent b87dcae1a3
commit af732c7ba8
2 changed files with 156 additions and 136 deletions

View File

@ -1,5 +1,6 @@
#! /usr/bin/env python #! /usr/bin/env python
import os import os
import re
import sys import sys
import time import time
import random import random
@ -8,18 +9,16 @@ import logging
import tempfile import tempfile
import subprocess import subprocess
# This python script runs and kills db_stress multiple times with # This script runs and kills db_stress multiple times. It checks consistency
# test-batches-snapshot ON, # in case of unsafe crashes in Rocksdb.
# total operations much less than the total keys, and
# a high read percentage.
# This checks consistency in case of unsafe crashes in Rocksdb
def main(argv): def main(argv):
try: try:
opts, args = getopt.getopt(argv, "hd:t:i:o:b:") opts, args = getopt.getopt(argv, "hd:t:i:o:b:")
except getopt.GetoptError: except getopt.GetoptError:
print("db_crashtest.py -d <duration_test> -t <#threads> " print("db_crashtest.py -d <duration_test> -t <#threads> "
"-i <interval for one run> -o <ops_per_thread>\n") "-i <interval for one run> -o <ops_per_thread> "
"-b <write_buffer_size>\n")
sys.exit(2) sys.exit(2)
# default values, will be overridden by cmdline args # default values, will be overridden by cmdline args
@ -36,15 +35,15 @@ def main(argv):
" -t <#threads> -i <interval for one run>" " -t <#threads> -i <interval for one run>"
" -o <ops_per_thread> -b <write_buffer_size>\n") " -o <ops_per_thread> -b <write_buffer_size>\n")
sys.exit() sys.exit()
elif opt == ("-d"): elif opt == "-d":
duration = int(arg) duration = int(arg)
elif opt == ("-t"): elif opt == "-t":
threads = int(arg) threads = int(arg)
elif opt == ("-i"): elif opt == "-i":
interval = int(arg) interval = int(arg)
elif opt == ("-o"): elif opt == "-o":
ops_per_thread = int(arg) ops_per_thread = int(arg)
elif opt == ("-b"): elif opt == "-b":
write_buf_size = int(arg) write_buf_size = int(arg)
else: else:
print("db_crashtest.py -d <duration_test>" print("db_crashtest.py -d <duration_test>"
@ -54,8 +53,6 @@ def main(argv):
exit_time = time.time() + duration exit_time = time.time() + duration
dirpath = tempfile.mkdtemp()
print("Running blackbox-crash-test with \ninterval_between_crash=" print("Running blackbox-crash-test with \ninterval_between_crash="
+ str(interval) + "\ntotal-duration=" + str(duration) + str(interval) + "\ntotal-duration=" + str(duration)
+ "\nthreads=" + str(threads) + "\nops_per_thread=" + "\nthreads=" + str(threads) + "\nops_per_thread="
@ -64,62 +61,75 @@ def main(argv):
while time.time() < exit_time: while time.time() < exit_time:
run_had_errors = False run_had_errors = False
additional_opts = ' --disable_seek_compaction=' + \
str(random.randint(0, 1)) + \
' --mmap_read=' + str(random.randint(0, 1)) + \
' --block_size=16384 ' + \
' --cache_size=1048576 ' + \
' --open_files=500000 ' + \
' --verify_checksum=1 ' + \
' --sync=' + str(random.randint(0, 1)) + \
' --disable_wal=0 ' + \
' --disable_data_sync=' + \
str(random.randint(0, 1)) + \
' --target_file_size_base=2097152 ' + \
' --target_file_size_multiplier=2 ' + \
' --max_write_buffer_number=3 ' + \
' --max_background_compactions=20 ' + \
' --max_bytes_for_level_base=10485760 ' + \
' --filter_deletes=' + str(random.randint(0, 1))
killtime = time.time() + interval killtime = time.time() + interval
child = subprocess.Popen(['./db_stress \
--test_batches_snapshots=1 \
--ops_per_thread=0' + str(ops_per_thread) + ' \
--threads=0' + str(threads) + ' \
--write_buffer_size=' + str(write_buf_size) + '\
--destroy_db_initially=0 \
--reopen=0 \
--readpercent=50 \
--prefixpercent=5 \
--writepercent=40 \
--delpercent=5 \
--db=' + dirpath + '\
--max_key=100000000 ' + additional_opts],
stderr=subprocess.PIPE, shell=True)
print("Running db_stress with pid=%d and additional options=\n"
% child.pid + additional_opts + "\n")
time.sleep(interval)
while True:
if time.time() > killtime:
if child.poll() is not None:
print("WARNING: db_stress ended before kill\n")
else:
child.kill()
print("KILLED %d\n" % child.pid)
time.sleep(1) # time to stabilize after a kill
while True: cmd = re.sub('\s+', ' ', """
line = child.stderr.readline().strip() ./db_stress
if line != '': --test_batches_snapshots=1
run_had_errors = True --ops_per_thread=%s
print('***' + line + '^') --threads=%s
else: --write_buffer_size=%s
break --destroy_db_initially=0
if run_had_errors: --reopen=0
sys.exit(2) --readpercent=50
--prefixpercent=5
--writepercent=40
--delpercent=5
--db=%s
--max_key=100000000
--disable_seek_compaction=%s
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=500000
--verify_checksum=1
--sync=%s
--disable_wal=0
--disable_data_sync=%s
--target_file_size_base=2097152
--target_file_size_multiplier=2
--max_write_buffer_number=3
--max_background_compactions=20
--max_bytes_for_level_base=10485760
--filter_deletes=%s
""" % (ops_per_thread,
threads,
write_buf_size,
tempfile.mkdtemp(),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1)))
child = subprocess.Popen([cmd],
stderr=subprocess.PIPE, shell=True)
print("Running db_stress with pid=%d: %s\n\n"
% (child.pid, cmd))
while time.time() < killtime:
time.sleep(10)
if child.poll() is not None:
print("WARNING: db_stress ended before kill: exitcode=%d\n"
% child.returncode)
else:
child.kill()
print("KILLED %d\n" % child.pid)
time.sleep(1) # time to stabilize after a kill
while True:
line = child.stderr.readline().strip()
if line != '':
run_had_errors = True
print('***' + line + '^')
else:
break break
time.sleep(1) # time to stabilize before the next run if run_had_errors:
sys.exit(2)
time.sleep(1) # time to stabilize before the next run
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(main(sys.argv[1:])) sys.exit(main(sys.argv[1:]))

View File

@ -1,5 +1,6 @@
#! /usr/bin/env python #! /usr/bin/env python
import os import os
import re
import sys import sys
import time import time
import random import random
@ -8,24 +9,22 @@ import logging
import tempfile import tempfile
import subprocess import subprocess
# This python script runs db_stress multiple times with kill_random_test # This python script runs db_stress multiple times. Some runs with
# that causes leveldb to crash at various points in code. # kill_random_test that causes leveldb to crash at various points in code.
# It also has test-batches-snapshot ON so that basic atomic/consistency
# checks can be performed.
#
def main(argv): def main(argv):
try: try:
opts, args = getopt.getopt(argv, "hd:t:k:o:b:") opts, args = getopt.getopt(argv, "hd:t:k:o:b:")
except getopt.GetoptError: except getopt.GetoptError:
print str(getopt.GetoptError) print str(getopt.GetoptError)
print "db_crashtest2.py -d <duration_test> -t <#threads> " \ print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> "\ "-k <kills with prob 1/k> -o <ops_per_thread> "\
"-b <write_buffer_size>\n" "-b <write_buffer_size>\n"
sys.exit(2) sys.exit(2)
# default values, will be overridden by cmdline args # default values, will be overridden by cmdline args
kill_random_test = 97 # kill with probability 1/97 by default kill_random_test = 97 # kill with probability 1/97 by default
duration = 6000 # total time for this script to test db_stress duration = 10000 # total time for this script to test db_stress
threads = 32 threads = 32
ops_per_thread = 200000 ops_per_thread = 200000
write_buf_size = 4 * 1024 * 1024 write_buf_size = 4 * 1024 * 1024
@ -33,93 +32,101 @@ def main(argv):
for opt, arg in opts: for opt, arg in opts:
if opt == '-h': if opt == '-h':
print "db_crashtest2.py -d <duration_test> -t <#threads> " \ print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> "\ "-k <kills with prob 1/k> -o <ops_per_thread> " \
"-b <write_buffer_size>\n" "-b <write_buffer_size>\n"
sys.exit() sys.exit()
elif opt == ("-d"): elif opt == "-d":
duration = int(arg) duration = int(arg)
elif opt == ("-t"): elif opt == "-t":
threads = int(arg) threads = int(arg)
elif opt == ("-k"): elif opt == "-k":
kill_random_test = int(arg) kill_random_test = int(arg)
elif opt == ("-i"): elif opt == "-o":
interval = int(arg)
elif opt == ("-o"):
ops_per_thread = int(arg) ops_per_thread = int(arg)
elif opt == ("-b"): elif opt == "-b":
write_buf_size = int(arg) write_buf_size = int(arg)
else: else:
print "unrecognized option " + str(opt) + "\n" print "unrecognized option " + str(opt) + "\n"
print "db_crashtest2.py -d <duration_test> -t <#threads> " \ print "db_crashtest2.py -d <duration_test> -t <#threads> " \
"-k <kills with prob 1/k> -o <ops_per_thread> " \ "-k <kills with prob 1/k> -o <ops_per_thread> " \
"-b <write_buffer_size>\n" "-b <write_buffer_size>\n"
sys.exit(2) sys.exit(2)
exit_time = time.time() + duration exit_time = time.time() + duration
dirpath = tempfile.mkdtemp() print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+ "\nthreads=" + str(threads) + "\nops_per_thread=" \
+ str(ops_per_thread) + "\nwrite_buffer_size=" \
+ str(write_buf_size) + "\n"
print("Running whitebox-crash-test with \ntotal-duration=" + str(duration) total_check_mode = 3
+ "\nthreads=" + str(threads) + "\nops_per_thread=" check_mode = 0
+ str(ops_per_thread) + "\nwrite_buffer_size="
+ str(write_buf_size) + "\n")
# kill in every alternate run. toggle tracks which run we are doing.
toggle = True
while time.time() < exit_time: while time.time() < exit_time:
run_had_errors = False killoption = ""
additional_opts = ' --disable_seek_compaction=' + \ if check_mode == 0:
str(random.randint(0, 1)) + \ # run with kill_random_test
' --mmap_read=' + str(random.randint(0, 1)) + \ killoption = " --kill_random_test=" + str(kill_random_test)
' --block_size=16384 ' + \ # use large ops per thread since we will kill it anyway
' --cache_size=1048576 ' + \ additional_opts = "--ops_per_thread=" + \
' --open_files=500000 ' + \ str(100 * ops_per_thread) + killoption
' --verify_checksum=1 ' + \ elif check_mode == 1:
' --sync=' + str(random.randint(0, 1)) + \ # normal run with universal compaction mode
' --disable_wal=0 ' + \ additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
' --disable_data_sync=' + \ " --compaction_style=1"
str(random.randint(0, 1)) + \
' --target_file_size_base=2097152 ' + \
' --target_file_size_multiplier=2 ' + \
' --max_write_buffer_number=3 ' + \
' --max_background_compactions=20 ' + \
' --max_bytes_for_level_base=10485760 ' + \
' --filter_deletes=' + str(random.randint(0, 1))
print ("Running db_stress with additional options=\n"
+ additional_opts + "\n")
if toggle:
# since we are going to kill anyway, use more ops per thread
new_ops_per_thread = 100 * ops_per_thread
killoption = '--kill_random_test=' + str(kill_random_test)
else: else:
new_ops_per_thread = ops_per_thread # normal run
killoption = '' additional_opts = "--ops_per_thread=" + str(ops_per_thread)
toggle = not toggle cmd = re.sub('\s+', ' ', """
./db_stress
--test_batches_snapshots=%s
--threads=%s
--write_buffer_size=%s
--destroy_db_initially=0
--reopen=0
--readpercent=50
--prefixpercent=5
--writepercent=40
--delpercent=5
--db=%s
--max_key=100000000
--disable_seek_compaction=%s
--mmap_read=%s
--block_size=16384
--cache_size=1048576
--open_files=500000
--verify_checksum=1
--sync=%s
--disable_wal=0
--disable_data_sync=%s
--target_file_size_base=2097152
--target_file_size_multiplier=2
--max_write_buffer_number=3
--max_background_compactions=20
--max_bytes_for_level_base=10485760
--filter_deletes=%s
%s
""" % (random.randint(0, 1),
threads,
write_buf_size,
tempfile.mkdtemp(),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1),
random.randint(0, 1),
additional_opts))
cmd = ['./db_stress \ print "Running:" + cmd + "\n"
--test_batches_snapshots=1 \
--ops_per_thread=0' + str(new_ops_per_thread) + ' \
--threads=0' + str(threads) + ' \
--write_buffer_size=' + str(write_buf_size) + ' \
--destroy_db_initially=0 ' + killoption + ' \
--reopen=0 \
--readpercent=50 \
--prefixpercent=5 \
--writepercent=40 \
--delpercent=5 \
--db=' + dirpath + ' \
--max_key=100000000 ' + additional_opts]
popen = subprocess.Popen(cmd, stdout=subprocess.PIPE, popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, stderr=subprocess.STDOUT,
shell=True) shell=True)
stdoutdata, stderrdata = popen.communicate() stdoutdata, stderrdata = popen.communicate()
retncode = popen.returncode retncode = popen.returncode
msg = ("kill option = {0}, exitcode = {1}".format( msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
killoption, retncode)) check_mode, killoption, retncode))
print msg print msg
print stdoutdata print stdoutdata
@ -146,6 +153,9 @@ def main(argv):
if (stdoutdata.find('fail') >= 0): if (stdoutdata.find('fail') >= 0):
print "TEST FAILED. Output has 'fail'!!!\n" print "TEST FAILED. Output has 'fail'!!!\n"
sys.exit(2) sys.exit(2)
check_mode = (check_mode + 1) % total_check_mode
time.sleep(1) # time to stabilize after a kill time.sleep(1) # time to stabilize after a kill
if __name__ == "__main__": if __name__ == "__main__":