2013-04-05 22:44:59 +02:00
|
|
|
#! /usr/bin/env python
|
2013-03-13 07:20:14 +01:00
|
|
|
import os
|
2013-08-21 02:37:49 +02:00
|
|
|
import re
|
2013-03-13 07:20:14 +01:00
|
|
|
import sys
|
|
|
|
import time
|
2013-06-08 21:29:43 +02:00
|
|
|
import random
|
2013-03-13 07:20:14 +01:00
|
|
|
import getopt
|
|
|
|
import logging
|
2013-04-05 22:44:59 +02:00
|
|
|
import tempfile
|
2013-03-13 07:20:14 +01:00
|
|
|
import subprocess
|
2014-03-20 19:11:08 +01:00
|
|
|
import shutil
|
2013-03-13 07:20:14 +01:00
|
|
|
|
2013-08-21 02:37:49 +02:00
|
|
|
# This script runs and kills db_stress multiple times. It checks consistency
|
2014-03-20 19:11:08 +01:00
|
|
|
# in case of unsafe crashes in RocksDB.
|
2013-03-13 07:20:14 +01:00
|
|
|
|
|
|
|
# Usage text shared by every code path that prints help.
_USAGE = ("db_crashtest.py -d <duration_test>"
          " -t <#threads> -i <interval for one run>"
          " -o <ops_per_thread> -b <write_buffer_size>"
          " [-s (simple mode)]\n")

# db_stress command line used with -s (simple mode).  The six %s slots are,
# in order: ops_per_thread, threads, write_buffer_size, db, mmap_read,
# filter_deletes.  Whitespace is collapsed before the command is run.
_SIMPLE_CMD_TEMPLATE = """
    ./db_stress
    --column_families=1
    --test_batches_snapshots=0
    --ops_per_thread=%s
    --threads=%s
    --write_buffer_size=%s
    --destroy_db_initially=0
    --reopen=20
    --readpercent=50
    --prefixpercent=0
    --writepercent=35
    --delpercent=5
    --iterpercent=10
    --db=%s
    --max_key=100000000
    --mmap_read=%s
    --block_size=16384
    --cache_size=1048576
    --open_files=-1
    --verify_checksum=1
    --sync=0
    --progress_reports=0
    --disable_wal=0
    --disable_data_sync=1
    --target_file_size_base=16777216
    --target_file_size_multiplier=1
    --max_write_buffer_number=3
    --max_background_compactions=1
    --max_bytes_for_level_base=67108864
    --filter_deletes=%s
    --memtablerep=skip_list
    --prefix_size=0
    --set_options_one_in=0
    """

# db_stress command line used without -s.  Same six %s slots, same order.
_DEFAULT_CMD_TEMPLATE = """
    ./db_stress
    --test_batches_snapshots=1
    --ops_per_thread=%s
    --threads=%s
    --write_buffer_size=%s
    --destroy_db_initially=0
    --reopen=20
    --readpercent=45
    --prefixpercent=5
    --writepercent=35
    --delpercent=5
    --iterpercent=10
    --db=%s
    --max_key=100000000
    --mmap_read=%s
    --block_size=16384
    --cache_size=1048576
    --open_files=500000
    --verify_checksum=1
    --sync=0
    --progress_reports=0
    --disable_wal=0
    --disable_data_sync=1
    --target_file_size_base=2097152
    --target_file_size_multiplier=2
    --max_write_buffer_number=3
    --max_background_compactions=20
    --max_bytes_for_level_base=10485760
    --filter_deletes=%s
    --memtablerep=prefix_hash
    --prefix_size=7
    --set_options_one_in=10000
    """


def _build_db_stress_cmd(simple_mode, ops_per_thread, threads,
                         write_buf_size, dbname):
    """Return the single-line db_stress shell command for one run.

    --mmap_read and --filter_deletes are each picked at random (0 or 1)
    every time this is called.
    """
    template = _SIMPLE_CMD_TEMPLATE if simple_mode else _DEFAULT_CMD_TEMPLATE
    values = (ops_per_thread,
              threads,
              write_buf_size,
              dbname,
              random.randint(0, 1),   # --mmap_read
              random.randint(0, 1))   # --filter_deletes
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', template % values)


def _run_and_kill(cmd, interval):
    """Run `cmd` for up to `interval` seconds, kill it, and scan its stderr.

    Returns True if the child wrote any non-blank line to stderr.
    """
    killtime = time.time() + interval

    # universal_newlines=True makes stderr a text stream, so the lines read
    # below are str (not bytes) on both Python 2 and Python 3.
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=True,
                             universal_newlines=True)
    print("Running db_stress with pid=%d: %s\n\n"
          % (child.pid, cmd))

    # Poll once a second until the interval is over or the child exits.
    while time.time() < killtime and child.poll() is None:
        time.sleep(1)

    if child.poll() is not None:
        print("WARNING: db_stress ended before kill: exitcode=%d\n"
              % child.returncode)
    else:
        child.kill()
        print("KILLED %d\n" % child.pid)
        time.sleep(1)  # time to stabilize after a kill

    # communicate() reads stderr through EOF and waits for the child, so the
    # process is reaped and errors after a blank line are not missed.
    _, stderr_text = child.communicate()

    had_errors = False
    for raw_line in (stderr_text or '').splitlines():
        line = raw_line.strip()
        if line:
            had_errors = True
            print('***' + line + '^')
    return had_errors


def main(argv):
    """Repeatedly run and kill db_stress until the test duration elapses.

    Options parsed from `argv` (the command line without the program name):
      -h          print usage and exit
      -s          simple mode; also raises the default write buffer size to
                  32MB unless -b was given earlier on the command line
      -d <secs>   total duration of the test (default 6000)
      -t <n>      number of db_stress threads (default 32)
      -i <secs>   how long each db_stress instance runs before being
                  killed (default 120)
      -o <n>      ops per thread (default 100000000)
      -b <bytes>  write buffer size (default 4MB)

    The database lives in $TEST_TMPDIR/rocksdb_crashtest when TEST_TMPDIR is
    set and non-empty, otherwise in a fresh temporary directory.  It is
    removed only when the whole test finishes without errors.

    Exits with status 2 on a bad command line or as soon as a db_stress run
    writes anything to stderr.  Returns None on success.
    """
    try:
        opts, args = getopt.getopt(argv, "hsd:t:i:o:b:")
    except getopt.GetoptError:
        print(_USAGE)
        sys.exit(2)

    # default values, will be overridden by cmdline args
    interval = 120  # time for one db_stress instance to run
    duration = 6000  # total time for this script to test db_stress
    threads = 32
    # since we will be killing anyway, use large value for ops_per_thread
    ops_per_thread = 100000000
    write_buf_size = 4 * 1024 * 1024
    simple_mode = False
    write_buf_size_set = False

    for opt, arg in opts:
        if opt == '-h':
            print(_USAGE)
            sys.exit()
        elif opt == '-s':
            simple_mode = True
            if not write_buf_size_set:
                write_buf_size = 32 * 1024 * 1024
        elif opt == "-d":
            duration = int(arg)
        elif opt == "-t":
            threads = int(arg)
        elif opt == "-i":
            interval = int(arg)
        elif opt == "-o":
            ops_per_thread = int(arg)
        elif opt == "-b":
            write_buf_size = int(arg)
            write_buf_size_set = True
        else:
            print(_USAGE)
            sys.exit(2)

    exit_time = time.time() + duration

    print("Running blackbox-crash-test with \ninterval_between_crash="
          + str(interval) + "\ntotal-duration=" + str(duration)
          + "\nthreads=" + str(threads) + "\nops_per_thread="
          + str(ops_per_thread) + "\nwrite_buffer_size="
          + str(write_buf_size) + "\n")

    test_tmpdir = os.environ.get("TEST_TMPDIR")
    if not test_tmpdir:
        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
    else:
        dbname = os.path.join(test_tmpdir, "rocksdb_crashtest")
        # The path is reused between invocations; drop any stale database.
        shutil.rmtree(dbname, True)

    while time.time() < exit_time:
        cmd = _build_db_stress_cmd(simple_mode, ops_per_thread, threads,
                                   write_buf_size, dbname)
        if _run_and_kill(cmd, interval):
            # Leave the database in place so the failure can be inspected.
            sys.exit(2)

        time.sleep(1)  # time to stabilize before the next run

    # we need to clean up after ourselves -- only do this on test success
    shutil.rmtree(dbname, True)
|
|
|
|
|
2013-03-13 07:20:14 +01:00
|
|
|
# Script entry point: hand the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|