Kill whitebox crash test if it is 15 minutes over the limit (#8341)
Summary: Whitebox crash test can run significantly over the time limit for test slowness or no kiling points. This indefinite job can create problem when this test is periodically scheduled as a job. Instead, kill the job if it is 15 minutes over the limit. Refactor the code slightly to consolidate the code for executing commands for white and black box tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8341 Test Plan: Run both of black and white box tests with both of natual and explicit kill condition. Reviewed By: jay-zhuang Differential Revision: D28756170 fbshipit-source-id: f253149890e62ace78f871be927e093e9b12f49b
This commit is contained in:
parent
d561af487c
commit
ab718b415f
@ -461,6 +461,25 @@ def inject_inconsistencies_to_db_dir(dir_path):
|
|||||||
with open(os.path.join(dir_path, fname), "w") as fd:
|
with open(os.path.join(dir_path, fname), "w") as fd:
|
||||||
fd.write("garbage")
|
fd.write("garbage")
|
||||||
|
|
||||||
|
def execute_cmd(cmd, timeout):
|
||||||
|
child = subprocess.Popen(cmd, stderr=subprocess.PIPE,
|
||||||
|
stdout=subprocess.PIPE)
|
||||||
|
print("Running db_stress with pid=%d: %s\n\n"
|
||||||
|
% (child.pid, ' '.join(cmd)))
|
||||||
|
|
||||||
|
try:
|
||||||
|
outs, errs = child.communicate(timeout=timeout)
|
||||||
|
hit_timeout = False
|
||||||
|
print("WARNING: db_stress ended before kill: exitcode=%d\n"
|
||||||
|
% child.returncode)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
hit_timeout = True
|
||||||
|
child.kill()
|
||||||
|
print("KILLED %d\n" % child.pid)
|
||||||
|
outs, errs = child.communicate()
|
||||||
|
|
||||||
|
return hit_timeout, child.returncode, outs.decode('utf-8'), errs.decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
# This script runs and kills db_stress multiple times. It checks consistency
|
# This script runs and kills db_stress multiple times. It checks consistency
|
||||||
# in case of unsafe crashes in RocksDB.
|
# in case of unsafe crashes in RocksDB.
|
||||||
@ -474,47 +493,26 @@ def blackbox_crash_main(args, unknown_args):
|
|||||||
+ "total-duration=" + str(cmd_params['duration']) + "\n")
|
+ "total-duration=" + str(cmd_params['duration']) + "\n")
|
||||||
|
|
||||||
while time.time() < exit_time:
|
while time.time() < exit_time:
|
||||||
run_had_errors = False
|
|
||||||
killtime = time.time() + cmd_params['interval']
|
|
||||||
|
|
||||||
cmd = gen_cmd(dict(
|
cmd = gen_cmd(dict(
|
||||||
list(cmd_params.items())
|
list(cmd_params.items())
|
||||||
+ list({'db': dbname}.items())), unknown_args)
|
+ list({'db': dbname}.items())), unknown_args)
|
||||||
|
|
||||||
child = subprocess.Popen(cmd, stderr=subprocess.PIPE)
|
hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params['interval'])
|
||||||
print("Running db_stress with pid=%d: %s\n\n"
|
|
||||||
% (child.pid, ' '.join(cmd)))
|
|
||||||
|
|
||||||
stop_early = False
|
if not hit_timeout:
|
||||||
while time.time() < killtime:
|
print('Exit Before Killing')
|
||||||
if child.poll() is not None:
|
print('stdout:')
|
||||||
print("WARNING: db_stress ended before kill: exitcode=%d\n"
|
print(outs)
|
||||||
% child.returncode)
|
print('stderr:')
|
||||||
stop_early = True
|
print(errs)
|
||||||
break
|
sys.exit(2)
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not stop_early:
|
for line in errs.split('\n'):
|
||||||
if child.poll() is not None:
|
if line != '' and not line.startswith('WARNING'):
|
||||||
print("WARNING: db_stress ended before kill: exitcode=%d\n"
|
|
||||||
% child.returncode)
|
|
||||||
else:
|
|
||||||
child.kill()
|
|
||||||
print("KILLED %d\n" % child.pid)
|
|
||||||
time.sleep(1) # time to stabilize after a kill
|
|
||||||
|
|
||||||
while True:
|
|
||||||
line = child.stderr.readline().strip().decode('utf-8')
|
|
||||||
if line == '':
|
|
||||||
break
|
|
||||||
elif not line.startswith('WARNING'):
|
|
||||||
run_had_errors = True
|
run_had_errors = True
|
||||||
print('stderr has error message:')
|
print('stderr has error message:')
|
||||||
print('***' + line + '***')
|
print('***' + line + '***')
|
||||||
|
|
||||||
if run_had_errors:
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
time.sleep(1) # time to stabilize before the next run
|
time.sleep(1) # time to stabilize before the next run
|
||||||
|
|
||||||
if args.test_best_efforts_recovery:
|
if args.test_best_efforts_recovery:
|
||||||
@ -614,14 +612,14 @@ def whitebox_crash_main(args, unknown_args):
|
|||||||
|
|
||||||
print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in
|
print("Running:" + ' '.join(cmd) + "\n") # noqa: E999 T25377293 Grandfathered in
|
||||||
|
|
||||||
popen = subprocess.Popen(cmd, stdout=subprocess.PIPE,
|
# If the running time is 15 minutes over the run time, explicit kill and
|
||||||
stderr=subprocess.PIPE)
|
# exit even if white box kill didn't hit. This is to guarantee run time
|
||||||
stdoutdata, stderrdata = popen.communicate()
|
# limit, as if it runs as a job, running too long will create problems
|
||||||
if stdoutdata:
|
# for job scheduling or execution.
|
||||||
stdoutdata = stdoutdata.decode('utf-8')
|
# TODO detect a hanging condition. The job might run too long as RocksDB
|
||||||
if stderrdata:
|
# hits a hanging bug.
|
||||||
stderrdata = stderrdata.decode('utf-8')
|
hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
|
||||||
retncode = popen.returncode
|
cmd, exit_time - time.time() + 900)
|
||||||
msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
|
msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
|
||||||
check_mode, additional_opts['kill_random_test'], retncode))
|
check_mode, additional_opts['kill_random_test'], retncode))
|
||||||
|
|
||||||
@ -629,6 +627,10 @@ def whitebox_crash_main(args, unknown_args):
|
|||||||
print(stdoutdata)
|
print(stdoutdata)
|
||||||
print(stderrdata)
|
print(stderrdata)
|
||||||
|
|
||||||
|
if hit_timeout:
|
||||||
|
print("Killing the run for running too long")
|
||||||
|
break
|
||||||
|
|
||||||
expected = False
|
expected = False
|
||||||
if additional_opts['kill_random_test'] is None and (retncode == 0):
|
if additional_opts['kill_random_test'] is None and (retncode == 0):
|
||||||
# we expect zero retncode if no kill option
|
# we expect zero retncode if no kill option
|
||||||
|
Loading…
x
Reference in New Issue
Block a user