More improvements to output for CircleCI (#9201)

Summary:
More follow-up to https://github.com/facebook/rocksdb/issues/9193 + https://github.com/facebook/rocksdb/issues/9188
* Even though we need to print ETA updates to avoid hitting the 10min
timeout, we need to avoid printing an update if there's no actual
progress, so that hung tests will timeout after 10 min rather than 5
hours.
* When there is a hung test, it's really annoying to track down which
test is hung, so if no progress is observed for 1 minute, we run ps once
to show what is running.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9201

Test Plan: manual and CircleCI

Reviewed By: jay-zhuang

Differential Revision: D32612028

Pulled By: pdillinger

fbshipit-source-id: 00f8ea70fc5fec9ede28ff74287d90fc73854aad
This commit is contained in:
Peter Dillinger 2021-11-23 22:09:06 -08:00
parent c48bae0968
commit 8e43279f93

View File

@ -1842,7 +1842,6 @@ sub start_another_job {
}
$opt::min_progress_interval = 0;
$opt::progress_sep = "\r";
sub init_progress {
# Uses:
@ -1852,7 +1851,6 @@ sub init_progress {
$|=1;
if (not $Global::is_terminal) {
$opt::min_progress_interval = 30;
$opt::progress_sep = "\n";
}
if($opt::bar) {
return("","");
@ -1878,7 +1876,9 @@ sub drain_job_queue {
}
my $last_header="";
my $sleep = 0.2;
my $last_left = 1000000000;
my $last_progress_time = 0;
my $ps_reported = 0;
do {
while($Global::total_running > 0) {
debug($Global::total_running, "==", scalar
@ -1889,15 +1889,38 @@ sub drain_job_queue {
close $job->fh(0,"w");
}
}
if($opt::progress and (time() - $last_progress_time) >= $opt::min_progress_interval) {
$last_progress_time = time();
# When not connected to terminal, assume CI (e.g. CircleCI). In
# that case we want occasional progress output to prevent abort
# due to timeout with no output, but we also need to stop sending
# progress output if there has been no actual progress, so that
# the job can time out appropriately (CirecleCI: 10m) in case of
# a hung test. But without special output, it is extremely
# annoying to diagnose which test is hung, so we add that using
# `ps` below.
if($opt::progress and
($Global::is_terminal or (time() - $last_progress_time) >= 30)) {
my %progress = progress();
if($last_header ne $progress{'header'}) {
print $Global::original_stderr "\n", $progress{'header'}, "\n";
$last_header = $progress{'header'};
}
print $Global::original_stderr $opt::progress_sep,$progress{'status'};
flush $Global::original_stderr;
if ($Global::is_terminal) {
print $Global::original_stderr "\r",$progress{'status'};
}
if ($last_left > $Global::left) {
if (not $Global::is_terminal) {
print $Global::original_stderr $progress{'status'},"\n";
}
$last_progress_time = time();
$ps_reported = 0;
} elsif (not $ps_reported and (time() - $last_progress_time) >= 60) {
# No progress in at least 60 seconds: run ps
print $Global::original_stderr "\n";
system("ps", "-wf");
$ps_reported = 1;
}
$last_left = $Global::left;
flush $Global::original_stderr;
}
if($Global::total_running < $Global::max_jobs_running
and not $Global::JobQueue->empty()) {
@ -1964,10 +1987,11 @@ sub progress {
my $eta = "";
my ($status,$header)=("","");
if($opt::eta) {
my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) =
compute_eta();
$eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ",
$this_eta, $left, $avgtime);
my($total, $completed, $left, $pctcomplete, $avgtime, $this_eta) =
compute_eta();
$eta = sprintf("ETA: %ds Left: %d AVG: %.2fs ",
$this_eta, $left, $avgtime);
$Global::left = $left;
}
my $termcols = terminal_columns();
my @workers = sort keys %Global::host;