rocksdb/db_stress_tool/db_stress_driver.cc
Andrew Kryczka 84228e21e8 Fix shutdown in db_stress with -test_batches_snapshots=1 (#9313)
Summary:
The `SharedState` constructor had an early return in case of
`-test_batches_snapshots=1`. This early return caused `num_bg_threads_`
to never be incremented. Consequently, the driver thread could cleanup
objects like the `SharedState` while BG threads were still running and
accessing it, leading to crash.

The fix is to move the logic for counting threads (both FG and BG) to
the place they are launched. That way we can be sure the counts are
consistent, at least for now.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9313

Test Plan:
below command used to fail, now it passes.

```
$ ./db_stress --db=./test-db/ --expected_values_dir=./test-db-expected/ --max_key=100000 --ops_per_thread=1000 --sync_fault_injection=1 --clear_column_family_one_in=0 --destroy_db_initially=0 -reopen=0 -test_batches_snapshots=1
```

Reviewed By: jay-zhuang

Differential Revision: D33198670

Pulled By: ajkr

fbshipit-source-id: 126592dc1eb31998bc8f82ffbf5a0d4eb8dec317
2021-12-17 17:31:40 -08:00

178 lines
5.2 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
#ifdef GFLAGS
#include "db_stress_tool/db_stress_common.h"
#include "utilities/fault_injection_fs.h"
namespace ROCKSDB_NAMESPACE {
void ThreadBody(void* v) {
ThreadState* thread = reinterpret_cast<ThreadState*>(v);
SharedState* shared = thread->shared;
if (!FLAGS_skip_verifydb && shared->ShouldVerifyAtBeginning()) {
thread->shared->GetStressTest()->VerifyDb(thread);
}
{
MutexLock l(shared->GetMutex());
shared->IncInitialized();
if (shared->AllInitialized()) {
shared->GetCondVar()->SignalAll();
}
while (!shared->Started()) {
shared->GetCondVar()->Wait();
}
}
thread->shared->GetStressTest()->OperateDb(thread);
{
MutexLock l(shared->GetMutex());
shared->IncOperated();
if (shared->AllOperated()) {
shared->GetCondVar()->SignalAll();
}
while (!shared->VerifyStarted()) {
shared->GetCondVar()->Wait();
}
}
if (!FLAGS_skip_verifydb) {
thread->shared->GetStressTest()->VerifyDb(thread);
}
{
MutexLock l(shared->GetMutex());
shared->IncDone();
if (shared->AllDone()) {
shared->GetCondVar()->SignalAll();
}
}
}
bool RunStressTest(StressTest* stress) {
SystemClock* clock = db_stress_env->GetSystemClock().get();
stress->InitDb();
SharedState shared(db_stress_env, stress);
stress->FinishInitDb(&shared);
#ifndef NDEBUG
if (FLAGS_sync_fault_injection) {
fault_fs_guard->SetFilesystemDirectWritable(false);
}
#endif
uint32_t n = FLAGS_threads;
uint64_t now = clock->NowMicros();
fprintf(stdout, "%s Initializing worker threads\n",
clock->TimeToString(now / 1000000).c_str());
std::vector<ThreadState*> threads(n);
for (uint32_t i = 0; i < n; i++) {
shared.IncThreads();
threads[i] = new ThreadState(i, &shared);
db_stress_env->StartThread(ThreadBody, threads[i]);
}
ThreadState bg_thread(0, &shared);
if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
shared.IncBgThreads();
db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread);
}
ThreadState continuous_verification_thread(0, &shared);
if (FLAGS_continuous_verification_interval > 0) {
shared.IncBgThreads();
db_stress_env->StartThread(DbVerificationThread,
&continuous_verification_thread);
}
// Each thread goes through the following states:
// initializing -> wait for others to init -> read/populate/depopulate
// wait for others to operate -> verify -> done
{
MutexLock l(shared.GetMutex());
while (!shared.AllInitialized()) {
shared.GetCondVar()->Wait();
}
if (shared.ShouldVerifyAtBeginning()) {
if (shared.HasVerificationFailedYet()) {
fprintf(stderr, "Crash-recovery verification failed :(\n");
} else {
fprintf(stdout, "Crash-recovery verification passed :)\n");
}
}
now = clock->NowMicros();
fprintf(stdout, "%s Starting database operations\n",
clock->TimeToString(now / 1000000).c_str());
shared.SetStart();
shared.GetCondVar()->SignalAll();
while (!shared.AllOperated()) {
shared.GetCondVar()->Wait();
}
now = clock->NowMicros();
if (FLAGS_test_batches_snapshots) {
fprintf(stdout, "%s Limited verification already done during gets\n",
clock->TimeToString((uint64_t)now / 1000000).c_str());
} else if (FLAGS_skip_verifydb) {
fprintf(stdout, "%s Verification skipped\n",
clock->TimeToString((uint64_t)now / 1000000).c_str());
} else {
fprintf(stdout, "%s Starting verification\n",
clock->TimeToString((uint64_t)now / 1000000).c_str());
}
shared.SetStartVerify();
shared.GetCondVar()->SignalAll();
while (!shared.AllDone()) {
shared.GetCondVar()->Wait();
}
}
for (unsigned int i = 1; i < n; i++) {
threads[0]->stats.Merge(threads[i]->stats);
}
threads[0]->stats.Report("Stress Test");
for (unsigned int i = 0; i < n; i++) {
delete threads[i];
threads[i] = nullptr;
}
now = clock->NowMicros();
if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots &&
!shared.HasVerificationFailedYet()) {
fprintf(stdout, "%s Verification successful\n",
clock->TimeToString(now / 1000000).c_str());
}
stress->PrintStatistics();
if (FLAGS_compaction_thread_pool_adjust_interval > 0 ||
FLAGS_continuous_verification_interval > 0) {
MutexLock l(shared.GetMutex());
shared.SetShouldStopBgThread();
while (!shared.BgThreadsFinished()) {
shared.GetCondVar()->Wait();
}
}
if (!stress->VerifySecondaries()) {
return false;
}
if (shared.HasVerificationFailedYet()) {
fprintf(stderr, "Verification failed :(\n");
return false;
}
return true;
}
} // namespace ROCKSDB_NAMESPACE
#endif // GFLAGS