rocksdb/db/db_write_test.cc

451 lines
15 KiB
C++
Raw Permalink Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include <atomic>
#include <fstream>
#include <memory>
#include <thread>
#include <vector>
#include "db/db_test_util.h"
#include "db/write_batch_internal.h"
#include "db/write_thread.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"
#include "utilities/fault_injection_env.h"
namespace ROCKSDB_NAMESPACE {
// Test variations of WriteImpl.
class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
public:
DBWriteTest() : DBTestBase("/db_write_test", /*env_do_fsync=*/true) {}
Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
void Open() { DBTestBase::Reopen(GetOptions()); }
};
// It is invalid to do sync write while disabling WAL.
TEST_P(DBWriteTest, SyncAndDisableWAL) {
WriteOptions write_options;
write_options.sync = true;
write_options.disableWAL = true;
ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
WriteBatch batch;
ASSERT_OK(batch.Put("foo", "bar"));
ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
}
TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
Options options = GetOptions();
options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
4;
std::vector<port::Thread> threads;
std::atomic<int> thread_num(0);
port::Mutex mutex;
port::CondVar cv(&mutex);
// Guarded by mutex
int writers = 0;
Reopen(options);
std::function<void()> write_slowdown_func = [&]() {
int a = thread_num.fetch_add(1);
std::string key = "foo" + std::to_string(a);
WriteOptions wo;
wo.no_slowdown = false;
dbfull()->Put(wo, key, "bar");
};
std::function<void()> write_no_slowdown_func = [&]() {
int a = thread_num.fetch_add(1);
std::string key = "foo" + std::to_string(a);
WriteOptions wo;
wo.no_slowdown = true;
dbfull()->Put(wo, key, "bar");
};
std::function<void(void*)> unblock_main_thread_func = [&](void*) {
mutex.Lock();
++writers;
cv.SignalAll();
mutex.Unlock();
};
// Create 3 L0 files and schedule 4th without waiting
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1",
"DBImpl::BackgroundCallFlush:start"},
{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2",
"DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"},
// Make compaction start wait for the write stall to be detected and
// implemented by a write group leader
{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3",
"BackgroundCallCompaction:0"}});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
// Schedule creation of 4th L0 file without waiting. This will seal the
// memtable and then wait for a sync point before writing the file. We need
// to do it this way because SwitchMemtable() needs to enter the
// write_thread
FlushOptions fopt;
fopt.wait = false;
dbfull()->Flush(fopt);
// Create a mix of slowdown/no_slowdown write threads
mutex.Lock();
// First leader
threads.emplace_back(write_slowdown_func);
while (writers != 1) {
cv.Wait();
}
// Second leader. Will stall writes
// Build a writers list with no slowdown in the middle:
// +-------------+
// | slowdown +<----+ newest
// +--+----------+
// |
// v
// +--+----------+
// | no slowdown |
// +--+----------+
// |
// v
// +--+----------+
// | slowdown +
// +-------------+
threads.emplace_back(write_slowdown_func);
while (writers != 2) {
cv.Wait();
}
threads.emplace_back(write_no_slowdown_func);
while (writers != 3) {
cv.Wait();
}
threads.emplace_back(write_slowdown_func);
while (writers != 4) {
cv.Wait();
}
mutex.Unlock();
TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1");
dbfull()->TEST_WaitForFlushMemTable(nullptr);
// This would have triggered a write stall. Unblock the write group leader
TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2");
// The leader is going to create missing newer links. When the leader
// finishes, the next leader is going to delay writes and fail writers with
// no_slowdown
TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3");
for (auto& t : threads) {
t.join();
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
Options options = GetOptions();
options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4;
std::vector<port::Thread> threads;
std::atomic<int> thread_num(0);
port::Mutex mutex;
port::CondVar cv(&mutex);
Reopen(options);
std::function<void()> write_slowdown_func = [&]() {
int a = thread_num.fetch_add(1);
std::string key = "foo" + std::to_string(a);
WriteOptions wo;
wo.no_slowdown = false;
dbfull()->Put(wo, key, "bar");
};
std::function<void()> write_no_slowdown_func = [&]() {
int a = thread_num.fetch_add(1);
std::string key = "foo" + std::to_string(a);
WriteOptions wo;
wo.no_slowdown = true;
dbfull()->Put(wo, key, "bar");
};
std::function<void(void *)> unblock_main_thread_func = [&](void *) {
mutex.Lock();
cv.SignalAll();
mutex.Unlock();
};
// Create 3 L0 files and schedule 4th without waiting
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
Flush();
Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar");
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
{{"DBWriteTest::WriteThreadHangOnWriteStall:1",
"DBImpl::BackgroundCallFlush:start"},
{"DBWriteTest::WriteThreadHangOnWriteStall:2",
"DBImpl::WriteImpl:BeforeLeaderEnters"},
// Make compaction start wait for the write stall to be detected and
// implemented by a write group leader
{"DBWriteTest::WriteThreadHangOnWriteStall:3",
"BackgroundCallCompaction:0"}});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
// Schedule creation of 4th L0 file without waiting. This will seal the
// memtable and then wait for a sync point before writing the file. We need
// to do it this way because SwitchMemtable() needs to enter the
// write_thread
FlushOptions fopt;
fopt.wait = false;
dbfull()->Flush(fopt);
// Create a mix of slowdown/no_slowdown write threads
mutex.Lock();
// First leader
threads.emplace_back(write_slowdown_func);
cv.Wait();
// Second leader. Will stall writes
threads.emplace_back(write_slowdown_func);
cv.Wait();
threads.emplace_back(write_no_slowdown_func);
cv.Wait();
threads.emplace_back(write_slowdown_func);
cv.Wait();
threads.emplace_back(write_no_slowdown_func);
cv.Wait();
threads.emplace_back(write_slowdown_func);
cv.Wait();
mutex.Unlock();
TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
dbfull()->TEST_WaitForFlushMemTable(nullptr);
// This would have triggered a write stall. Unblock the write group leader
TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
// The leader is going to create missing newer links. When the leader finishes,
// the next leader is going to delay writes and fail writers with no_slowdown
TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
for (auto& t : threads) {
t.join();
}
}
TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
constexpr int kNumThreads = 5;
std::unique_ptr<FaultInjectionTestEnv> mock_env(
new FaultInjectionTestEnv(Env::Default()));
Options options = GetOptions();
options.env = mock_env.get();
Reopen(options);
std::atomic<int> ready_count{0};
std::atomic<int> leader_count{0};
std::vector<port::Thread> threads;
mock_env->SetFilesystemActive(false);
// Wait until all threads linked to write threads, to make sure
// all threads join the same batch group.
SyncPoint::GetInstance()->SetCallBack(
"WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
ready_count++;
auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
if (w->state == WriteThread::STATE_GROUP_LEADER) {
leader_count++;
while (ready_count < kNumThreads) {
// busy waiting
}
}
});
SyncPoint::GetInstance()->EnableProcessing();
for (int i = 0; i < kNumThreads; i++) {
threads.push_back(port::Thread(
[&](int index) {
// All threads should fail.
auto res = Put("key" + ToString(index), "value");
if (options.manual_wal_flush) {
ASSERT_TRUE(res.ok());
// we should see fs error when we do the flush
// TSAN reports a false alarm for lock-order-inversion but Open and
// FlushWAL are not run concurrently. Disabling this until TSAN is
// fixed.
// res = dbfull()->FlushWAL(false);
// ASSERT_FALSE(res.ok());
} else {
ASSERT_FALSE(res.ok());
}
},
i));
}
for (int i = 0; i < kNumThreads; i++) {
threads[i].join();
}
ASSERT_EQ(1, leader_count);
// Close before mock_env destruct.
Close();
}
TEST_P(DBWriteTest, ManualWalFlushInEffect) {
Options options = GetOptions();
Reopen(options);
// try the 1st WAL created during open
ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
// try the 2nd wal created during SwitchWAL
dbfull()->TEST_SwitchWAL();
ASSERT_TRUE(Put("key" + ToString(0), "value").ok());
ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty());
}
TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
std::unique_ptr<FaultInjectionTestEnv> mock_env(
new FaultInjectionTestEnv(Env::Default()));
Options options = GetOptions();
options.env = mock_env.get();
Reopen(options);
for (int i = 0; i < 2; i++) {
// Forcibly fail WAL write for the first Put only. Subsequent Puts should
// fail due to read-only mode
mock_env->SetFilesystemActive(i != 0);
auto res = Put("key" + ToString(i), "value");
// TSAN reports a false alarm for lock-order-inversion but Open and
// FlushWAL are not run concurrently. Disabling this until TSAN is
// fixed.
/*
if (options.manual_wal_flush && i == 0) {
// even with manual_wal_flush the 2nd Put should return error because of
// the read-only mode
ASSERT_TRUE(res.ok());
// we should see fs error when we do the flush
res = dbfull()->FlushWAL(false);
}
*/
if (!options.manual_wal_flush) {
ASSERT_FALSE(res.ok());
}
}
// Close before mock_env destruct.
Close();
}
TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
Random rnd(301);
std::unique_ptr<FaultInjectionTestEnv> mock_env(
new FaultInjectionTestEnv(Env::Default()));
Options options = GetOptions();
options.env = mock_env.get();
options.writable_file_max_buffer_size = 4 * 1024 * 1024;
options.write_buffer_size = 3 * 512 * 1024;
options.wal_bytes_per_sync = 256 * 1024;
options.manual_wal_flush = true;
Reopen(options);
mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
Status s;
for (int i = 0; i < 4 * 512; ++i) {
s = Put(Key(i), rnd.RandomString(1024));
if (!s.ok()) {
break;
}
}
ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
mock_env->SetFilesystemActive(true);
// Close before mock_env destruct.
Close();
}
Expose DB methods to lock and unlock the WAL (#5146) Summary: Expose DB methods to lock and unlock the WAL. These methods are intended to use by MyRocks in order to obtain WAL coordinates in consistent way. Usage scenario is following: MySQL has performance_schema.log_status which provides information that enables a backup tool to copy the required log files without locking for the duration of copy. To populate this table MySQL does following: 1. Lock the binary log. Transactions are not allowed to commit now 2. Save the binary log coordinates 3. Walk through the storage engines and lock writes on each engine. For InnoDB, redo log is locked. For MyRocks, WAL should be locked. 4. Ask storage engines for their coordinates. InnoDB reports its current LSN and checkpoint LSN. MyRocks should report active WAL files names and sizes. 5. Release storage engine's locks 6. Unlock binary log Backup tool will then use this information to copy InnoDB, RocksDB and MySQL binary logs up to specified positions to end up with consistent DB state after restore. Currently, RocksDB allows to obtain the list of WAL files. Only missing bit is the method to lock the writes to WAL files. LockWAL method must flush the WAL in order for the reported size to be accurate (GetSortedWALFiles is using file system stat call to return the file size), also, since backup tool is going to copy the WAL, it is better to be flushed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5146 Differential Revision: D14815447 Pulled By: maysamyabandeh fbshipit-source-id: eec9535a6025229ed471119f19fe7b3d8ae888a3
2019-04-06 15:36:42 +02:00
// Test that db->LockWAL() flushes the WAL after locking.
TEST_P(DBWriteTest, LockWalInEffect) {
Options options = GetOptions();
Reopen(options);
// try the 1st WAL created during open
ASSERT_OK(Put("key" + ToString(0), "value"));
ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
ASSERT_OK(dbfull()->LockWAL());
ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
ASSERT_OK(dbfull()->UnlockWAL());
// try the 2nd wal created during SwitchWAL
dbfull()->TEST_SwitchWAL();
ASSERT_OK(Put("key" + ToString(0), "value"));
ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty());
ASSERT_OK(dbfull()->LockWAL());
ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false));
ASSERT_OK(dbfull()->UnlockWAL());
}
TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
Options options = GetOptions();
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
options.statistics->set_stats_level(StatsLevel::kAll);
Reopen(options);
std::string wal_key_prefix = "WAL_KEY_";
std::string no_wal_key_prefix = "K_";
// 100 KB value each for NO-WAL operation
std::string no_wal_value(1024 * 100, 'X');
// 1B value each for WAL operation
std::string wal_value = "0";
std::thread threads[10];
for (int t = 0; t < 10; t++) {
threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, no_wal_value, this] {
for(int i = 0; i < 10; i++) {
ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
write_option_disable.disableWAL = true;
ROCKSDB_NAMESPACE::WriteOptions write_option_default;
std::string no_wal_key = no_wal_key_prefix + std::to_string(t) +
"_" + std::to_string(i);
this->Put(no_wal_key, no_wal_value, write_option_disable);
std::string wal_key =
wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
this->Put(wal_key, wal_value, write_option_default);
dbfull()->SyncWAL();
}
return 0;
});
}
for (auto& t: threads) {
t.join();
}
uint64_t bytes_num = options.statistics->getTickerCount(
ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
// written WAL size should less than 100KB (even included HEADER & FOOTER overhead)
ASSERT_LE(bytes_num, 1024 * 100);
}
INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
testing::Values(DBTestBase::kDefault,
DBTestBase::kConcurrentWALWrites,
DBTestBase::kPipelinedWrite));
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}