rocksdb/db/write_thread.cc
Nathan Bronson b7198c3afe reduce db mutex contention for write batch groups
Summary:
This diff allows a Writer to join the next write batch group
without acquiring any locks. Waiting is performed via a per-Writer mutex,
so all of the non-leader writers never need to acquire the db mutex.
It is now possible to join a write batch group after the leader has been
chosen but before the batch has been constructed. This diff doesn't
increase parallelism, but reduces synchronization overheads.

For some CPU-bound workloads (no WAL, RAM-sized working set) this can
substantially reduce contention on the db mutex in a multi-threaded
environment.  With T=8 N=500000 in a CPU-bound scenario (see the test
plan) this is good for a 33% perf win.  Not all scenarios see such a
win, but none show a loss.  This code is slightly faster even for the
single-threaded case (about 2% for the CPU-bound scenario below).

Test Plan:
1. unit tests
2. COMPILE_WITH_TSAN=1 make check
3. stress high-contention scenarios with db_bench -benchmarks=fillrandom -threads=$T -batch_size=1 -memtablerep=skip_list -value_size=0 --num=$N -level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999 -disable_auto_compactions --max_write_buffer_number=8 -max_background_flushes=8 --disable_wal --write_buffer_size=160000000

Reviewers: sdong, igor, rven, ljin, yhchiang

Subscribers: dhruba

Differential Revision: https://reviews.facebook.net/D43887
2015-08-14 10:55:43 -07:00

201 lines
6.3 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/write_thread.h"
namespace rocksdb {
void WriteThread::Await(Writer* w) {
std::unique_lock<std::mutex> guard(w->JoinMutex());
w->JoinCV().wait(guard, [w] { return w->joined; });
}
void WriteThread::MarkJoined(Writer* w) {
std::lock_guard<std::mutex> guard(w->JoinMutex());
assert(!w->joined);
w->joined = true;
w->JoinCV().notify_one();
}
void WriteThread::LinkOne(Writer* w, bool* wait_needed) {
assert(!w->joined && !w->done);
Writer* writers = newest_writer_.load(std::memory_order_relaxed);
while (true) {
w->link_older = writers;
if (writers != nullptr) {
w->CreateMutex();
}
if (newest_writer_.compare_exchange_strong(writers, w)) {
// Success.
*wait_needed = (writers != nullptr);
return;
}
}
}
void WriteThread::CreateMissingNewerLinks(Writer* head) {
while (true) {
Writer* next = head->link_older;
if (next == nullptr || next->link_newer != nullptr) {
assert(next == nullptr || next->link_newer == head);
break;
}
next->link_newer = head;
head = next;
}
}
void WriteThread::JoinBatchGroup(Writer* w) {
assert(w->batch != nullptr);
bool wait_needed;
LinkOne(w, &wait_needed);
if (wait_needed) {
Await(w);
}
}
size_t WriteThread::EnterAsBatchGroupLeader(
Writer* leader, WriteThread::Writer** last_writer,
autovector<WriteBatch*>* write_batch_group) {
assert(leader->link_older == nullptr);
assert(leader->batch != nullptr);
size_t size = WriteBatchInternal::ByteSize(leader->batch);
write_batch_group->push_back(leader->batch);
// Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow
// down the small write too much.
size_t max_size = 1 << 20;
if (size <= (128 << 10)) {
max_size = size + (128 << 10);
}
*last_writer = leader;
if (leader->has_callback) {
// TODO(agiardullo:) Batching not currently supported as this write may
// fail if the callback function decides to abort this write.
return size;
}
Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
// This is safe regardless of any db mutex status of the caller. Previous
// calls to ExitAsGroupLeader either didn't call CreateMissingNewerLinks
// (they emptied the list and then we added ourself as leader) or had to
// explicitly wake up us (the list was non-empty when we added ourself,
// so we have already received our MarkJoined).
CreateMissingNewerLinks(newest_writer);
// Tricky. Iteration start (leader) is exclusive and finish
// (newest_writer) is inclusive. Iteration goes from old to new.
Writer* w = leader;
while (w != newest_writer) {
w = w->link_newer;
if (w->sync && !leader->sync) {
// Do not include a sync write into a batch handled by a non-sync write.
break;
}
if (!w->disableWAL && leader->disableWAL) {
// Do not include a write that needs WAL into a batch that has
// WAL disabled.
break;
}
if (w->has_callback) {
// Do not include writes which may be aborted if the callback does not
// succeed.
break;
}
if (w->batch == nullptr) {
// Do not include those writes with nullptr batch. Those are not writes,
// those are something else. They want to be alone
break;
}
size += WriteBatchInternal::ByteSize(w->batch);
if (size > max_size) {
// Do not make batch too big
break;
}
write_batch_group->push_back(w->batch);
w->in_batch_group = true;
*last_writer = w;
}
return size;
}
void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
Status status) {
assert(leader->link_older == nullptr);
Writer* head = newest_writer_.load(std::memory_order_acquire);
if (head != last_writer ||
!newest_writer_.compare_exchange_strong(head, nullptr)) {
// Either w wasn't the head during the load(), or it was the head
// during the load() but somebody else pushed onto the list before
// we did the compare_exchange_strong (causing it to fail). In the
// latter case compare_exchange_strong has the effect of re-reading
// its first param (head). No need to retry a failing CAS, because
// only a departing leader (which we are at the moment) can remove
// nodes from the list.
assert(head != last_writer);
// After walking link_older starting from head (if not already done)
// we will be able to traverse w->link_newer below. This function
// can only be called from an active leader, only a leader can
// clear newest_writer_, we didn't, and only a clear newest_writer_
// could cause the next leader to start their work without a call
// to MarkJoined, so we can definitely conclude that no other leader
// work is going on here (with or without db mutex).
CreateMissingNewerLinks(head);
assert(last_writer->link_newer->link_older == last_writer);
last_writer->link_newer->link_older = nullptr;
// Next leader didn't self-identify, because newest_writer_ wasn't
// nullptr when they enqueued (we were definitely enqueued before them
// and are still in the list). That means leader handoff occurs when
// we call MarkJoined
MarkJoined(last_writer->link_newer);
}
// else nobody else was waiting, although there might already be a new
// leader now
while (last_writer != leader) {
last_writer->status = status;
last_writer->done = true;
// We must read link_older before calling MarkJoined, because as
// soon as it is marked the other thread's AwaitJoined may return
// and deallocate the Writer.
auto next = last_writer->link_older;
MarkJoined(last_writer);
last_writer = next;
}
}
void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
assert(w->batch == nullptr);
bool wait_needed;
LinkOne(w, &wait_needed);
if (wait_needed) {
mu->Unlock();
Await(w);
mu->Lock();
}
}
void WriteThread::ExitUnbatched(Writer* w) {
Status dummy_status;
ExitAsBatchGroupLeader(w, w, dummy_status);
}
} // namespace rocksdb