Concurrent MultiGet using coroutines and async IO

This commit is contained in:
anand76 2022-05-08 20:51:42 -07:00
parent ebd85d1a8b
commit b8a5ef0890
16 changed files with 308 additions and 9 deletions

View File

@ -799,6 +799,7 @@ set(SOURCES
trace_replay/trace_record_result.cc
trace_replay/trace_record.cc
trace_replay/trace_replay.cc
util/async_file_reader.cc
util/cleanable.cc
util/coding.cc
util/compaction_job_stats_impl.cc

View File

@ -2352,7 +2352,7 @@ checkout_folly:
cd third-party/folly && git reset --hard 98b9b2c1124e99f50f9085ddee74ce32afffc665
@# A hack to remove boost dependency.
@# NOTE: this hack is not needed if using FBCODE compiler config
@#perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
# ---------------------------------------------------------------------------
# Build size testing

View File

@ -224,6 +224,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"trace_replay/trace_record_handler.cc",
"trace_replay/trace_record_result.cc",
"trace_replay/trace_replay.cc",
"util/async_file_reader.cc",
"util/build_version.cc",
"util/cleanable.cc",
"util/coding.cc",

View File

@ -2590,7 +2590,8 @@ Status DBImpl::MultiGetImpl(
? MultiGetContext::MAX_BATCH_SIZE
: keys_left;
MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
batch_size, snapshot, read_options);
batch_size, snapshot, read_options, GetFileSystem(),
stats_);
MultiGetRange range = ctx.GetMultiGetRange();
range.AddValueSize(curr_value_size);
bool lookup_current = false;

View File

@ -136,5 +136,5 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
}
CO_RETURN s;
}
} // ROCKSDB_NAMESPACE
} // namespace ROCKSDB_NAMESPACE
#endif

View File

@ -2258,7 +2258,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
if (mget_tasks.size() > 0) {
// Collect all results so far
std::vector<Status> statuses = folly::coro::blockingWait(
folly::coro::collectAllRange(std::move(mget_tasks)));
folly::coro::collectAllRange(std::move(mget_tasks))
.scheduleOn(&range->context()->executor()));
for (Status stat : statuses) {
if (!stat.ok()) {
s = stat;

View File

@ -150,5 +150,5 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
CO_RETURN s;
}
} // ROCKSDB_NAMESPACE
} // namespace ROCKSDB_NAMESPACE
#endif

View File

@ -543,6 +543,9 @@ enum Histograms : uint32_t {
// Number of prefetched bytes discarded by RocksDB.
PREFETCHED_BYTES_DISCARDED,
// Number of IOs issued in parallel in a MultiGet batch
MULTIGET_IO_BATCH_SIZE,
HISTOGRAM_ENUM_MAX,
};

View File

@ -286,7 +286,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
{ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
{POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
{PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
};
std::shared_ptr<Statistics> CreateDBStatistics() {

1
src.mk
View File

@ -211,6 +211,7 @@ LIB_SOURCES = \
trace_replay/trace_replay.cc \
trace_replay/block_cache_tracer.cc \
trace_replay/io_tracer.cc \
util/async_file_reader.cc \
util/build_version.cc \
util/cleanable.cc \
util/coding.cc \

View File

@ -3,6 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "util/async_file_reader.h"
#include "util/coro_utils.h"
#if defined(WITHOUT_COROUTINES) || \
@ -139,8 +140,13 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
IOOptions opts;
IOStatus s = file->PrepareIOOptions(options, opts);
if (s.ok()) {
#if defined(WITHOUT_COROUTINES)
s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), &direct_io_buf,
options.rate_limiter_priority);
#else // WITH_COROUTINES
co_await batch->context()->reader().MultiReadAsync(
file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf);
#endif // WITH_COROUTINES
}
if (!s.ok()) {
// Discard all the results in this batch if there is any time out

View File

@ -259,7 +259,8 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) {
for (auto& key_ctx : key_context) {
sorted_keys.emplace_back(&key_ctx);
}
MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions());
MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions(),
fs_.get(), nullptr);
// Execute MultiGet.
MultiGetContext::Range range = ctx.GetMultiGetRange();

View File

@ -14,8 +14,10 @@
#include "rocksdb/env.h"
#include "rocksdb/statistics.h"
#include "rocksdb/types.h"
#include "util/async_file_reader.h"
#include "util/autovector.h"
#include "util/math.h"
#include "util/single_thread_executor.h"
namespace ROCKSDB_NAMESPACE {
class GetContext;
@ -104,11 +106,20 @@ class MultiGetContext {
MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
size_t begin, size_t num_keys, SequenceNumber snapshot,
const ReadOptions& read_opts)
const ReadOptions& read_opts, FileSystem* fs,
Statistics* stats)
: num_keys_(num_keys),
value_mask_(0),
value_size_(0),
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) {
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf))
#if USE_COROUTINES
,
reader_(fs, stats),
executor_(reader_)
#endif // USE_COROUTINES
{
(void)fs;
(void)stats;
assert(num_keys <= MAX_BATCH_SIZE);
if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
@ -135,6 +146,12 @@ class MultiGetContext {
}
}
#if USE_COROUTINES
SingleThreadExecutor& executor() { return executor_; }
AsyncFileReader& reader() { return reader_; }
#endif // USE_COROUTINES
private:
static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
alignas(alignof(LookupKey))
@ -145,6 +162,10 @@ class MultiGetContext {
uint64_t value_size_;
std::unique_ptr<char[]> lookup_key_heap_buf;
LookupKey* lookup_key_ptr_;
#if USE_COROUTINES
AsyncFileReader reader_;
SingleThreadExecutor executor_;
#endif // USE_COROUTINES
public:
// MultiGetContext::Range - Specifies a range of keys, by start and end index,
@ -267,6 +288,8 @@ class MultiGetContext {
void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; }
MultiGetContext* context() const { return ctx_; }
private:
friend MultiGetContext;
MultiGetContext* ctx_;

68
util/async_file_reader.cc Normal file
View File

@ -0,0 +1,68 @@
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#if USE_COROUTINES
#include "util/async_file_reader.h"
namespace ROCKSDB_NAMESPACE {
bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
if (tail_) {
tail_->next_ = awaiter;
}
tail_ = awaiter;
if (!head_) {
head_ = awaiter;
}
num_reqs_ += awaiter->num_reqs_;
awaiter->io_handle_.resize(awaiter->num_reqs_);
awaiter->del_fn_.resize(awaiter->num_reqs_);
for (size_t i = 0; i < awaiter->num_reqs_; ++i) {
awaiter->file_
->ReadAsync(
awaiter->read_reqs_[i], awaiter->opts_,
[](const FSReadRequest& req, void* cb_arg) {
FSReadRequest* read_req = static_cast<FSReadRequest*>(cb_arg);
read_req->status = req.status;
read_req->result = req.result;
},
&awaiter->read_reqs_[i], &awaiter->io_handle_[i],
&awaiter->del_fn_[i], Env::IOPriority::IO_TOTAL)
.PermitUncheckedError();
}
return true;
}
void AsyncFileReader::Poll() {
if (!head_) {
return;
}
ReadAwaiter* waiter;
std::vector<void*> io_handles;
io_handles.reserve(num_reqs_);
waiter = head_;
do {
for (size_t i = 0; i < waiter->num_reqs_; ++i) {
io_handles.push_back(waiter->io_handle_[i]);
}
} while (waiter != tail_ && (waiter = waiter->next_));
{
StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS);
fs_->Poll(io_handles, io_handles.size()).PermitUncheckedError();
}
do {
waiter = head_;
head_ = waiter->next_;
for (size_t i = 0; i < waiter->num_reqs_; ++i) {
waiter->del_fn_[i](waiter->io_handle_[i]);
}
waiter->awaiting_coro_.resume();
} while (waiter != tail_);
head_ = tail_ = nullptr;
RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_reqs_);
num_reqs_ = 0;
}
} // namespace ROCKSDB_NAMESPACE
#endif // USE_COROUTINES

138
util/async_file_reader.h Normal file
View File

@ -0,0 +1,138 @@
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once
#if USE_COROUTINES
#include "file/random_access_file_reader.h"
#include "folly/experimental/coro/ViaIfAsync.h"
#include "port/port.h"
#include "rocksdb/file_system.h"
#include "rocksdb/statistics.h"
#include "util/autovector.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
class SingleThreadExecutor;
// AsyncFileReader implements the Awaitable concept, which allows calling
// coroutines to co_await it. When the AsyncFileReader Awaitable is
// resumed, it initiates the fie reads requested by the awaiting caller
// by calling RandomAccessFileReader's ReadAsync. It then suspends the
// awaiting coroutine. The suspended awaiter is later resumed by Poll().
class AsyncFileReader {
class ReadAwaiter;
template <typename Awaiter>
class ReadOperation;
public:
AsyncFileReader(FileSystem* fs, Statistics* stats) : fs_(fs), stats_(stats) {}
~AsyncFileReader() {}
ReadOperation<ReadAwaiter> MultiReadAsync(RandomAccessFileReader* file,
const IOOptions& opts,
FSReadRequest* read_reqs,
size_t num_reqs,
AlignedBuf* aligned_buf) noexcept {
return ReadOperation<ReadAwaiter>{*this, file, opts,
read_reqs, num_reqs, aligned_buf};
}
private:
friend SingleThreadExecutor;
// Implementation of the Awaitable concept
class ReadAwaiter {
public:
explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file,
const IOOptions& opts, FSReadRequest* read_reqs,
size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept
: reader_(reader),
file_(file),
opts_(opts),
read_reqs_(read_reqs),
num_reqs_(num_reqs) {}
bool await_ready() noexcept { return false; }
// A return value of true means suspend the awaiter (calling coroutine). The
// awaiting_coro parameter is the handle of the awaiter. The handle can be
// resumed later, so we cache it here.
bool await_suspend(
std::experimental::coroutine_handle<> awaiting_coro) noexcept {
awaiting_coro_ = awaiting_coro;
// MultiReadAsyncImpl always returns true, so caller will be suspended
return reader_.MultiReadAsyncImpl(this);
}
void await_resume() noexcept {}
private:
friend AsyncFileReader;
// The parameters passed to MultiReadAsync are cached here when the caller
// calls MultiReadAsync. Later, when the execution of this awaitable is
// started, these are used to do the actual IO
AsyncFileReader& reader_;
RandomAccessFileReader* file_;
const IOOptions& opts_;
FSReadRequest* read_reqs_;
size_t num_reqs_;
autovector<void*, 32> io_handle_;
autovector<IOHandleDeleter, 32> del_fn_;
std::experimental::coroutine_handle<> awaiting_coro_;
ReadAwaiter* next_;
};
// An instance of ReadOperation is returned to the caller of MultiGetAsync.
// This represents an awaitable that can be started later.
template <typename Awaiter>
class ReadOperation {
public:
explicit ReadOperation(AsyncFileReader& reader,
RandomAccessFileReader* file, const IOOptions& opts,
FSReadRequest* read_reqs, size_t num_reqs,
AlignedBuf* aligned_buf) noexcept
: reader_(reader),
file_(file),
opts_(opts),
read_reqs_(read_reqs),
num_reqs_(num_reqs),
aligned_buf_(aligned_buf) {}
auto viaIfAsync(folly::Executor::KeepAlive<> executor) const {
return folly::coro::co_viaIfAsync(
std::move(executor),
Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_});
}
private:
AsyncFileReader& reader_;
RandomAccessFileReader* file_;
const IOOptions& opts_;
FSReadRequest* read_reqs_;
size_t num_reqs_;
AlignedBuf* aligned_buf_;
};
// This function does the actual work when this awaitable starts execution
bool MultiReadAsyncImpl(ReadAwaiter* awaiter);
// Called by the SingleThreadExecutor to poll for async IO completion.
// This also resumes the awaiting coroutines.
void Poll();
// Head of the queue of awaiters waiting for async IO completion
ReadAwaiter* head_ = nullptr;
// Tail of the awaiter queue
ReadAwaiter* tail_ = nullptr;
// Total number of pending async IOs
size_t num_reqs_ = 0;
FileSystem* fs_;
Statistics* stats_;
};
} // namespace ROCKSDB_NAMESPACE
#endif // USE_COROUTINES

View File

@ -0,0 +1,55 @@
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once
#if USE_COROUTINES
#include <atomic>
#include "folly/CPortability.h"
#include "folly/CppAttributes.h"
#include "folly/Executor.h"
#include "util/async_file_reader.h"
namespace ROCKSDB_NAMESPACE {
// Implements a simple executor that runs callback functions in the same
// thread, unlike CPUThreadExecutor which may schedule the callback on
// another thread. Runs in a tight loop calling the queued callbacks,
// and polls for async IO completions when idle. The completions will
// resume suspended coroutines and they get added to the queue, which
// will get picked up by this loop.
// Any possibility of deadlock is precluded because the file system
// guarantees that async IO completion callbacks will not be scheduled
// to run in this thread or this executor.
class SingleThreadExecutor : public folly::Executor {
public:
explicit SingleThreadExecutor(AsyncFileReader& reader)
: reader_(reader), busy_(false) {}
void add(folly::Func callback) override {
auto& q = q_;
q.push(std::move(callback));
if (q.size() == 1 && !busy_) {
while (!q.empty()) {
q.front()();
q.pop();
if (q.empty()) {
// Prevent recursion, as the Poll may queue resumed coroutines
busy_ = true;
reader_.Poll();
busy_ = false;
}
}
}
}
private:
std::queue<folly::Func> q_;
AsyncFileReader& reader_;
bool busy_;
};
} // namespace ROCKSDB_NAMESPACE
#endif // USE_COROUTINES