Concurrent MultiGet using coroutines and async IO
This commit is contained in:
parent
ebd85d1a8b
commit
b8a5ef0890
@ -799,6 +799,7 @@ set(SOURCES
|
||||
trace_replay/trace_record_result.cc
|
||||
trace_replay/trace_record.cc
|
||||
trace_replay/trace_replay.cc
|
||||
util/async_file_reader.cc
|
||||
util/cleanable.cc
|
||||
util/coding.cc
|
||||
util/compaction_job_stats_impl.cc
|
||||
|
2
Makefile
2
Makefile
@ -2352,7 +2352,7 @@ checkout_folly:
|
||||
cd third-party/folly && git reset --hard 98b9b2c1124e99f50f9085ddee74ce32afffc665
|
||||
@# A hack to remove boost dependency.
|
||||
@# NOTE: this hack is not needed if using FBCODE compiler config
|
||||
@#perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
|
||||
perl -pi -e 's/^(#include <boost)/\/\/$$1/' third-party/folly/folly/functional/Invoke.h
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Build size testing
|
||||
|
1
TARGETS
1
TARGETS
@ -224,6 +224,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
|
||||
"trace_replay/trace_record_handler.cc",
|
||||
"trace_replay/trace_record_result.cc",
|
||||
"trace_replay/trace_replay.cc",
|
||||
"util/async_file_reader.cc",
|
||||
"util/build_version.cc",
|
||||
"util/cleanable.cc",
|
||||
"util/coding.cc",
|
||||
|
@ -2590,7 +2590,8 @@ Status DBImpl::MultiGetImpl(
|
||||
? MultiGetContext::MAX_BATCH_SIZE
|
||||
: keys_left;
|
||||
MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
|
||||
batch_size, snapshot, read_options);
|
||||
batch_size, snapshot, read_options, GetFileSystem(),
|
||||
stats_);
|
||||
MultiGetRange range = ctx.GetMultiGetRange();
|
||||
range.AddValueSize(curr_value_size);
|
||||
bool lookup_current = false;
|
||||
|
@ -136,5 +136,5 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
|
||||
}
|
||||
CO_RETURN s;
|
||||
}
|
||||
} // ROCKSDB_NAMESPACE
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif
|
||||
|
@ -2258,7 +2258,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
||||
if (mget_tasks.size() > 0) {
|
||||
// Collect all results so far
|
||||
std::vector<Status> statuses = folly::coro::blockingWait(
|
||||
folly::coro::collectAllRange(std::move(mget_tasks)));
|
||||
folly::coro::collectAllRange(std::move(mget_tasks))
|
||||
.scheduleOn(&range->context()->executor()));
|
||||
for (Status stat : statuses) {
|
||||
if (!stat.ok()) {
|
||||
s = stat;
|
||||
|
@ -150,5 +150,5 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
|
||||
RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
|
||||
CO_RETURN s;
|
||||
}
|
||||
} // ROCKSDB_NAMESPACE
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif
|
||||
|
@ -543,6 +543,9 @@ enum Histograms : uint32_t {
|
||||
// Number of prefetched bytes discarded by RocksDB.
|
||||
PREFETCHED_BYTES_DISCARDED,
|
||||
|
||||
// Number of IOs issued in parallel in a MultiGet batch
|
||||
MULTIGET_IO_BATCH_SIZE,
|
||||
|
||||
HISTOGRAM_ENUM_MAX,
|
||||
};
|
||||
|
||||
|
@ -286,7 +286,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
||||
{ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
|
||||
{POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
|
||||
{PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
|
||||
|
||||
{MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"},
|
||||
};
|
||||
|
||||
std::shared_ptr<Statistics> CreateDBStatistics() {
|
||||
|
1
src.mk
1
src.mk
@ -211,6 +211,7 @@ LIB_SOURCES = \
|
||||
trace_replay/trace_replay.cc \
|
||||
trace_replay/block_cache_tracer.cc \
|
||||
trace_replay/io_tracer.cc \
|
||||
util/async_file_reader.cc \
|
||||
util/build_version.cc \
|
||||
util/cleanable.cc \
|
||||
util/coding.cc \
|
||||
|
@ -3,6 +3,7 @@
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
|
||||
#include "util/async_file_reader.h"
|
||||
#include "util/coro_utils.h"
|
||||
|
||||
#if defined(WITHOUT_COROUTINES) || \
|
||||
@ -139,8 +140,13 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
|
||||
IOOptions opts;
|
||||
IOStatus s = file->PrepareIOOptions(options, opts);
|
||||
if (s.ok()) {
|
||||
#if defined(WITHOUT_COROUTINES)
|
||||
s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), &direct_io_buf,
|
||||
options.rate_limiter_priority);
|
||||
#else // WITH_COROUTINES
|
||||
co_await batch->context()->reader().MultiReadAsync(
|
||||
file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf);
|
||||
#endif // WITH_COROUTINES
|
||||
}
|
||||
if (!s.ok()) {
|
||||
// Discard all the results in this batch if there is any time out
|
||||
|
@ -259,7 +259,8 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) {
|
||||
for (auto& key_ctx : key_context) {
|
||||
sorted_keys.emplace_back(&key_ctx);
|
||||
}
|
||||
MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions());
|
||||
MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions(),
|
||||
fs_.get(), nullptr);
|
||||
|
||||
// Execute MultiGet.
|
||||
MultiGetContext::Range range = ctx.GetMultiGetRange();
|
||||
|
@ -14,8 +14,10 @@
|
||||
#include "rocksdb/env.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/types.h"
|
||||
#include "util/async_file_reader.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/math.h"
|
||||
#include "util/single_thread_executor.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
class GetContext;
|
||||
@ -104,11 +106,20 @@ class MultiGetContext {
|
||||
|
||||
MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
|
||||
size_t begin, size_t num_keys, SequenceNumber snapshot,
|
||||
const ReadOptions& read_opts)
|
||||
const ReadOptions& read_opts, FileSystem* fs,
|
||||
Statistics* stats)
|
||||
: num_keys_(num_keys),
|
||||
value_mask_(0),
|
||||
value_size_(0),
|
||||
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf)) {
|
||||
lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf))
|
||||
#if USE_COROUTINES
|
||||
,
|
||||
reader_(fs, stats),
|
||||
executor_(reader_)
|
||||
#endif // USE_COROUTINES
|
||||
{
|
||||
(void)fs;
|
||||
(void)stats;
|
||||
assert(num_keys <= MAX_BATCH_SIZE);
|
||||
if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
|
||||
lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
|
||||
@ -135,6 +146,12 @@ class MultiGetContext {
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_COROUTINES
|
||||
SingleThreadExecutor& executor() { return executor_; }
|
||||
|
||||
AsyncFileReader& reader() { return reader_; }
|
||||
#endif // USE_COROUTINES
|
||||
|
||||
private:
|
||||
static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
|
||||
alignas(alignof(LookupKey))
|
||||
@ -145,6 +162,10 @@ class MultiGetContext {
|
||||
uint64_t value_size_;
|
||||
std::unique_ptr<char[]> lookup_key_heap_buf;
|
||||
LookupKey* lookup_key_ptr_;
|
||||
#if USE_COROUTINES
|
||||
AsyncFileReader reader_;
|
||||
SingleThreadExecutor executor_;
|
||||
#endif // USE_COROUTINES
|
||||
|
||||
public:
|
||||
// MultiGetContext::Range - Specifies a range of keys, by start and end index,
|
||||
@ -267,6 +288,8 @@ class MultiGetContext {
|
||||
|
||||
void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; }
|
||||
|
||||
MultiGetContext* context() const { return ctx_; }
|
||||
|
||||
private:
|
||||
friend MultiGetContext;
|
||||
MultiGetContext* ctx_;
|
||||
|
68
util/async_file_reader.cc
Normal file
68
util/async_file_reader.cc
Normal file
@ -0,0 +1,68 @@
|
||||
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
#if USE_COROUTINES
|
||||
#include "util/async_file_reader.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
|
||||
if (tail_) {
|
||||
tail_->next_ = awaiter;
|
||||
}
|
||||
tail_ = awaiter;
|
||||
if (!head_) {
|
||||
head_ = awaiter;
|
||||
}
|
||||
num_reqs_ += awaiter->num_reqs_;
|
||||
awaiter->io_handle_.resize(awaiter->num_reqs_);
|
||||
awaiter->del_fn_.resize(awaiter->num_reqs_);
|
||||
for (size_t i = 0; i < awaiter->num_reqs_; ++i) {
|
||||
awaiter->file_
|
||||
->ReadAsync(
|
||||
awaiter->read_reqs_[i], awaiter->opts_,
|
||||
[](const FSReadRequest& req, void* cb_arg) {
|
||||
FSReadRequest* read_req = static_cast<FSReadRequest*>(cb_arg);
|
||||
read_req->status = req.status;
|
||||
read_req->result = req.result;
|
||||
},
|
||||
&awaiter->read_reqs_[i], &awaiter->io_handle_[i],
|
||||
&awaiter->del_fn_[i], Env::IOPriority::IO_TOTAL)
|
||||
.PermitUncheckedError();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void AsyncFileReader::Poll() {
|
||||
if (!head_) {
|
||||
return;
|
||||
}
|
||||
ReadAwaiter* waiter;
|
||||
std::vector<void*> io_handles;
|
||||
io_handles.reserve(num_reqs_);
|
||||
waiter = head_;
|
||||
do {
|
||||
for (size_t i = 0; i < waiter->num_reqs_; ++i) {
|
||||
io_handles.push_back(waiter->io_handle_[i]);
|
||||
}
|
||||
} while (waiter != tail_ && (waiter = waiter->next_));
|
||||
{
|
||||
StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS);
|
||||
fs_->Poll(io_handles, io_handles.size()).PermitUncheckedError();
|
||||
}
|
||||
do {
|
||||
waiter = head_;
|
||||
head_ = waiter->next_;
|
||||
|
||||
for (size_t i = 0; i < waiter->num_reqs_; ++i) {
|
||||
waiter->del_fn_[i](waiter->io_handle_[i]);
|
||||
}
|
||||
waiter->awaiting_coro_.resume();
|
||||
} while (waiter != tail_);
|
||||
head_ = tail_ = nullptr;
|
||||
RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_reqs_);
|
||||
num_reqs_ = 0;
|
||||
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif // USE_COROUTINES
|
138
util/async_file_reader.h
Normal file
138
util/async_file_reader.h
Normal file
@ -0,0 +1,138 @@
|
||||
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#if USE_COROUTINES
|
||||
#include "file/random_access_file_reader.h"
|
||||
#include "folly/experimental/coro/ViaIfAsync.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/file_system.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "util/autovector.h"
|
||||
#include "util/stop_watch.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
class SingleThreadExecutor;
|
||||
|
||||
// AsyncFileReader implements the Awaitable concept, which allows calling
|
||||
// coroutines to co_await it. When the AsyncFileReader Awaitable is
|
||||
// resumed, it initiates the fie reads requested by the awaiting caller
|
||||
// by calling RandomAccessFileReader's ReadAsync. It then suspends the
|
||||
// awaiting coroutine. The suspended awaiter is later resumed by Poll().
|
||||
class AsyncFileReader {
|
||||
class ReadAwaiter;
|
||||
template <typename Awaiter>
|
||||
class ReadOperation;
|
||||
|
||||
public:
|
||||
AsyncFileReader(FileSystem* fs, Statistics* stats) : fs_(fs), stats_(stats) {}
|
||||
|
||||
~AsyncFileReader() {}
|
||||
|
||||
ReadOperation<ReadAwaiter> MultiReadAsync(RandomAccessFileReader* file,
|
||||
const IOOptions& opts,
|
||||
FSReadRequest* read_reqs,
|
||||
size_t num_reqs,
|
||||
AlignedBuf* aligned_buf) noexcept {
|
||||
return ReadOperation<ReadAwaiter>{*this, file, opts,
|
||||
read_reqs, num_reqs, aligned_buf};
|
||||
}
|
||||
|
||||
private:
|
||||
friend SingleThreadExecutor;
|
||||
|
||||
// Implementation of the Awaitable concept
|
||||
class ReadAwaiter {
|
||||
public:
|
||||
explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file,
|
||||
const IOOptions& opts, FSReadRequest* read_reqs,
|
||||
size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept
|
||||
: reader_(reader),
|
||||
file_(file),
|
||||
opts_(opts),
|
||||
read_reqs_(read_reqs),
|
||||
num_reqs_(num_reqs) {}
|
||||
|
||||
bool await_ready() noexcept { return false; }
|
||||
|
||||
// A return value of true means suspend the awaiter (calling coroutine). The
|
||||
// awaiting_coro parameter is the handle of the awaiter. The handle can be
|
||||
// resumed later, so we cache it here.
|
||||
bool await_suspend(
|
||||
std::experimental::coroutine_handle<> awaiting_coro) noexcept {
|
||||
awaiting_coro_ = awaiting_coro;
|
||||
// MultiReadAsyncImpl always returns true, so caller will be suspended
|
||||
return reader_.MultiReadAsyncImpl(this);
|
||||
}
|
||||
|
||||
void await_resume() noexcept {}
|
||||
|
||||
private:
|
||||
friend AsyncFileReader;
|
||||
|
||||
// The parameters passed to MultiReadAsync are cached here when the caller
|
||||
// calls MultiReadAsync. Later, when the execution of this awaitable is
|
||||
// started, these are used to do the actual IO
|
||||
AsyncFileReader& reader_;
|
||||
RandomAccessFileReader* file_;
|
||||
const IOOptions& opts_;
|
||||
FSReadRequest* read_reqs_;
|
||||
size_t num_reqs_;
|
||||
autovector<void*, 32> io_handle_;
|
||||
autovector<IOHandleDeleter, 32> del_fn_;
|
||||
std::experimental::coroutine_handle<> awaiting_coro_;
|
||||
ReadAwaiter* next_;
|
||||
};
|
||||
|
||||
// An instance of ReadOperation is returned to the caller of MultiGetAsync.
|
||||
// This represents an awaitable that can be started later.
|
||||
template <typename Awaiter>
|
||||
class ReadOperation {
|
||||
public:
|
||||
explicit ReadOperation(AsyncFileReader& reader,
|
||||
RandomAccessFileReader* file, const IOOptions& opts,
|
||||
FSReadRequest* read_reqs, size_t num_reqs,
|
||||
AlignedBuf* aligned_buf) noexcept
|
||||
: reader_(reader),
|
||||
file_(file),
|
||||
opts_(opts),
|
||||
read_reqs_(read_reqs),
|
||||
num_reqs_(num_reqs),
|
||||
aligned_buf_(aligned_buf) {}
|
||||
|
||||
auto viaIfAsync(folly::Executor::KeepAlive<> executor) const {
|
||||
return folly::coro::co_viaIfAsync(
|
||||
std::move(executor),
|
||||
Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_});
|
||||
}
|
||||
|
||||
private:
|
||||
AsyncFileReader& reader_;
|
||||
RandomAccessFileReader* file_;
|
||||
const IOOptions& opts_;
|
||||
FSReadRequest* read_reqs_;
|
||||
size_t num_reqs_;
|
||||
AlignedBuf* aligned_buf_;
|
||||
};
|
||||
|
||||
// This function does the actual work when this awaitable starts execution
|
||||
bool MultiReadAsyncImpl(ReadAwaiter* awaiter);
|
||||
|
||||
// Called by the SingleThreadExecutor to poll for async IO completion.
|
||||
// This also resumes the awaiting coroutines.
|
||||
void Poll();
|
||||
|
||||
// Head of the queue of awaiters waiting for async IO completion
|
||||
ReadAwaiter* head_ = nullptr;
|
||||
// Tail of the awaiter queue
|
||||
ReadAwaiter* tail_ = nullptr;
|
||||
// Total number of pending async IOs
|
||||
size_t num_reqs_ = 0;
|
||||
FileSystem* fs_;
|
||||
Statistics* stats_;
|
||||
};
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif // USE_COROUTINES
|
55
util/single_thread_executor.h
Normal file
55
util/single_thread_executor.h
Normal file
@ -0,0 +1,55 @@
|
||||
// Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
|
||||
// This source code is licensed under both the GPLv2 (found in the
|
||||
// COPYING file in the root directory) and Apache 2.0 License
|
||||
// (found in the LICENSE.Apache file in the root directory).
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#if USE_COROUTINES
|
||||
#include <atomic>
|
||||
|
||||
#include "folly/CPortability.h"
|
||||
#include "folly/CppAttributes.h"
|
||||
#include "folly/Executor.h"
|
||||
#include "util/async_file_reader.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
// Implements a simple executor that runs callback functions in the same
|
||||
// thread, unlike CPUThreadExecutor which may schedule the callback on
|
||||
// another thread. Runs in a tight loop calling the queued callbacks,
|
||||
// and polls for async IO completions when idle. The completions will
|
||||
// resume suspended coroutines and they get added to the queue, which
|
||||
// will get picked up by this loop.
|
||||
// Any possibility of deadlock is precluded because the file system
|
||||
// guarantees that async IO completion callbacks will not be scheduled
|
||||
// to run in this thread or this executor.
|
||||
class SingleThreadExecutor : public folly::Executor {
|
||||
public:
|
||||
explicit SingleThreadExecutor(AsyncFileReader& reader)
|
||||
: reader_(reader), busy_(false) {}
|
||||
|
||||
void add(folly::Func callback) override {
|
||||
auto& q = q_;
|
||||
q.push(std::move(callback));
|
||||
if (q.size() == 1 && !busy_) {
|
||||
while (!q.empty()) {
|
||||
q.front()();
|
||||
q.pop();
|
||||
|
||||
if (q.empty()) {
|
||||
// Prevent recursion, as the Poll may queue resumed coroutines
|
||||
busy_ = true;
|
||||
reader_.Poll();
|
||||
busy_ = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::queue<folly::Func> q_;
|
||||
AsyncFileReader& reader_;
|
||||
bool busy_;
|
||||
};
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif // USE_COROUTINES
|
Loading…
Reference in New Issue
Block a user