2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "util/file_reader_writer.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2015-08-27 00:25:59 +02:00
|
|
|
#include <mutex>
|
|
|
|
|
2017-04-06 04:02:00 +02:00
|
|
|
#include "monitoring/histogram.h"
|
|
|
|
#include "monitoring/iostats_context_imp.h"
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
#include "port/port.h"
|
2019-05-31 02:39:43 +02:00
|
|
|
#include "test_util/sync_point.h"
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/rate_limiter.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
2015-09-12 02:36:48 +02:00
|
|
|
|
2017-05-05 20:58:10 +02:00
|
|
|
#ifndef NDEBUG
|
|
|
|
namespace {
|
2017-05-18 03:46:03 +02:00
|
|
|
bool IsFileSectorAligned(const size_t off, size_t sector_size) {
|
2017-05-05 20:58:10 +02:00
|
|
|
return off % sector_size == 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
|
2017-01-12 01:42:07 +01:00
|
|
|
Status s;
|
2017-01-13 21:01:08 +01:00
|
|
|
if (use_direct_io()) {
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2017-01-12 01:42:07 +01:00
|
|
|
size_t offset = offset_.fetch_add(n);
|
|
|
|
size_t alignment = file_->GetRequiredBufferAlignment();
|
|
|
|
size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
|
|
|
|
size_t offset_advance = offset - aligned_offset;
|
|
|
|
size_t size = Roundup(offset + n, alignment) - aligned_offset;
|
|
|
|
size_t r = 0;
|
|
|
|
AlignedBuffer buf;
|
|
|
|
buf.Alignment(alignment);
|
|
|
|
buf.AllocateNewBuffer(size);
|
|
|
|
Slice tmp;
|
|
|
|
s = file_->PositionedRead(aligned_offset, size, &tmp, buf.BufferStart());
|
|
|
|
if (s.ok() && offset_advance < tmp.size()) {
|
|
|
|
buf.Size(tmp.size());
|
|
|
|
r = buf.Read(scratch, offset_advance,
|
|
|
|
std::min(tmp.size() - offset_advance, n));
|
|
|
|
}
|
|
|
|
*result = Slice(scratch, r);
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2017-01-12 01:42:07 +01:00
|
|
|
} else {
|
|
|
|
s = file_->Read(n, result, scratch);
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
IOSTATS_ADD(bytes_read, result->size());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-01-12 01:42:07 +01:00
|
|
|
|
|
|
|
Status SequentialFileReader::Skip(uint64_t n) {
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2017-01-13 21:01:08 +01:00
|
|
|
if (use_direct_io()) {
|
2018-09-06 03:07:53 +02:00
|
|
|
offset_ += static_cast<size_t>(n);
|
2017-01-12 01:42:07 +01:00
|
|
|
return Status::OK();
|
|
|
|
}
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
return file_->Skip(n);
|
2017-01-12 01:42:07 +01:00
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
|
|
|
Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
|
2019-06-19 23:07:36 +02:00
|
|
|
char* scratch, bool for_compaction) const {
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 23:35:54 +02:00
|
|
|
Status s;
|
|
|
|
uint64_t elapsed = 0;
|
|
|
|
{
|
|
|
|
StopWatch sw(env_, stats_, hist_type_,
|
2018-07-14 03:29:50 +02:00
|
|
|
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
|
|
|
|
true /*delay_enabled*/);
|
2019-01-30 01:23:21 +01:00
|
|
|
auto prev_perf_level = GetPerfLevel();
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 23:35:54 +02:00
|
|
|
IOSTATS_TIMER_GUARD(read_nanos);
|
2017-01-13 21:01:08 +01:00
|
|
|
if (use_direct_io()) {
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2017-01-12 01:42:07 +01:00
|
|
|
size_t alignment = file_->GetRequiredBufferAlignment();
|
2018-09-06 03:07:53 +02:00
|
|
|
size_t aligned_offset = TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
|
|
|
|
size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
|
|
|
|
size_t read_size = Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
|
2017-01-12 01:42:07 +01:00
|
|
|
AlignedBuffer buf;
|
|
|
|
buf.Alignment(alignment);
|
2017-06-13 23:51:22 +02:00
|
|
|
buf.AllocateNewBuffer(read_size);
|
|
|
|
while (buf.CurrentSize() < read_size) {
|
|
|
|
size_t allowed;
|
2019-06-19 23:07:36 +02:00
|
|
|
if (for_compaction && rate_limiter_ != nullptr) {
|
2017-06-13 23:51:22 +02:00
|
|
|
allowed = rate_limiter_->RequestToken(
|
|
|
|
buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
|
|
|
|
Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
|
|
|
|
} else {
|
|
|
|
assert(buf.CurrentSize() == 0);
|
|
|
|
allowed = read_size;
|
|
|
|
}
|
|
|
|
Slice tmp;
|
2018-10-13 03:34:03 +02:00
|
|
|
|
2019-01-16 18:48:01 +01:00
|
|
|
FileOperationInfo::TimePoint start_ts;
|
2018-10-13 03:34:03 +02:00
|
|
|
uint64_t orig_offset = 0;
|
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
start_ts = std::chrono::system_clock::now();
|
2018-10-13 03:34:03 +02:00
|
|
|
orig_offset = aligned_offset + buf.CurrentSize();
|
|
|
|
}
|
2019-01-30 01:23:21 +01:00
|
|
|
{
|
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
|
|
|
|
s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp,
|
|
|
|
buf.Destination());
|
|
|
|
}
|
2018-10-13 03:34:03 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
auto finish_ts = std::chrono::system_clock::now();
|
|
|
|
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
|
|
|
|
s);
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
|
|
|
|
2017-06-13 23:51:22 +02:00
|
|
|
buf.Size(buf.CurrentSize() + tmp.size());
|
|
|
|
if (!s.ok() || tmp.size() < allowed) {
|
|
|
|
break;
|
|
|
|
}
|
2017-01-12 01:42:07 +01:00
|
|
|
}
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t res_len = 0;
|
|
|
|
if (s.ok() && offset_advance < buf.CurrentSize()) {
|
|
|
|
res_len = buf.Read(scratch, offset_advance,
|
|
|
|
std::min(buf.CurrentSize() - offset_advance, n));
|
|
|
|
}
|
|
|
|
*result = Slice(scratch, res_len);
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2017-01-12 01:42:07 +01:00
|
|
|
} else {
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t pos = 0;
|
|
|
|
const char* res_scratch = nullptr;
|
|
|
|
while (pos < n) {
|
|
|
|
size_t allowed;
|
2019-06-19 23:07:36 +02:00
|
|
|
if (for_compaction && rate_limiter_ != nullptr) {
|
2018-07-14 03:29:50 +02:00
|
|
|
if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
|
|
|
|
sw.DelayStart();
|
|
|
|
}
|
2017-06-13 23:51:22 +02:00
|
|
|
allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
|
|
|
|
Env::IOPriority::IO_LOW, stats_,
|
|
|
|
RateLimiter::OpType::kRead);
|
2018-07-14 03:29:50 +02:00
|
|
|
if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
|
|
|
|
sw.DelayStop();
|
|
|
|
}
|
2017-06-13 23:51:22 +02:00
|
|
|
} else {
|
|
|
|
allowed = n;
|
|
|
|
}
|
|
|
|
Slice tmp_result;
|
2018-10-13 03:34:03 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2019-01-16 18:48:01 +01:00
|
|
|
FileOperationInfo::TimePoint start_ts;
|
2018-10-13 03:34:03 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
start_ts = std::chrono::system_clock::now();
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
|
|
|
#endif
|
2019-01-30 01:23:21 +01:00
|
|
|
{
|
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
|
|
|
|
s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos);
|
|
|
|
}
|
2018-10-13 03:34:03 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
auto finish_ts = std::chrono::system_clock::now();
|
|
|
|
NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
|
|
|
|
finish_ts, s);
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2017-06-13 23:51:22 +02:00
|
|
|
if (res_scratch == nullptr) {
|
|
|
|
// we can't simply use `scratch` because reads of mmap'd files return
|
|
|
|
// data in a different buffer.
|
|
|
|
res_scratch = tmp_result.data();
|
|
|
|
} else {
|
|
|
|
// make sure chunks are inserted contiguously into `res_scratch`.
|
|
|
|
assert(tmp_result.data() == res_scratch + pos);
|
|
|
|
}
|
|
|
|
pos += tmp_result.size();
|
|
|
|
if (!s.ok() || tmp_result.size() < allowed) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*result = Slice(res_scratch, s.ok() ? pos : 0);
|
2017-01-12 01:42:07 +01:00
|
|
|
}
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 23:35:54 +02:00
|
|
|
IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
|
2019-01-30 01:23:21 +01:00
|
|
|
SetPerfLevel(prev_perf_level);
|
2019-07-01 05:52:34 +02:00
|
|
|
}
|
|
|
|
if (stats_ != nullptr && file_read_hist_ != nullptr) {
|
|
|
|
file_read_hist_->Add(elapsed);
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs,
|
|
|
|
size_t num_reqs) const {
|
|
|
|
Status s;
|
|
|
|
uint64_t elapsed = 0;
|
|
|
|
assert(!use_direct_io());
|
|
|
|
assert(!for_compaction_);
|
|
|
|
{
|
|
|
|
StopWatch sw(env_, stats_, hist_type_,
|
|
|
|
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
|
|
|
|
true /*delay_enabled*/);
|
|
|
|
auto prev_perf_level = GetPerfLevel();
|
|
|
|
IOSTATS_TIMER_GUARD(read_nanos);
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
FileOperationInfo::TimePoint start_ts;
|
|
|
|
if (ShouldNotifyListeners()) {
|
|
|
|
start_ts = std::chrono::system_clock::now();
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
{
|
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
|
|
|
|
s = file_->MultiRead(read_reqs, num_reqs);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < num_reqs; ++i) {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
|
|
|
auto finish_ts = std::chrono::system_clock::now();
|
|
|
|
NotifyOnFileReadFinish(read_reqs[i].offset,
|
|
|
|
read_reqs[i].result.size(), start_ts, finish_ts,
|
|
|
|
read_reqs[i].status);
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size());
|
|
|
|
}
|
|
|
|
SetPerfLevel(prev_perf_level);
|
Measure file read latency histogram per level
Summary: In internal stats, remember read latency histogram, if statistics is enabled. It can be retrieved from DB::GetProperty() with "rocksdb.dbstats" property, if it is enabled.
Test Plan: Manually run db_bench and prints out "rocksdb.dbstats" by hand and make sure it prints out as expected
Reviewers: igor, IslamAbdelRahman, rven, kradhakrishnan, anthony, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44193
2015-08-13 23:35:54 +02:00
|
|
|
}
|
|
|
|
if (stats_ != nullptr && file_read_hist_ != nullptr) {
|
|
|
|
file_read_hist_->Add(elapsed);
|
|
|
|
}
|
2019-01-30 01:23:21 +01:00
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WritableFileWriter::Append(const Slice& data) {
|
|
|
|
const char* src = data.data();
|
|
|
|
size_t left = data.size();
|
|
|
|
Status s;
|
|
|
|
pending_sync_ = true;
|
|
|
|
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Append:0",
|
|
|
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(prepare_write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
|
|
|
|
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2017-06-13 13:34:51 +02:00
|
|
|
// See whether we need to enlarge the buffer to avoid the flush
|
|
|
|
if (buf_.Capacity() - buf_.CurrentSize() < left) {
|
|
|
|
for (size_t cap = buf_.Capacity();
|
|
|
|
cap < max_buffer_size_; // There is still room to increase
|
|
|
|
cap *= 2) {
|
|
|
|
// See whether the next available size is large enough.
|
|
|
|
// Buffer will never be increased to more than max_buffer_size_.
|
|
|
|
size_t desired_capacity = std::min(cap * 2, max_buffer_size_);
|
|
|
|
if (desired_capacity - buf_.CurrentSize() >= left ||
|
|
|
|
(use_direct_io() && desired_capacity == max_buffer_size_)) {
|
|
|
|
buf_.AllocateNewBuffer(desired_capacity, true);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-22 21:51:29 +01:00
|
|
|
// Flush only when buffered I/O
|
2017-03-15 06:23:21 +01:00
|
|
|
if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) {
|
2015-09-12 02:36:48 +02:00
|
|
|
if (buf_.CurrentSize() > 0) {
|
2015-09-11 18:57:02 +02:00
|
|
|
s = Flush();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
2015-09-12 02:36:48 +02:00
|
|
|
assert(buf_.CurrentSize() == 0);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2016-12-22 21:51:29 +01:00
|
|
|
// We never write directly to disk with direct I/O on.
|
2015-09-11 18:57:02 +02:00
|
|
|
// or we simply use it for its original purpose to accumulate many small
|
|
|
|
// chunks
|
2017-03-15 06:23:21 +01:00
|
|
|
if (use_direct_io() || (buf_.Capacity() >= left)) {
|
2015-09-11 18:57:02 +02:00
|
|
|
while (left > 0) {
|
|
|
|
size_t appended = buf_.Append(src, left);
|
|
|
|
left -= appended;
|
|
|
|
src += appended;
|
|
|
|
|
|
|
|
if (left > 0) {
|
|
|
|
s = Flush();
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
if (!s.ok()) {
|
2015-09-11 18:57:02 +02:00
|
|
|
break;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
} else {
|
|
|
|
// Writing directly to file bypassing the buffer
|
2015-09-12 02:36:48 +02:00
|
|
|
assert(buf_.CurrentSize() == 0);
|
2015-09-11 18:57:02 +02:00
|
|
|
s = WriteBuffered(src, left);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
|
2015-10-28 05:04:00 +01:00
|
|
|
if (s.ok()) {
|
|
|
|
filesize_ += data.size();
|
|
|
|
}
|
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2018-03-27 05:14:24 +02:00
|
|
|
Status WritableFileWriter::Pad(const size_t pad_bytes) {
|
|
|
|
assert(pad_bytes < kDefaultPageSize);
|
|
|
|
size_t left = pad_bytes;
|
|
|
|
size_t cap = buf_.Capacity() - buf_.CurrentSize();
|
|
|
|
|
|
|
|
// Assume pad_bytes is small compared to buf_ capacity. So we always
|
|
|
|
// use buf_ rather than write directly to file in certain cases like
|
|
|
|
// Append() does.
|
|
|
|
while (left) {
|
|
|
|
size_t append_bytes = std::min(cap, left);
|
|
|
|
buf_.PadWith(append_bytes, 0);
|
|
|
|
left -= append_bytes;
|
|
|
|
if (left > 0) {
|
|
|
|
Status s = Flush();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cap = buf_.Capacity() - buf_.CurrentSize();
|
|
|
|
}
|
|
|
|
pending_sync_ = true;
|
|
|
|
filesize_ += pad_bytes;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status WritableFileWriter::Close() {
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Do not quit immediately on failure the file MUST be closed
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status s;
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Possible to close it twice now as we MUST close
|
|
|
|
// in __dtor, simply flushing is not enough
|
|
|
|
// Windows when pre-allocating does not fill with zeros
|
|
|
|
// also with unbuffered access we also set the end of data.
|
|
|
|
if (!writable_file_) {
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
s = Flush(); // flush cache to OS
|
|
|
|
|
2017-03-17 19:25:01 +01:00
|
|
|
Status interim;
|
2016-12-22 21:51:29 +01:00
|
|
|
// In direct I/O mode we write whole pages so
|
2015-09-11 18:57:02 +02:00
|
|
|
// we need to let the file know where data ends.
|
2017-03-17 19:25:01 +01:00
|
|
|
if (use_direct_io()) {
|
|
|
|
interim = writable_file_->Truncate(filesize_);
|
2018-02-14 01:20:13 +01:00
|
|
|
if (interim.ok()) {
|
|
|
|
interim = writable_file_->Fsync();
|
|
|
|
}
|
2017-03-17 19:25:01 +01:00
|
|
|
if (!interim.ok() && s.ok()) {
|
|
|
|
s = interim;
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
}
|
|
|
|
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);
|
2015-09-11 18:57:02 +02:00
|
|
|
interim = writable_file_->Close();
|
|
|
|
if (!interim.ok() && s.ok()) {
|
|
|
|
s = interim;
|
|
|
|
}
|
|
|
|
|
|
|
|
writable_file_.reset();
|
2015-10-16 23:33:47 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2016-12-22 21:51:29 +01:00
|
|
|
// write out the cached data to the OS cache or storage if direct I/O
|
|
|
|
// enabled
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status WritableFileWriter::Flush() {
|
2015-09-11 18:57:02 +02:00
|
|
|
Status s;
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
|
|
|
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2015-09-12 02:36:48 +02:00
|
|
|
if (buf_.CurrentSize() > 0) {
|
2017-03-15 06:23:21 +01:00
|
|
|
if (use_direct_io()) {
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2018-12-11 22:51:08 +01:00
|
|
|
if (pending_sync_) {
|
|
|
|
s = WriteDirect();
|
|
|
|
}
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2015-09-11 18:57:02 +02:00
|
|
|
} else {
|
2016-12-22 21:51:29 +01:00
|
|
|
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
|
2015-09-11 18:57:02 +02:00
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
s = writable_file_->Flush();
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
|
|
|
// sync OS cache to disk for every bytes_per_sync_
|
|
|
|
// TODO: give log file and sst file different options (log
|
|
|
|
// files could be potentially cached in OS for their whole
|
|
|
|
// life time, thus we might not want to flush at all).
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
|
|
|
|
// We try to avoid sync to the last 1MB of data. For two reasons:
|
|
|
|
// (1) avoid rewrite the same page that is modified later.
|
|
|
|
// (2) for older version of OS, write can block while writing out
|
|
|
|
// the page.
|
|
|
|
// Xfs does neighbor page flushing outside of the specified ranges. We
|
|
|
|
// need to make sure sync range is far from the write offset.
|
2017-03-15 06:23:21 +01:00
|
|
|
if (!use_direct_io() && bytes_per_sync_) {
|
2015-09-11 18:57:02 +02:00
|
|
|
const uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced.
|
|
|
|
const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
if (filesize_ > kBytesNotSyncRange) {
|
|
|
|
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
|
|
|
|
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
|
|
|
|
assert(offset_sync_to >= last_sync_size_);
|
|
|
|
if (offset_sync_to > 0 &&
|
|
|
|
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
|
2015-09-11 18:57:02 +02:00
|
|
|
s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
last_sync_size_ = offset_sync_to;
|
|
|
|
}
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WritableFileWriter::Sync(bool use_fsync) {
|
|
|
|
Status s = Flush();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);
|
2017-03-15 06:23:21 +01:00
|
|
|
if (!use_direct_io() && pending_sync_) {
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
s = SyncInternal(use_fsync);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
pending_sync_ = false;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
|
|
|
|
if (!writable_file_->IsSyncThreadSafe()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Can't WritableFileWriter::SyncWithoutFlush() because "
|
|
|
|
"WritableFile::IsSyncThreadSafe() is false");
|
|
|
|
}
|
2015-08-05 20:56:19 +02:00
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
|
|
|
|
Status s = SyncInternal(use_fsync);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
|
|
|
|
return s;
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WritableFileWriter::SyncInternal(bool use_fsync) {
|
|
|
|
Status s;
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
IOSTATS_TIMER_GUARD(fsync_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
|
2019-01-30 01:23:21 +01:00
|
|
|
auto prev_perf_level = GetPerfLevel();
|
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
if (use_fsync) {
|
|
|
|
s = writable_file_->Fsync();
|
|
|
|
} else {
|
|
|
|
s = writable_file_->Sync();
|
|
|
|
}
|
2019-01-30 01:23:21 +01:00
|
|
|
SetPerfLevel(prev_perf_level);
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-11-11 02:03:42 +01:00
|
|
|
Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
IOSTATS_TIMER_GUARD(range_sync_nanos);
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
return writable_file_->RangeSync(offset, nbytes);
|
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
// This method writes to disk the specified data and makes use of the rate
|
|
|
|
// limiter if available
|
|
|
|
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
|
|
|
Status s;
|
2017-03-15 06:23:21 +01:00
|
|
|
assert(!use_direct_io());
|
2015-09-11 18:57:02 +02:00
|
|
|
const char* src = data;
|
|
|
|
size_t left = size;
|
|
|
|
|
|
|
|
while (left > 0) {
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t allowed;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
allowed = rate_limiter_->RequestToken(
|
|
|
|
left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
|
|
|
|
RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
allowed = left;
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
2018-10-13 03:34:03 +02:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
2019-01-16 18:48:01 +01:00
|
|
|
FileOperationInfo::TimePoint start_ts;
|
2018-10-13 03:34:03 +02:00
|
|
|
uint64_t old_size = writable_file_->GetFileSize();
|
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
start_ts = std::chrono::system_clock::now();
|
2018-10-13 03:34:03 +02:00
|
|
|
old_size = next_write_offset_;
|
|
|
|
}
|
|
|
|
#endif
|
2019-01-30 01:23:21 +01:00
|
|
|
{
|
|
|
|
auto prev_perf_level = GetPerfLevel();
|
|
|
|
IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_);
|
|
|
|
s = writable_file_->Append(Slice(src, allowed));
|
|
|
|
SetPerfLevel(prev_perf_level);
|
|
|
|
}
|
2018-10-13 03:34:03 +02:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
auto finish_ts = std::chrono::system_clock::now();
|
|
|
|
NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s);
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
|
|
|
#endif
|
2015-09-11 18:57:02 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, allowed);
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
left -= allowed;
|
|
|
|
src += allowed;
|
|
|
|
}
|
2015-09-12 02:36:48 +02:00
|
|
|
buf_.Size(0);
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// This flushes the accumulated data in the buffer. We pad data with zeros if
|
|
|
|
// necessary to the whole page.
|
|
|
|
// However, during automatic flushes padding would not be necessary.
|
|
|
|
// We always use RateLimiter if available. We move (Refit) any buffer bytes
|
|
|
|
// that are left over the
|
|
|
|
// whole number of pages to be written again on the next flush because we can
|
|
|
|
// only write on aligned
|
|
|
|
// offsets.
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2016-12-22 21:51:29 +01:00
|
|
|
Status WritableFileWriter::WriteDirect() {
|
2017-03-15 06:23:21 +01:00
|
|
|
assert(use_direct_io());
|
2015-09-11 18:57:02 +02:00
|
|
|
Status s;
|
2015-09-12 02:36:48 +02:00
|
|
|
const size_t alignment = buf_.Alignment();
|
2015-09-11 18:57:02 +02:00
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
|
|
|
|
// Calculate whole page final file advance if all writes succeed
|
|
|
|
size_t file_advance =
|
2015-09-12 02:36:48 +02:00
|
|
|
TruncateToPageBoundary(alignment, buf_.CurrentSize());
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Calculate the leftover tail, we write it here padded with zeros BUT we
|
|
|
|
// will write
|
|
|
|
// it again in the future either on Close() OR when the current whole page
|
|
|
|
// fills out
|
2015-09-12 02:36:48 +02:00
|
|
|
size_t leftover_tail = buf_.CurrentSize() - file_advance;
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Round up and pad
|
|
|
|
buf_.PadToAlignmentWith(0);
|
|
|
|
|
2015-09-12 02:36:48 +02:00
|
|
|
const char* src = buf_.BufferStart();
|
2015-09-11 18:57:02 +02:00
|
|
|
uint64_t write_offset = next_write_offset_;
|
2015-09-12 02:36:48 +02:00
|
|
|
size_t left = buf_.CurrentSize();
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
while (left > 0) {
|
|
|
|
// Check how much is allowed
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t size;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
size = rate_limiter_->RequestToken(left, buf_.Alignment(),
|
|
|
|
writable_file_->GetIOPriority(),
|
|
|
|
stats_, RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
size = left;
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
2019-01-16 18:48:01 +01:00
|
|
|
FileOperationInfo::TimePoint start_ts;
|
2018-10-13 03:34:03 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
start_ts = std::chrono::system_clock::now();
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
2016-12-22 21:51:29 +01:00
|
|
|
// direct writes must be positional
|
2015-09-12 02:36:48 +02:00
|
|
|
s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
|
2018-10-13 03:34:03 +02:00
|
|
|
if (ShouldNotifyListeners()) {
|
2019-01-16 18:48:01 +01:00
|
|
|
auto finish_ts = std::chrono::system_clock::now();
|
|
|
|
NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s);
|
2018-10-13 03:34:03 +02:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
if (!s.ok()) {
|
2015-09-12 02:36:48 +02:00
|
|
|
buf_.Size(file_advance + leftover_tail);
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, size);
|
|
|
|
left -= size;
|
|
|
|
src += size;
|
|
|
|
write_offset += size;
|
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// Move the tail to the beginning of the buffer
|
|
|
|
// This never happens during normal Append but rather during
|
|
|
|
// explicit call to Flush()/Sync() or Close()
|
|
|
|
buf_.RefitTail(file_advance, leftover_tail);
|
|
|
|
// This is where we start writing next time which may or not be
|
|
|
|
// the actual file size on disk. They match if the buffer size
|
|
|
|
// is a multiple of whole pages otherwise filesize_ is leftover_tail
|
|
|
|
// behind
|
|
|
|
next_write_offset_ += file_advance;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
namespace {
|
|
|
|
class ReadaheadRandomAccessFile : public RandomAccessFile {
|
|
|
|
public:
|
2015-10-29 23:52:32 +01:00
|
|
|
ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file,
|
|
|
|
size_t readahead_size)
|
|
|
|
: file_(std::move(file)),
|
2017-02-18 20:54:49 +01:00
|
|
|
alignment_(file_->GetRequiredBufferAlignment()),
|
|
|
|
readahead_size_(Roundup(readahead_size, alignment_)),
|
2015-10-29 23:52:32 +01:00
|
|
|
buffer_(),
|
2018-07-12 00:42:49 +02:00
|
|
|
buffer_offset_(0) {
|
2017-04-27 21:19:55 +02:00
|
|
|
buffer_.Alignment(alignment_);
|
|
|
|
buffer_.AllocateNewBuffer(readahead_size_);
|
2015-10-29 23:52:32 +01:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
|
|
|
|
|
|
|
|
ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete;
|
2015-08-27 00:25:59 +02:00
|
|
|
|
2019-02-14 22:52:47 +01:00
|
|
|
Status Read(uint64_t offset, size_t n, Slice* result,
|
|
|
|
char* scratch) const override {
|
|
|
|
if (n + alignment_ >= readahead_size_) {
|
|
|
|
return file_->Read(offset, n, result, scratch);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_lock<std::mutex> lk(lock_);
|
|
|
|
|
|
|
|
size_t cached_len = 0;
|
|
|
|
// Check if there is a cache hit, means that [offset, offset + n) is either
|
|
|
|
// completely or partially in the buffer
|
|
|
|
// If it's completely cached, including end of file case when offset + n is
|
|
|
|
// greater than EOF, return
|
|
|
|
if (TryReadFromCache(offset, n, &cached_len, scratch) &&
|
|
|
|
(cached_len == n ||
|
|
|
|
// End of file
|
|
|
|
buffer_.CurrentSize() < readahead_size_)) {
|
|
|
|
*result = Slice(scratch, cached_len);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
size_t advanced_offset = static_cast<size_t>(offset + cached_len);
|
|
|
|
// In the case of cache hit advanced_offset is already aligned, means that
|
|
|
|
// chunk_offset equals to advanced_offset
|
|
|
|
size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
|
|
|
|
Slice readahead_result;
|
|
|
|
|
|
|
|
Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
|
|
|
|
if (s.ok()) {
|
|
|
|
// In the case of cache miss, i.e. when cached_len equals 0, an offset can
|
|
|
|
// exceed the file end position, so the following check is required
|
|
|
|
if (advanced_offset < chunk_offset + buffer_.CurrentSize()) {
|
|
|
|
// In the case of cache miss, the first chunk_padding bytes in buffer_
|
|
|
|
// are
|
|
|
|
// stored for alignment only and must be skipped
|
|
|
|
size_t chunk_padding = advanced_offset - chunk_offset;
|
|
|
|
auto remaining_len =
|
|
|
|
std::min(buffer_.CurrentSize() - chunk_padding, n - cached_len);
|
|
|
|
memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding,
|
|
|
|
remaining_len);
|
|
|
|
*result = Slice(scratch, cached_len + remaining_len);
|
|
|
|
} else {
|
|
|
|
*result = Slice(scratch, cached_len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status Prefetch(uint64_t offset, size_t n) override {
|
|
|
|
if (n < readahead_size_) {
|
|
|
|
// Don't allow smaller prefetches than the configured `readahead_size_`.
|
|
|
|
// `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
size_t offset_ = static_cast<size_t>(offset);
|
|
|
|
size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_);
|
|
|
|
if (prefetch_offset == buffer_offset_) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
return ReadIntoBuffer(prefetch_offset,
|
|
|
|
Roundup(offset_ + n, alignment_) - prefetch_offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override {
|
|
|
|
return file_->GetUniqueId(id, max_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
|
|
|
|
|
|
|
|
Status InvalidateCache(size_t offset, size_t length) override {
|
|
|
|
return file_->InvalidateCache(offset, length);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool use_direct_io() const override { return file_->use_direct_io(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
|
|
|
|
char* scratch) const {
|
|
|
|
if (offset < buffer_offset_ ||
|
|
|
|
offset >= buffer_offset_ + buffer_.CurrentSize()) {
|
|
|
|
*cached_len = 0;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
uint64_t offset_in_buffer = offset - buffer_offset_;
|
|
|
|
*cached_len = std::min(
|
|
|
|
buffer_.CurrentSize() - static_cast<size_t>(offset_in_buffer), n);
|
|
|
|
memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
|
|
|
|
return true;
|
2017-02-18 20:54:49 +01:00
|
|
|
}
|
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
Status ReadIntoBuffer(uint64_t offset, size_t n) const {
|
|
|
|
if (n > buffer_.Capacity()) {
|
|
|
|
n = buffer_.Capacity();
|
|
|
|
}
|
2017-05-18 03:46:03 +02:00
|
|
|
assert(IsFileSectorAligned(offset, alignment_));
|
|
|
|
assert(IsFileSectorAligned(n, alignment_));
|
2017-04-15 03:43:32 +02:00
|
|
|
Slice result;
|
|
|
|
Status s = file_->Read(offset, n, &result, buffer_.BufferStart());
|
|
|
|
if (s.ok()) {
|
|
|
|
buffer_offset_ = offset;
|
2018-07-12 00:42:49 +02:00
|
|
|
buffer_.Size(result.size());
|
2018-05-08 21:03:29 +02:00
|
|
|
assert(buffer_.BufferStart() == result.data());
|
2017-04-15 03:43:32 +02:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> file_;
|
2017-02-18 20:54:49 +01:00
|
|
|
const size_t alignment_;
|
2015-09-11 18:57:02 +02:00
|
|
|
size_t readahead_size_;
|
2015-08-27 00:25:59 +02:00
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
mutable std::mutex lock_;
|
2017-02-18 20:54:49 +01:00
|
|
|
mutable AlignedBuffer buffer_;
|
2017-04-15 03:43:32 +02:00
|
|
|
mutable uint64_t buffer_offset_;
|
2015-08-27 00:25:59 +02:00
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
2017-08-11 20:59:13 +02:00
|
|
|
Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
|
2019-06-19 23:07:36 +02:00
|
|
|
uint64_t offset, size_t n,
|
|
|
|
bool for_compaction) {
|
2017-08-11 20:59:13 +02:00
|
|
|
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
2018-03-06 21:27:07 +01:00
|
|
|
size_t offset_ = static_cast<size_t>(offset);
|
|
|
|
uint64_t rounddown_offset = Rounddown(offset_, alignment);
|
|
|
|
uint64_t roundup_end = Roundup(offset_ + n, alignment);
|
2018-01-26 21:50:48 +01:00
|
|
|
uint64_t roundup_len = roundup_end - rounddown_offset;
|
|
|
|
assert(roundup_len >= alignment);
|
|
|
|
assert(roundup_len % alignment == 0);
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
|
|
|
|
// Check if requested bytes are in the existing buffer_.
|
|
|
|
// If all bytes exist -- return.
|
|
|
|
// If only a few bytes exist -- reuse them & read only what is really needed.
|
|
|
|
// This is typically the case of incremental reading of data.
|
|
|
|
// If no bytes exist in buffer -- full pread.
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
uint64_t chunk_offset_in_buffer = 0;
|
|
|
|
uint64_t chunk_len = 0;
|
|
|
|
bool copy_data_to_new_buffer = false;
|
2018-07-12 00:42:49 +02:00
|
|
|
if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ &&
|
|
|
|
offset <= buffer_offset_ + buffer_.CurrentSize()) {
|
|
|
|
if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) {
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
// All requested bytes are already in the buffer. So no need to Read
|
|
|
|
// again.
|
|
|
|
return s;
|
|
|
|
} else {
|
|
|
|
// Only a few requested bytes are in the buffer. memmove those chunk of
|
|
|
|
// bytes to the beginning, and memcpy them back into the new buffer if a
|
|
|
|
// new buffer is created.
|
2018-09-06 03:07:53 +02:00
|
|
|
chunk_offset_in_buffer = Rounddown(static_cast<size_t>(offset - buffer_offset_), alignment);
|
2018-07-12 00:42:49 +02:00
|
|
|
chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer;
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
assert(chunk_offset_in_buffer % alignment == 0);
|
|
|
|
assert(chunk_len % alignment == 0);
|
2018-07-11 21:24:07 +02:00
|
|
|
assert(chunk_offset_in_buffer + chunk_len <=
|
2018-07-12 00:42:49 +02:00
|
|
|
buffer_offset_ + buffer_.CurrentSize());
|
2018-07-11 21:24:07 +02:00
|
|
|
if (chunk_len > 0) {
|
|
|
|
copy_data_to_new_buffer = true;
|
|
|
|
} else {
|
|
|
|
// this reset is not necessary, but just to be safe.
|
|
|
|
chunk_offset_in_buffer = 0;
|
|
|
|
}
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a new buffer only if current capacity is not sufficient, and memcopy
|
|
|
|
// bytes from old buffer if needed (i.e., if chunk_len is greater than 0).
|
|
|
|
if (buffer_.Capacity() < roundup_len) {
|
|
|
|
buffer_.Alignment(alignment);
|
|
|
|
buffer_.AllocateNewBuffer(static_cast<size_t>(roundup_len),
|
|
|
|
copy_data_to_new_buffer, chunk_offset_in_buffer,
|
2018-09-06 03:07:53 +02:00
|
|
|
static_cast<size_t>(chunk_len));
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
} else if (chunk_len > 0) {
|
|
|
|
// New buffer not needed. But memmove bytes from tail to the beginning since
|
|
|
|
// chunk_len is greater than 0.
|
2018-09-06 03:07:53 +02:00
|
|
|
buffer_.RefitTail(static_cast<size_t>(chunk_offset_in_buffer), static_cast<size_t>(chunk_len));
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
}
|
2017-08-11 20:59:13 +02:00
|
|
|
|
|
|
|
Slice result;
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
s = reader->Read(rounddown_offset + chunk_len,
|
|
|
|
static_cast<size_t>(roundup_len - chunk_len), &result,
|
2019-06-19 23:07:36 +02:00
|
|
|
buffer_.BufferStart() + chunk_len, for_compaction);
|
2017-08-11 20:59:13 +02:00
|
|
|
if (s.ok()) {
|
2018-01-26 21:50:48 +01:00
|
|
|
buffer_offset_ = rounddown_offset;
|
2018-09-06 03:07:53 +02:00
|
|
|
buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
|
2017-08-11 20:59:13 +02:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
|
2019-06-19 23:07:36 +02:00
|
|
|
Slice* result, bool for_compaction) {
|
2018-07-20 23:31:27 +02:00
|
|
|
if (track_min_offset_ && offset < min_offset_read_) {
|
2018-09-06 03:07:53 +02:00
|
|
|
min_offset_read_ = static_cast<size_t>(offset);
|
2018-07-20 23:31:27 +02:00
|
|
|
}
|
|
|
|
if (!enable_ || offset < buffer_offset_) {
|
2017-08-11 20:59:13 +02:00
|
|
|
return false;
|
|
|
|
}
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
|
|
|
|
// If the buffer contains only a few of the requested bytes:
|
|
|
|
// If readahead is enabled: prefetch the remaining bytes + readadhead bytes
|
|
|
|
// and satisfy the request.
|
|
|
|
// If readahead is not enabled: return false.
|
2018-07-12 00:42:49 +02:00
|
|
|
if (offset + n > buffer_offset_ + buffer_.CurrentSize()) {
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
if (readahead_size_ > 0) {
|
|
|
|
assert(file_reader_ != nullptr);
|
|
|
|
assert(max_readahead_size_ >= readahead_size_);
|
Compaction Reads should read no more than compaction_readahead_size bytes, when set! (#5498)
Summary:
As a result of https://github.com/facebook/rocksdb/issues/5431 the compaction_readahead_size given by a user was not used exactly, the reason being the code behind readahead for user-read and compaction-read was unified in the above PR and the behavior for user-read is to read readahead_size+n bytes (see FilePrefetchBuffer::TryReadFromCache method). Before the unification the ReadaheadRandomAccessFileReader used compaction_readahead_size as it is.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5498
Test Plan:
Ran strace command : strace -e pread64 -f -T -t ./db_compaction_test --gtest_filter=DBCompactionTest.PartialManualCompaction
In the test the compaction_readahead_size was configured to 2MB and verified the pread syscall did indeed request 2MB. Before the change it was requesting more than 2MB.
Strace Output:
strace: Process 3798982 attached
Note: Google Test filter = DBCompactionTest.PartialManualCompaction
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBCompactionTest
[ RUN ] DBCompactionTest.PartialManualCompaction
strace: Process 3798983 attached
strace: Process 3798984 attached
strace: Process 3798985 attached
strace: Process 3798986 attached
strace: Process 3798987 attached
strace: Process 3798992 attached
[pid 3798987] 12:07:05 +++ exited with 0 +++
strace: Process 3798993 attached
[pid 3798993] 12:07:05 +++ exited with 0 +++
strace: Process 3798994 attached
strace: Process 3799008 attached
strace: Process 3799009 attached
[pid 3799008] 12:07:05 +++ exited with 0 +++
strace: Process 3799010 attached
[pid 3799009] 12:07:05 +++ exited with 0 +++
strace: Process 3799011 attached
[pid 3799010] 12:07:05 +++ exited with 0 +++
[pid 3799011] 12:07:05 +++ exited with 0 +++
strace: Process 3799012 attached
[pid 3799012] 12:07:05 +++ exited with 0 +++
strace: Process 3799013 attached
strace: Process 3799014 attached
[pid 3799013] 12:07:05 +++ exited with 0 +++
strace: Process 3799015 attached
[pid 3799014] 12:07:05 +++ exited with 0 +++
[pid 3799015] 12:07:05 +++ exited with 0 +++
strace: Process 3799016 attached
[pid 3799016] 12:07:05 +++ exited with 0 +++
strace: Process 3799017 attached
[pid 3799017] 12:07:05 +++ exited with 0 +++
strace: Process 3799019 attached
[pid 3799019] 12:07:05 +++ exited with 0 +++
strace: Process 3799020 attached
strace: Process 3799021 attached
[pid 3799020] 12:07:05 +++ exited with 0 +++
[pid 3799021] 12:07:05 +++ exited with 0 +++
strace: Process 3799022 attached
[pid 3799022] 12:07:05 +++ exited with 0 +++
strace: Process 3799023 attached
[pid 3799023] 12:07:05 +++ exited with 0 +++
strace: Process 3799047 attached
strace: Process 3799048 attached
[pid 3799047] 12:07:06 +++ exited with 0 +++
[pid 3799048] 12:07:06 +++ exited with 0 +++
[pid 3798994] 12:07:06 +++ exited with 0 +++
strace: Process 3799052 attached
[pid 3799052] 12:07:06 +++ exited with 0 +++
strace: Process 3799054 attached
strace: Process 3799069 attached
strace: Process 3799070 attached
[pid 3799069] 12:07:06 +++ exited with 0 +++
strace: Process 3799071 attached
[pid 3799070] 12:07:06 +++ exited with 0 +++
[pid 3799071] 12:07:06 +++ exited with 0 +++
strace: Process 3799072 attached
strace: Process 3799073 attached
[pid 3799072] 12:07:06 +++ exited with 0 +++
[pid 3799073] 12:07:06 +++ exited with 0 +++
strace: Process 3799074 attached
[pid 3799074] 12:07:06 +++ exited with 0 +++
strace: Process 3799075 attached
[pid 3799075] 12:07:06 +++ exited with 0 +++
strace: Process 3799076 attached
[pid 3799076] 12:07:06 +++ exited with 0 +++
strace: Process 3799077 attached
[pid 3799077] 12:07:06 +++ exited with 0 +++
strace: Process 3799078 attached
[pid 3799078] 12:07:06 +++ exited with 0 +++
strace: Process 3799079 attached
[pid 3799079] 12:07:06 +++ exited with 0 +++
strace: Process 3799080 attached
[pid 3799080] 12:07:06 +++ exited with 0 +++
strace: Process 3799081 attached
[pid 3799081] 12:07:06 +++ exited with 0 +++
strace: Process 3799082 attached
[pid 3799082] 12:07:06 +++ exited with 0 +++
strace: Process 3799083 attached
[pid 3799083] 12:07:06 +++ exited with 0 +++
strace: Process 3799086 attached
strace: Process 3799087 attached
[pid 3798984] 12:07:06 pread64(9, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000121>
[pid 3798984] 12:07:06 pread64(9, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000106>
[pid 3798984] 12:07:06 pread64(9, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000081>
[pid 3798984] 12:07:06 pread64(9, "\0\v\3foo\2\7\0\0\0\0\0\0\0\270 \0\v\4foo\2\3\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000138>
[pid 3798984] 12:07:06 pread64(11, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000097>
[pid 3798984] 12:07:06 pread64(11, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086>
[pid 3798984] 12:07:06 pread64(11, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000064>
[pid 3798984] 12:07:06 pread64(11, "\0\v\3foo\2\21\0\0\0\0\0\0\0\270 \0\v\4foo\2\r\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000064>
[pid 3798984] 12:07:06 pread64(12, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(12, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000090>
[pid 3798984] 12:07:06 pread64(12, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000059>
[pid 3798984] 12:07:06 pread64(12, "\0\v\3foo\2\33\0\0\0\0\0\0\0\270 \0\v\4foo\2\27\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065>
[pid 3798984] 12:07:06 pread64(13, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000070>
[pid 3798984] 12:07:06 pread64(13, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000059>
[pid 3798984] 12:07:06 pread64(13, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000061>
[pid 3798984] 12:07:06 pread64(13, "\0\v\3foo\2%\0\0\0\0\0\0\0\270 \0\v\4foo\2!\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065>
[pid 3798984] 12:07:06 pread64(14, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000118>
[pid 3798984] 12:07:06 pread64(14, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093>
[pid 3798984] 12:07:06 pread64(14, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000050>
[pid 3798984] 12:07:06 pread64(14, "\0\v\3foo\2/\0\0\0\0\0\0\0\270 \0\v\4foo\2+\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000082>
[pid 3798984] 12:07:06 pread64(15, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(15, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086>
[pid 3798984] 12:07:06 pread64(15, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000091>
[pid 3798984] 12:07:06 pread64(15, "\0\v\3foo\0029\0\0\0\0\0\0\0\270 \0\v\4foo\0025\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000174>
[pid 3798984] 12:07:06 pread64(16, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(16, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093>
[pid 3798984] 12:07:06 pread64(16, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000194>
[pid 3798984] 12:07:06 pread64(16, "\0\v\3foo\2C\0\0\0\0\0\0\0\270 \0\v\4foo\2?\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000086>
[pid 3798984] 12:07:06 pread64(17, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000079>
[pid 3798984] 12:07:06 pread64(17, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000047>
[pid 3798984] 12:07:06 pread64(17, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000045>
[pid 3798984] 12:07:06 pread64(17, "\0\v\3foo\2M\0\0\0\0\0\0\0\270 \0\v\4foo\2I\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000107>
[pid 3798983] 12:07:06 pread64(17, "\0\v\200\10foo\2P\0\0\0\0\0\0)U?MSg_)j(roFn($e"..., 2097152, 0) = 11230 <0.000091>
[pid 3798983] 12:07:06 pread64(17, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(16, "\0\v\200\10foo\2F\0\0\0\0\0\0k[h3%.OPH_^:\\S7T&"..., 2097152, 0) = 11230 <0.000083>
[pid 3798983] 12:07:06 pread64(16, "", 2085922, 11230) = 0 <0.000078>
[pid 3798983] 12:07:06 pread64(15, "\0\v\200\10foo\2<\0\0\0\0\0\0+qToi_c{*S+4:N(:"..., 2097152, 0) = 11230 <0.000095>
[pid 3798983] 12:07:06 pread64(15, "", 2085922, 11230) = 0 <0.000067>
[pid 3798983] 12:07:06 pread64(14, "\0\v\200\10foo\0022\0\0\0\0\0\0%hw%OMa\"}9I609Q!B"..., 2097152, 0) = 11230 <0.000111>
[pid 3798983] 12:07:06 pread64(14, "", 2085922, 11230) = 0 <0.000093>
[pid 3798983] 12:07:06 pread64(13, "\0\v\200\10foo\2(\0\0\0\0\0\0p}Y&mu^DcaSGb2&nP"..., 2097152, 0) = 11230 <0.000128>
[pid 3798983] 12:07:06 pread64(13, "", 2085922, 11230) = 0 <0.000076>
[pid 3798983] 12:07:06 pread64(12, "\0\v\200\10foo\2\36\0\0\0\0\0\0YIyW#]oSs^6VHfB<`"..., 2097152, 0) = 11230 <0.000092>
[pid 3798983] 12:07:06 pread64(12, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(11, "\0\v\200\10foo\2\24\0\0\0\0\0\0mfF8Jel/*Zf :-#s("..., 2097152, 0) = 11230 <0.000088>
[pid 3798983] 12:07:06 pread64(11, "", 2085922, 11230) = 0 <0.000067>
[pid 3798983] 12:07:06 pread64(9, "\0\v\200\10foo\2\n\0\0\0\0\0\0\\X'cjiHX)D,RSj1X!"..., 2097152, 0) = 11230 <0.000115>
[pid 3798983] 12:07:06 pread64(9, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(8, "\1\315\5 \36\30\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 754) = 53 <0.000098>
[pid 3798983] 12:07:06 pread64(8, "\0\22\3rocksdb.properties;\215\5\0\0\0\0\1\0\0\0"..., 37, 717) = 37 <0.000064>
[pid 3798983] 12:07:06 pread64(8, "\0$\4rocksdb.block.based.table.ind"..., 658, 59) = 658 <0.000074>
[pid 3798983] 12:07:06 pread64(8, "\0\v\2foo\1\0\0\0\0\0\0\0\0\31\0\0\0\0\1\0\0\0\0\212\216\222P", 29, 30) = 29 <0.000064>
[pid 3799086] 12:07:06 +++ exited with 0 +++
[pid 3799087] 12:07:06 +++ exited with 0 +++
[pid 3799054] 12:07:06 +++ exited with 0 +++
strace: Process 3799104 attached
[pid 3799104] 12:07:06 +++ exited with 0 +++
[ OK ] DBCompactionTest.PartialManualCompaction (757 ms)
[----------] 1 test from DBCompactionTest (758 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (759 ms total)
[ PASSED ] 1 test.
[pid 3798983] 12:07:06 +++ exited with 0 +++
[pid 3798984] 12:07:06 +++ exited with 0 +++
[pid 3798992] 12:07:06 +++ exited with 0 +++
[pid 3798986] 12:07:06 +++ exited with 0 +++
[pid 3798982] 12:07:06 +++ exited with 0 +++
[pid 3798985] 12:07:06 +++ exited with 0 +++
12:07:06 +++ exited with 0 +++
Differential Revision: D15948422
Pulled By: vjnadimpalli
fbshipit-source-id: 9b189d1e8675d290c7784e4b33e5d3b5761d2ac8
2019-06-22 06:07:09 +02:00
|
|
|
Status s;
|
|
|
|
if (for_compaction) {
|
Fix a bug in compaction reads causing checksum mismatches and asan errors (#5531)
Summary:
Fixed a bug in compaction reads due to which incorrect number of bytes were being read/utilized. The bug was introduced in https://github.com/facebook/rocksdb/issues/5498 , resulting in "Corruption: block checksum mismatch" and "heap-buffer-overflow" asan errors in our tests.
https://github.com/facebook/rocksdb/issues/5498 was introduced recently and is not in any released versions.
ASAN:
```
> ==2280939==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x6250005e83da at pc 0x000000d57f62 bp 0x7f954f483770 sp 0x7f954f482f20
> === How to use this, how to get the raw stack trace, and more: fburl.com/ASAN ===
> READ of size 4 at 0x6250005e83da thread T4
> SCARINESS: 27 (4-byte-read-heap-buffer-overflow-far-from-bounds)
> #0 tests+0xd57f61 __asan_memcpy
> https://github.com/facebook/rocksdb/issues/1 rocksdb/src/util/coding.h:124 rocksdb::DecodeFixed32(char const*)
> https://github.com/facebook/rocksdb/issues/2 rocksdb/src/table/block_fetcher.cc:39 rocksdb::BlockFetcher::CheckBlockChecksum()
> https://github.com/facebook/rocksdb/issues/3 rocksdb/src/table/block_fetcher.cc:99 rocksdb::BlockFetcher::TryGetFromPrefetchBuffer()
> https://github.com/facebook/rocksdb/issues/4 rocksdb/src/table/block_fetcher.cc:209 rocksdb::BlockFetcher::ReadBlockContents()
> https://github.com/facebook/rocksdb/issues/5 rocksdb/src/table/block_based/block_based_table_reader.cc:93 rocksdb::(anonymous namespace)::ReadBlockFromFile(rocksdb::RandomAccessFileReader*, rocksdb::FilePrefetchBuffer*, rocksdb::Footer const&, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, std::unique_ptr<...>*, rocksdb::ImmutableCFOptions const&, bool, bool, rocksdb::UncompressionDict
const&, rocksdb::PersistentCacheOptions const&, unsigned long, unsigned long, rocksdb::MemoryAllocator*, bool)
> https://github.com/facebook/rocksdb/issues/6 rocksdb/src/table/block_based/block_based_table_reader.cc:2331 rocksdb::BlockBasedTable::RetrieveBlock(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<...>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool) const
> https://github.com/facebook/rocksdb/issues/7 rocksdb/src/table/block_based/block_based_table_reader.cc:2090 rocksdb::DataBlockIter* rocksdb::BlockBasedTable::NewDataBlockIterator<...>(rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::DataBlockIter*, rocksdb::BlockType, bool, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::Status, rocksdb::FilePrefetchBuffe
r*, bool) const
> https://github.com/facebook/rocksdb/issues/8 rocksdb/src/table/block_based/block_based_table_reader.cc:2720 rocksdb::BlockBasedTableIterator<...>::InitDataBlock()
> https://github.com/facebook/rocksdb/issues/9 rocksdb/src/table/block_based/block_based_table_reader.cc:2607 rocksdb::BlockBasedTableIterator<...>::SeekToFirst()
> https://github.com/facebook/rocksdb/issues/10 rocksdb/src/table/iterator_wrapper.h:83 rocksdb::IteratorWrapperBase<...>::SeekToFirst()
> https://github.com/facebook/rocksdb/issues/11 rocksdb/src/table/merging_iterator.cc:100 rocksdb::MergingIterator::SeekToFirst()
> https://github.com/facebook/rocksdb/issues/12 rocksdb/compaction/compaction_job.cc:877 rocksdb::CompactionJob::ProcessKeyValueCompaction(rocksdb::CompactionJob::SubcompactionState*)
> https://github.com/facebook/rocksdb/issues/13 rocksdb/compaction/compaction_job.cc:590 rocksdb::CompactionJob::Run()
> https://github.com/facebook/rocksdb/issues/14 rocksdb/db_impl/db_impl_compaction_flush.cc:2689 rocksdb::DBImpl::BackgroundCompaction(bool*, rocksdb::JobContext*, rocksdb::LogBuffer*, rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority)
> https://github.com/facebook/rocksdb/issues/15 rocksdb/db_impl/db_impl_compaction_flush.cc:2248 rocksdb::DBImpl::BackgroundCallCompaction(rocksdb::DBImpl::PrepickedCompaction*, rocksdb::Env::Priority)
> https://github.com/facebook/rocksdb/issues/16 rocksdb/db_impl/db_impl_compaction_flush.cc:2024 rocksdb::DBImpl::BGWorkCompaction(void*)
> https://github.com/facebook/rocksdb/issues/23 rocksdb/src/util/threadpool_imp.cc:266 rocksdb::ThreadPoolImpl::Impl::BGThread(unsigned long)
> https://github.com/facebook/rocksdb/issues/24 rocksdb/src/util/threadpool_imp.cc:307 rocksdb::ThreadPoolImpl::Impl::BGThreadWrapper(void*)
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5531
Test Plan: Verified that this fixes the fb-internal Logdevice test which caught the issue.
Differential Revision: D16109702
Pulled By: sagar0
fbshipit-source-id: 1fc08549cf7b553e338a133ae11eb9f4d5011914
2019-07-04 03:36:08 +02:00
|
|
|
s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), for_compaction);
|
Compaction Reads should read no more than compaction_readahead_size bytes, when set! (#5498)
Summary:
As a result of https://github.com/facebook/rocksdb/issues/5431 the compaction_readahead_size given by a user was not used exactly, the reason being the code behind readahead for user-read and compaction-read was unified in the above PR and the behavior for user-read is to read readahead_size+n bytes (see FilePrefetchBuffer::TryReadFromCache method). Before the unification the ReadaheadRandomAccessFileReader used compaction_readahead_size as it is.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5498
Test Plan:
Ran strace command : strace -e pread64 -f -T -t ./db_compaction_test --gtest_filter=DBCompactionTest.PartialManualCompaction
In the test the compaction_readahead_size was configured to 2MB and verified the pread syscall did indeed request 2MB. Before the change it was requesting more than 2MB.
Strace Output:
strace: Process 3798982 attached
Note: Google Test filter = DBCompactionTest.PartialManualCompaction
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBCompactionTest
[ RUN ] DBCompactionTest.PartialManualCompaction
strace: Process 3798983 attached
strace: Process 3798984 attached
strace: Process 3798985 attached
strace: Process 3798986 attached
strace: Process 3798987 attached
strace: Process 3798992 attached
[pid 3798987] 12:07:05 +++ exited with 0 +++
strace: Process 3798993 attached
[pid 3798993] 12:07:05 +++ exited with 0 +++
strace: Process 3798994 attached
strace: Process 3799008 attached
strace: Process 3799009 attached
[pid 3799008] 12:07:05 +++ exited with 0 +++
strace: Process 3799010 attached
[pid 3799009] 12:07:05 +++ exited with 0 +++
strace: Process 3799011 attached
[pid 3799010] 12:07:05 +++ exited with 0 +++
[pid 3799011] 12:07:05 +++ exited with 0 +++
strace: Process 3799012 attached
[pid 3799012] 12:07:05 +++ exited with 0 +++
strace: Process 3799013 attached
strace: Process 3799014 attached
[pid 3799013] 12:07:05 +++ exited with 0 +++
strace: Process 3799015 attached
[pid 3799014] 12:07:05 +++ exited with 0 +++
[pid 3799015] 12:07:05 +++ exited with 0 +++
strace: Process 3799016 attached
[pid 3799016] 12:07:05 +++ exited with 0 +++
strace: Process 3799017 attached
[pid 3799017] 12:07:05 +++ exited with 0 +++
strace: Process 3799019 attached
[pid 3799019] 12:07:05 +++ exited with 0 +++
strace: Process 3799020 attached
strace: Process 3799021 attached
[pid 3799020] 12:07:05 +++ exited with 0 +++
[pid 3799021] 12:07:05 +++ exited with 0 +++
strace: Process 3799022 attached
[pid 3799022] 12:07:05 +++ exited with 0 +++
strace: Process 3799023 attached
[pid 3799023] 12:07:05 +++ exited with 0 +++
strace: Process 3799047 attached
strace: Process 3799048 attached
[pid 3799047] 12:07:06 +++ exited with 0 +++
[pid 3799048] 12:07:06 +++ exited with 0 +++
[pid 3798994] 12:07:06 +++ exited with 0 +++
strace: Process 3799052 attached
[pid 3799052] 12:07:06 +++ exited with 0 +++
strace: Process 3799054 attached
strace: Process 3799069 attached
strace: Process 3799070 attached
[pid 3799069] 12:07:06 +++ exited with 0 +++
strace: Process 3799071 attached
[pid 3799070] 12:07:06 +++ exited with 0 +++
[pid 3799071] 12:07:06 +++ exited with 0 +++
strace: Process 3799072 attached
strace: Process 3799073 attached
[pid 3799072] 12:07:06 +++ exited with 0 +++
[pid 3799073] 12:07:06 +++ exited with 0 +++
strace: Process 3799074 attached
[pid 3799074] 12:07:06 +++ exited with 0 +++
strace: Process 3799075 attached
[pid 3799075] 12:07:06 +++ exited with 0 +++
strace: Process 3799076 attached
[pid 3799076] 12:07:06 +++ exited with 0 +++
strace: Process 3799077 attached
[pid 3799077] 12:07:06 +++ exited with 0 +++
strace: Process 3799078 attached
[pid 3799078] 12:07:06 +++ exited with 0 +++
strace: Process 3799079 attached
[pid 3799079] 12:07:06 +++ exited with 0 +++
strace: Process 3799080 attached
[pid 3799080] 12:07:06 +++ exited with 0 +++
strace: Process 3799081 attached
[pid 3799081] 12:07:06 +++ exited with 0 +++
strace: Process 3799082 attached
[pid 3799082] 12:07:06 +++ exited with 0 +++
strace: Process 3799083 attached
[pid 3799083] 12:07:06 +++ exited with 0 +++
strace: Process 3799086 attached
strace: Process 3799087 attached
[pid 3798984] 12:07:06 pread64(9, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000121>
[pid 3798984] 12:07:06 pread64(9, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000106>
[pid 3798984] 12:07:06 pread64(9, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000081>
[pid 3798984] 12:07:06 pread64(9, "\0\v\3foo\2\7\0\0\0\0\0\0\0\270 \0\v\4foo\2\3\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000138>
[pid 3798984] 12:07:06 pread64(11, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000097>
[pid 3798984] 12:07:06 pread64(11, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086>
[pid 3798984] 12:07:06 pread64(11, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000064>
[pid 3798984] 12:07:06 pread64(11, "\0\v\3foo\2\21\0\0\0\0\0\0\0\270 \0\v\4foo\2\r\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000064>
[pid 3798984] 12:07:06 pread64(12, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(12, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000090>
[pid 3798984] 12:07:06 pread64(12, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000059>
[pid 3798984] 12:07:06 pread64(12, "\0\v\3foo\2\33\0\0\0\0\0\0\0\270 \0\v\4foo\2\27\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065>
[pid 3798984] 12:07:06 pread64(13, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000070>
[pid 3798984] 12:07:06 pread64(13, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000059>
[pid 3798984] 12:07:06 pread64(13, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000061>
[pid 3798984] 12:07:06 pread64(13, "\0\v\3foo\2%\0\0\0\0\0\0\0\270 \0\v\4foo\2!\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000065>
[pid 3798984] 12:07:06 pread64(14, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000118>
[pid 3798984] 12:07:06 pread64(14, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093>
[pid 3798984] 12:07:06 pread64(14, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000050>
[pid 3798984] 12:07:06 pread64(14, "\0\v\3foo\2/\0\0\0\0\0\0\0\270 \0\v\4foo\2+\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000082>
[pid 3798984] 12:07:06 pread64(15, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(15, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000086>
[pid 3798984] 12:07:06 pread64(15, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000091>
[pid 3798984] 12:07:06 pread64(15, "\0\v\3foo\0029\0\0\0\0\0\0\0\270 \0\v\4foo\0025\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000174>
[pid 3798984] 12:07:06 pread64(16, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000080>
[pid 3798984] 12:07:06 pread64(16, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000093>
[pid 3798984] 12:07:06 pread64(16, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000194>
[pid 3798984] 12:07:06 pread64(16, "\0\v\3foo\2C\0\0\0\0\0\0\0\270 \0\v\4foo\2?\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000086>
[pid 3798984] 12:07:06 pread64(17, "\1\203W!\241QE\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 11177) = 53 <0.000079>
[pid 3798984] 12:07:06 pread64(17, "\0\22\4rocksdb.properties\353Q\223\5\0\0\0\0\1\0\0"..., 38, 11139) = 38 <0.000047>
[pid 3798984] 12:07:06 pread64(17, "\0$\4rocksdb.block.based.table.ind"..., 664, 10475) = 664 <0.000045>
[pid 3798984] 12:07:06 pread64(17, "\0\v\3foo\2M\0\0\0\0\0\0\0\270 \0\v\4foo\2I\0\0\0\0\0\0\275"..., 74, 10401) = 74 <0.000107>
[pid 3798983] 12:07:06 pread64(17, "\0\v\200\10foo\2P\0\0\0\0\0\0)U?MSg_)j(roFn($e"..., 2097152, 0) = 11230 <0.000091>
[pid 3798983] 12:07:06 pread64(17, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(16, "\0\v\200\10foo\2F\0\0\0\0\0\0k[h3%.OPH_^:\\S7T&"..., 2097152, 0) = 11230 <0.000083>
[pid 3798983] 12:07:06 pread64(16, "", 2085922, 11230) = 0 <0.000078>
[pid 3798983] 12:07:06 pread64(15, "\0\v\200\10foo\2<\0\0\0\0\0\0+qToi_c{*S+4:N(:"..., 2097152, 0) = 11230 <0.000095>
[pid 3798983] 12:07:06 pread64(15, "", 2085922, 11230) = 0 <0.000067>
[pid 3798983] 12:07:06 pread64(14, "\0\v\200\10foo\0022\0\0\0\0\0\0%hw%OMa\"}9I609Q!B"..., 2097152, 0) = 11230 <0.000111>
[pid 3798983] 12:07:06 pread64(14, "", 2085922, 11230) = 0 <0.000093>
[pid 3798983] 12:07:06 pread64(13, "\0\v\200\10foo\2(\0\0\0\0\0\0p}Y&mu^DcaSGb2&nP"..., 2097152, 0) = 11230 <0.000128>
[pid 3798983] 12:07:06 pread64(13, "", 2085922, 11230) = 0 <0.000076>
[pid 3798983] 12:07:06 pread64(12, "\0\v\200\10foo\2\36\0\0\0\0\0\0YIyW#]oSs^6VHfB<`"..., 2097152, 0) = 11230 <0.000092>
[pid 3798983] 12:07:06 pread64(12, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(11, "\0\v\200\10foo\2\24\0\0\0\0\0\0mfF8Jel/*Zf :-#s("..., 2097152, 0) = 11230 <0.000088>
[pid 3798983] 12:07:06 pread64(11, "", 2085922, 11230) = 0 <0.000067>
[pid 3798983] 12:07:06 pread64(9, "\0\v\200\10foo\2\n\0\0\0\0\0\0\\X'cjiHX)D,RSj1X!"..., 2097152, 0) = 11230 <0.000115>
[pid 3798983] 12:07:06 pread64(9, "", 2085922, 11230) = 0 <0.000073>
[pid 3798983] 12:07:06 pread64(8, "\1\315\5 \36\30\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 53, 754) = 53 <0.000098>
[pid 3798983] 12:07:06 pread64(8, "\0\22\3rocksdb.properties;\215\5\0\0\0\0\1\0\0\0"..., 37, 717) = 37 <0.000064>
[pid 3798983] 12:07:06 pread64(8, "\0$\4rocksdb.block.based.table.ind"..., 658, 59) = 658 <0.000074>
[pid 3798983] 12:07:06 pread64(8, "\0\v\2foo\1\0\0\0\0\0\0\0\0\31\0\0\0\0\1\0\0\0\0\212\216\222P", 29, 30) = 29 <0.000064>
[pid 3799086] 12:07:06 +++ exited with 0 +++
[pid 3799087] 12:07:06 +++ exited with 0 +++
[pid 3799054] 12:07:06 +++ exited with 0 +++
strace: Process 3799104 attached
[pid 3799104] 12:07:06 +++ exited with 0 +++
[ OK ] DBCompactionTest.PartialManualCompaction (757 ms)
[----------] 1 test from DBCompactionTest (758 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (759 ms total)
[ PASSED ] 1 test.
[pid 3798983] 12:07:06 +++ exited with 0 +++
[pid 3798984] 12:07:06 +++ exited with 0 +++
[pid 3798992] 12:07:06 +++ exited with 0 +++
[pid 3798986] 12:07:06 +++ exited with 0 +++
[pid 3798982] 12:07:06 +++ exited with 0 +++
[pid 3798985] 12:07:06 +++ exited with 0 +++
12:07:06 +++ exited with 0 +++
Differential Revision: D15948422
Pulled By: vjnadimpalli
fbshipit-source-id: 9b189d1e8675d290c7784e4b33e5d3b5761d2ac8
2019-06-22 06:07:09 +02:00
|
|
|
} else {
|
|
|
|
s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
|
|
|
|
}
|
Improve direct IO range scan performance with readahead (#3884)
Summary:
This PR extends the improvements in #3282 to also work when using Direct IO.
We see **4.5X performance improvement** in seekrandom benchmark doing long range scans, when using direct reads, on flash.
**Description:**
This change improves the performance of iterators doing long range scans (e.g. big/full index or table scans in MyRocks) by using readahead and prefetching additional data on each disk IO, and storing in a local buffer. This prefetching is automatically enabled on noticing more than 2 IOs for the same table file during iteration. The readahead size starts with 8KB and is exponentially increased on each additional sequential IO, up to a max of 256 KB. This helps in cutting down the number of IOs needed to complete the range scan.
**Implementation Details:**
- Used `FilePrefetchBuffer` as the underlying buffer to store the readahead data. `FilePrefetchBuffer` can now take file_reader, readahead_size and max_readahead_size as input to the constructor, and automatically do readahead.
- `FilePrefetchBuffer::TryReadFromCache` can now call `FilePrefetchBuffer::Prefetch` if readahead is enabled.
- `AlignedBuffer` (which is the underlying store for `FilePrefetchBuffer`) now takes a few additional args in `AlignedBuffer::AllocateNewBuffer` to allow copying data from the old buffer.
- Made sure not to re-read partial chunks of data that were already available in the buffer, from device again.
- Fixed a couple of cases where `AlignedBuffer::cursize_` was not being properly kept up-to-date.
**Constraints:**
- Similar to #3282, this gets currently enabled only when ReadOptions.readahead_size = 0 (which is the default value).
- Since the prefetched data is stored in a temporary buffer allocated on heap, this could increase the memory usage if you have many iterators doing long range scans simultaneously.
- Enabled only for user reads, and disabled for compactions. Compaction reads are controlled by the options `use_direct_io_for_flush_and_compaction` and `compaction_readahead_size`, and the current feature takes precautions not to mess with them.
**Benchmarks:**
I used the same benchmark as used in #3282.
Data fill:
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=fillrandom -num=1000000000 -compression_type="none" -level_compaction_dynamic_level_bytes
```
Do a long range scan: Seekrandom with large number of nexts
```
TEST_TMPDIR=/data/users/$USER/benchmarks/iter ./db_bench -benchmarks=seekrandom -use_direct_reads -duration=60 -num=1000000000 -use_existing_db -seek_nexts=10000 -statistics -histogram
```
```
Before:
seekrandom : 37939.906 micros/op 26 ops/sec; 29.2 MB/s (1636 of 1999 found)
With this change:
seekrandom : 8527.720 micros/op 117 ops/sec; 129.7 MB/s (6530 of 7999 found)
```
~4.5X perf improvement. Taken on an average of 3 runs.
Closes https://github.com/facebook/rocksdb/pull/3884
Differential Revision: D8082143
Pulled By: sagar0
fbshipit-source-id: 4d7a8561cbac03478663713df4d31ad2620253bb
2018-06-21 20:02:49 +02:00
|
|
|
if (!s.ok()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-11 20:59:13 +02:00
|
|
|
uint64_t offset_in_buffer = offset - buffer_offset_;
|
|
|
|
*result = Slice(buffer_.BufferStart() + offset_in_buffer, n);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
|
2015-09-11 18:57:02 +02:00
|
|
|
std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) {
|
2015-09-12 02:36:48 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> result(
|
|
|
|
new ReadaheadRandomAccessFile(std::move(file), readahead_size));
|
|
|
|
return result;
|
2015-08-27 00:25:59 +02:00
|
|
|
}
|
|
|
|
|
2015-10-16 23:33:47 +02:00
|
|
|
Status NewWritableFile(Env* env, const std::string& fname,
|
2018-11-09 20:17:34 +01:00
|
|
|
std::unique_ptr<WritableFile>* result,
|
2015-10-16 23:33:47 +02:00
|
|
|
const EnvOptions& options) {
|
|
|
|
Status s = env->NewWritableFile(fname, result, options);
|
|
|
|
TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
RocksDB Trace Analyzer (#4091)
Summary:
A framework of trace analyzing for RocksDB
After collecting the trace by using the tool of [PR #3837](https://github.com/facebook/rocksdb/pull/3837). User can use the Trace Analyzer to interpret, analyze, and characterize the collected workload.
**Input:**
1. trace file
2. Whole keys space file
**Statistics:**
1. Access count of each operation (Get, Put, Delete, SingleDelete, DeleteRange, Merge) in each column family.
2. Key hotness (access count) of each one
3. Key space separation based on given prefix
4. Key size distribution
5. Value size distribution if appliable
6. Top K accessed keys
7. QPS statistics including the average QPS and peak QPS
8. Top K accessed prefix
9. The query correlation analyzing, output the number of X after Y and the corresponding average time
intervals
**Output:**
1. key access heat map (either in the accessed key space or whole key space)
2. trace sequence file (interpret the raw trace file to line base text file for future use)
3. Time serial (The key space ID and its access time)
4. Key access count distritbution
5. Key size distribution
6. Value size distribution (in each intervals)
7. whole key space separation by the prefix
8. Accessed key space separation by the prefix
9. QPS of each operation and each column family
10. Top K QPS and their accessed prefix range
**Test:**
1. Added the unit test of analyzing Get, Put, Delete, SingleDelete, DeleteRange, Merge
2. Generated the trace and analyze the trace
**Implemented but not tested (due to the limitation of trace_replay):**
1. Analyzing Iterator, supporting Seek() and SeekForPrev() analyzing
2. Analyzing the number of Key found by Get
**Future Work:**
1. Support execution time analyzing of each requests
2. Support cache hit situation and block read situation of Get
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4091
Differential Revision: D9256157
Pulled By: zhichao-cao
fbshipit-source-id: f0ceacb7eedbc43a3eee6e85b76087d7832a8fe6
2018-08-13 20:32:04 +02:00
|
|
|
bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file,
|
|
|
|
std::string* output, bool* has_data, Status* result) {
|
|
|
|
const int kBufferSize = 8192;
|
|
|
|
char buffer[kBufferSize + 1];
|
|
|
|
Slice input_slice;
|
|
|
|
|
|
|
|
std::string line;
|
|
|
|
bool has_complete_line = false;
|
|
|
|
while (!has_complete_line) {
|
|
|
|
if (std::getline(*iss, line)) {
|
|
|
|
has_complete_line = !iss->eof();
|
|
|
|
} else {
|
|
|
|
has_complete_line = false;
|
|
|
|
}
|
|
|
|
if (!has_complete_line) {
|
|
|
|
// if we're not sure whether we have a complete line,
|
|
|
|
// further read from the file.
|
|
|
|
if (*has_data) {
|
|
|
|
*result = seq_file->Read(kBufferSize, &input_slice, buffer);
|
|
|
|
}
|
|
|
|
if (input_slice.size() == 0) {
|
|
|
|
// meaning we have read all the data
|
|
|
|
*has_data = false;
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
iss->str(line + input_slice.ToString());
|
|
|
|
// reset the internal state of iss so that we can keep reading it.
|
|
|
|
iss->clear();
|
|
|
|
*has_data = (input_slice.size() == kBufferSize);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*output = line;
|
|
|
|
return *has_data || has_complete_line;
|
|
|
|
}
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
} // namespace rocksdb
|