2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "util/file_reader_writer.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
2015-08-27 00:25:59 +02:00
|
|
|
#include <mutex>
|
|
|
|
|
2017-04-06 04:02:00 +02:00
|
|
|
#include "monitoring/histogram.h"
|
|
|
|
#include "monitoring/iostats_context_imp.h"
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
#include "port/port.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/rate_limiter.h"
|
|
|
|
#include "util/sync_point.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
2015-09-12 02:36:48 +02:00
|
|
|
|
2017-05-05 20:58:10 +02:00
|
|
|
#ifndef NDEBUG
|
|
|
|
namespace {
|
2017-05-18 03:46:03 +02:00
|
|
|
// Reports whether `off` is an exact multiple of `sector_size`.
// `sector_size` is used as a divisor, so it must be non-zero.
bool IsFileSectorAligned(const size_t off, size_t sector_size) {
  const size_t remainder = off % sector_size;
  return remainder == 0;
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
// Reads up to `n` bytes from the current position of the wrapped file.
//
// Buffered mode simply forwards to file_->Read(). In direct I/O mode the
// request is widened to alignment boundaries, read into a temporary aligned
// buffer, and only the requested sub-range is copied into `scratch`.
//
// On return `*result` describes the bytes obtained; its size is also added to
// the `bytes_read` I/O stat. The returned Status is that of the underlying
// read call.
Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
  Status status;
  if (!use_direct_io()) {
    status = file_->Read(n, result, scratch);
  } else {
#ifndef ROCKSDB_LITE
    // Claim the range [start, start + n) up front; offset_ is advanced by the
    // full `n` regardless of how many bytes the read ends up returning.
    const size_t start = offset_.fetch_add(n);
    const size_t alignment = file_->GetRequiredBufferAlignment();
    const size_t aligned_start = TruncateToPageBoundary(alignment, start);
    // Number of leading bytes in the aligned read that precede `start`.
    const size_t lead_in = start - aligned_start;
    const size_t aligned_len = Roundup(start + n, alignment) - aligned_start;

    AlignedBuffer aligned_buf;
    aligned_buf.Alignment(alignment);
    aligned_buf.AllocateNewBuffer(aligned_len);

    Slice raw;
    status = file_->PositionedRead(aligned_start, aligned_len, &raw,
                                   aligned_buf.BufferStart());

    size_t copied = 0;
    if (status.ok() && lead_in < raw.size()) {
      // Tell the buffer how much of it is valid, then copy out the part the
      // caller asked for (clamped to what was actually read).
      aligned_buf.Size(raw.size());
      copied = aligned_buf.Read(scratch, lead_in,
                                std::min(raw.size() - lead_in, n));
    }
    *result = Slice(scratch, copied);
#endif  // !ROCKSDB_LITE
  }
  IOSTATS_ADD(bytes_read, result->size());
  return status;
}
|
|
|
|
|
2017-01-12 01:42:07 +01:00
|
|
|
|
|
|
|
// Advances the read position by `n` bytes.
//
// In direct I/O mode (non-LITE builds only) the reader tracks its own
// position in offset_, so skipping just bumps that counter and reports OK.
// Otherwise the request is delegated to the wrapped file.
Status SequentialFileReader::Skip(uint64_t n) {
#ifdef ROCKSDB_LITE
  return file_->Skip(n);
#else
  if (!use_direct_io()) {
    return file_->Skip(n);
  }
  offset_ += n;
  return Status::OK();
#endif  // ROCKSDB_LITE
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
|
|
|
// Reads up to `n` bytes starting at file position `offset`.
//
// `scratch` must be able to hold `n` bytes. On return `*result` describes the
// bytes obtained; it may point somewhere other than `scratch` in buffered
// mode (see the mmap note below). The returned Status is that of the last
// underlying read call.
//
// Rate limiting: when this reader is flagged `for_compaction_` and has a
// `rate_limiter_`, the read is split into chunks whose sizes are granted by
// the limiter. Other reads are issued as a single request. This applies
// uniformly to both the direct I/O and the buffered path.
//
// Instrumentation: the whole read is timed by a StopWatch and the read_nanos
// I/O stat timer; the elapsed time is added to `file_read_hist_` when both
// `stats_` and `file_read_hist_` are set.
Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
                                    char* scratch) const {
  Status s;
  uint64_t elapsed = 0;
  {
    // `elapsed` is only handed to the stopwatch when statistics are enabled.
    StopWatch sw(env_, stats_, hist_type_,
                 (stats_ != nullptr) ? &elapsed : nullptr);
    IOSTATS_TIMER_GUARD(read_nanos);
    if (use_direct_io()) {
#ifndef ROCKSDB_LITE
      // Widen the request to alignment boundaries and read into a temporary
      // aligned buffer; the requested sub-range is copied out afterwards.
      size_t alignment = file_->GetRequiredBufferAlignment();
      size_t aligned_offset = TruncateToPageBoundary(alignment, offset);
      size_t offset_advance = offset - aligned_offset;
      size_t read_size = Roundup(offset + n, alignment) - aligned_offset;
      AlignedBuffer buf;
      buf.Alignment(alignment);
      buf.AllocateNewBuffer(read_size);
      while (buf.CurrentSize() < read_size) {
        size_t allowed;
        // Only throttle reads issued on behalf of compaction, matching the
        // buffered path below.
        if (for_compaction_ && rate_limiter_ != nullptr) {
          allowed = rate_limiter_->RequestToken(
              buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
              Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
        } else {
          // Unthrottled: the whole aligned range is read in one request, so
          // this branch is only ever reached on the first iteration.
          assert(buf.CurrentSize() == 0);
          allowed = read_size;
        }
        Slice tmp;
        s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp,
                        buf.Destination());
        buf.Size(buf.CurrentSize() + tmp.size());
        // Stop on error or on a short read.
        if (!s.ok() || tmp.size() < allowed) {
          break;
        }
      }
      size_t res_len = 0;
      if (s.ok() && offset_advance < buf.CurrentSize()) {
        // Copy out only the bytes the caller asked for, clamped to what was
        // actually read past the alignment lead-in.
        res_len = buf.Read(scratch, offset_advance,
                           std::min(buf.CurrentSize() - offset_advance, n));
      }
      *result = Slice(scratch, res_len);
#endif  // !ROCKSDB_LITE
    } else {
      size_t pos = 0;
      const char* res_scratch = nullptr;
      while (pos < n) {
        size_t allowed;
        if (for_compaction_ && rate_limiter_ != nullptr) {
          allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */,
                                                Env::IOPriority::IO_LOW, stats_,
                                                RateLimiter::OpType::kRead);
        } else {
          allowed = n;
        }
        Slice tmp_result;
        s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos);
        if (res_scratch == nullptr) {
          // we can't simply use `scratch` because reads of mmap'd files return
          // data in a different buffer.
          res_scratch = tmp_result.data();
        } else {
          // make sure chunks are inserted contiguously into `res_scratch`.
          assert(tmp_result.data() == res_scratch + pos);
        }
        pos += tmp_result.size();
        // Stop on error or on a short read.
        if (!s.ok() || tmp_result.size() < allowed) {
          break;
        }
      }
      // On failure an empty slice is reported rather than a partial read.
      *result = Slice(res_scratch, s.ok() ? pos : 0);
    }
    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
  }
  if (stats_ != nullptr && file_read_hist_ != nullptr) {
    file_read_hist_->Add(elapsed);
  }
  return s;
}
|
|
|
|
|
|
|
|
// Appends `data` to the file, going through the internal buffer `buf_`
// whenever possible.
//
// Steps, in order:
//   1. Mark the writer as having unsynced data and notify the file of the
//      upcoming write via PrepareWrite().
//   2. If the buffer lacks room, try to grow it (doubling, capped at
//      max_buffer_size_).
//   3. Buffered I/O only: if there is still not enough room, flush what is
//      already buffered.
//   4. Either copy the payload into the buffer (flushing whenever it fills
//      up), or - buffered I/O with a payload larger than the whole buffer -
//      write it straight through with WriteBuffered().
//
// filesize_ grows by data.size() only when the final status is OK.
Status WritableFileWriter::Append(const Slice& data) {
  const char* cursor = data.data();
  size_t remaining = data.size();
  Status status;
  pending_sync_ = true;

  TEST_KILL_RANDOM("WritableFileWriter::Append:0",
                   rocksdb_kill_odds * REDUCE_ODDS2);

  {
    IOSTATS_TIMER_GUARD(prepare_write_nanos);
    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), remaining);
  }

  // Growing the buffer may let us avoid a flush altogether.
  if (buf_.Capacity() - buf_.CurrentSize() < remaining) {
    size_t cap = buf_.Capacity();
    while (cap < max_buffer_size_) {
      // Next candidate size; never exceeds max_buffer_size_.
      const size_t candidate = std::min(cap * 2, max_buffer_size_);
      const bool payload_fits = candidate - buf_.CurrentSize() >= remaining;
      // With direct I/O, settle for the maximum size even if the payload
      // still does not fit.
      if (payload_fits ||
          (use_direct_io() && candidate == max_buffer_size_)) {
        buf_.AllocateNewBuffer(candidate, true);
        break;
      }
      cap *= 2;
    }
  }

  // Buffered I/O only: make room by flushing what is already buffered.
  if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < remaining) {
    if (buf_.CurrentSize() > 0) {
      status = Flush();
      if (!status.ok()) {
        return status;
      }
    }
    assert(buf_.CurrentSize() == 0);
  }

  if (use_direct_io() || (buf_.Capacity() >= remaining)) {
    // Direct I/O always goes through the buffer; buffered I/O uses it to
    // accumulate payloads that fit.
    while (remaining > 0) {
      const size_t copied = buf_.Append(cursor, remaining);
      cursor += copied;
      remaining -= copied;

      if (remaining == 0) {
        break;
      }
      // Buffer is full but data remains: flush and keep copying.
      status = Flush();
      if (!status.ok()) {
        break;
      }
    }
  } else {
    // Payload exceeds the buffer capacity: write it to the file directly,
    // bypassing the (now empty) buffer.
    assert(buf_.CurrentSize() == 0);
    status = WriteBuffered(cursor, remaining);
  }

  TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds);
  if (status.ok()) {
    filesize_ += data.size();
  }
  return status;
}
|
|
|
|
|
|
|
|
// Flushes any buffered data, closes the underlying file and releases it.
//
// Calling Close() again after the file has been released is a no-op that
// returns OK. Every step is attempted even if an earlier one failed, because
// the file must end up closed; the first error encountered is the one
// returned.
Status WritableFileWriter::Close() {
  // Already closed (e.g. an explicit Close() followed by another one).
  if (!writable_file_) {
    return Status::OK();
  }

  // Push buffered data out first; do not bail out on failure.
  Status result = Flush();

  // Direct I/O writes whole pages, so the file has to be told where the
  // real data ends.
  if (use_direct_io()) {
    const Status truncate_status = writable_file_->Truncate(filesize_);
    if (result.ok() && !truncate_status.ok()) {
      result = truncate_status;
    }
  }

  TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds);

  const Status close_status = writable_file_->Close();
  if (result.ok() && !close_status.ok()) {
    result = close_status;
  }

  writable_file_.reset();
  TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds);

  return result;
}
|
|
|
|
|
2016-12-22 21:51:29 +01:00
|
|
|
// write out the cached data to the OS cache or storage if direct I/O
|
|
|
|
// enabled
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
Status WritableFileWriter::Flush() {
|
2015-09-11 18:57:02 +02:00
|
|
|
Status s;
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
|
|
|
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2015-09-12 02:36:48 +02:00
|
|
|
if (buf_.CurrentSize() > 0) {
|
2017-03-15 06:23:21 +01:00
|
|
|
if (use_direct_io()) {
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2016-12-22 21:51:29 +01:00
|
|
|
s = WriteDirect();
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2015-09-11 18:57:02 +02:00
|
|
|
} else {
|
2016-12-22 21:51:29 +01:00
|
|
|
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
|
2015-09-11 18:57:02 +02:00
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
s = writable_file_->Flush();
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Envs in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
|
|
|
|
// sync OS cache to disk for every bytes_per_sync_
|
|
|
|
// TODO: give log file and sst file different options (log
|
|
|
|
// files could be potentially cached in OS for their whole
|
|
|
|
// life time, thus we might not want to flush at all).
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
|
|
|
|
// We try to avoid sync to the last 1MB of data. For two reasons:
|
|
|
|
// (1) avoid rewrite the same page that is modified later.
|
|
|
|
// (2) for older version of OS, write can block while writing out
|
|
|
|
// the page.
|
|
|
|
// Xfs does neighbor page flushing outside of the specified ranges. We
|
|
|
|
// need to make sure sync range is far from the write offset.
|
2017-03-15 06:23:21 +01:00
|
|
|
if (!use_direct_io() && bytes_per_sync_) {
|
2015-09-11 18:57:02 +02:00
|
|
|
const uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced.
|
|
|
|
const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
if (filesize_ > kBytesNotSyncRange) {
|
|
|
|
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
|
|
|
|
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
|
|
|
|
assert(offset_sync_to >= last_sync_size_);
|
|
|
|
if (offset_sync_to > 0 &&
|
|
|
|
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
|
2015-09-11 18:57:02 +02:00
|
|
|
s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
|
RangeSync not to sync last 1MB of the file
Summary:
From other ones' investigation:
"sync_file_range() behavior highly depends on kernel version and filesystem.
xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."
Test Plan: make all check
Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony
Reviewed By: anthony
Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin
Differential Revision: https://reviews.facebook.net/D15807
2015-07-20 23:46:15 +02:00
|
|
|
last_sync_size_ = offset_sync_to;
|
|
|
|
}
|
|
|
|
}
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Flushes any buffered data via Flush() and then, unless direct I/O is in
// use or nothing is pending, syncs the file through SyncInternal().
//
// @param use_fsync forwarded to SyncInternal() to select Fsync() over Sync().
// @return the first non-OK status from Flush() or SyncInternal(); otherwise
//         Status::OK(), in which case pending_sync_ has been cleared.
Status WritableFileWriter::Sync(bool use_fsync) {
  Status flush_status = Flush();
  if (!flush_status.ok()) {
    return flush_status;
  }
  TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds);

  const bool needs_sync = !use_direct_io() && pending_sync_;
  if (needs_sync) {
    Status sync_status = SyncInternal(use_fsync);
    if (!sync_status.ok()) {
      // Leave pending_sync_ untouched so a later Sync() tries again.
      return sync_status;
    }
  }
  TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds);

  pending_sync_ = false;
  return Status::OK();
}
|
|
|
|
|
[wal changes 3/3] method in DB to sync WAL without blocking writers
Summary:
Subj. We really need this feature.
Previous diff D40899 has most of the changes to make this possible, this diff just adds the method.
Test Plan: `make check`, the new test fails without this diff; ran with ASAN, TSAN and valgrind.
Reviewers: igor, rven, IslamAbdelRahman, anthony, kradhakrishnan, tnovak, yhchiang, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, maykov, hermanlee4, yoshinorim, tnovak, dhruba
Differential Revision: https://reviews.facebook.net/D40905
2015-08-05 15:06:39 +02:00
|
|
|
// Syncs the underlying file without flushing this writer's buffer first.
// Only allowed when the wrapped file reports IsSyncThreadSafe().
//
// @param use_fsync forwarded to SyncInternal().
// @return Status::NotSupported() if the file's sync is not thread safe,
//         otherwise whatever SyncInternal() returned.
Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
  if (writable_file_->IsSyncThreadSafe()) {
    TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
    Status sync_status = SyncInternal(use_fsync);
    TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
    return sync_status;
  }
  return Status::NotSupported(
      "Can't WritableFileWriter::SyncWithoutFlush() because "
      "WritableFile::IsSyncThreadSafe() is false");
}
|
|
|
|
|
|
|
|
// Issues the actual sync on the wrapped file; the time spent is accounted to
// the fsync_nanos I/O stat.
//
// @param use_fsync when true calls WritableFile::Fsync(), otherwise
//        WritableFile::Sync().
// @return the status reported by the wrapped file.
Status WritableFileWriter::SyncInternal(bool use_fsync) {
  IOSTATS_TIMER_GUARD(fsync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
  if (use_fsync) {
    return writable_file_->Fsync();
  }
  return writable_file_->Sync();
}
|
|
|
|
|
2015-11-11 02:03:42 +01:00
|
|
|
// Forwards a range sync request to the wrapped file; the time spent is
// accounted to the range_sync_nanos I/O stat.
//
// @param offset start of the range, passed through unchanged.
// @param nbytes length of the range, passed through unchanged.
// @return the status reported by WritableFile::RangeSync().
Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) {
  IOSTATS_TIMER_GUARD(range_sync_nanos);
  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
  Status range_status = writable_file_->RangeSync(offset, nbytes);
  return range_status;
}
|
|
|
|
|
2015-09-11 18:57:02 +02:00
|
|
|
// This method writes to disk the specified data and makes use of the rate
|
|
|
|
// limiter if available
|
|
|
|
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
|
|
|
Status s;
|
2017-03-15 06:23:21 +01:00
|
|
|
assert(!use_direct_io());
|
2015-09-11 18:57:02 +02:00
|
|
|
const char* src = data;
|
|
|
|
size_t left = size;
|
|
|
|
|
|
|
|
while (left > 0) {
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t allowed;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
allowed = rate_limiter_->RequestToken(
|
|
|
|
left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_,
|
|
|
|
RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
allowed = left;
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
|
|
|
s = writable_file_->Append(Slice(src, allowed));
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, allowed);
|
2015-10-14 23:08:50 +02:00
|
|
|
TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds);
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
left -= allowed;
|
|
|
|
src += allowed;
|
|
|
|
}
|
2015-09-12 02:36:48 +02:00
|
|
|
buf_.Size(0);
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// This flushes the accumulated data in the buffer. We pad data with zeros if
|
|
|
|
// necessary to the whole page.
|
|
|
|
// However, during automatic flushes padding would not be necessary.
|
|
|
|
// We always use RateLimiter if available. We move (Refit) any buffer bytes
|
|
|
|
// that are left over the
|
|
|
|
// whole number of pages to be written again on the next flush because we can
|
|
|
|
// only write on aligned
|
|
|
|
// offsets.
|
2017-02-16 19:25:06 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2016-12-22 21:51:29 +01:00
|
|
|
Status WritableFileWriter::WriteDirect() {
|
2017-03-15 06:23:21 +01:00
|
|
|
assert(use_direct_io());
|
2015-09-11 18:57:02 +02:00
|
|
|
Status s;
|
2015-09-12 02:36:48 +02:00
|
|
|
const size_t alignment = buf_.Alignment();
|
2015-09-11 18:57:02 +02:00
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
|
|
|
|
// Calculate whole page final file advance if all writes succeed
|
|
|
|
size_t file_advance =
|
2015-09-12 02:36:48 +02:00
|
|
|
TruncateToPageBoundary(alignment, buf_.CurrentSize());
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Calculate the leftover tail, we write it here padded with zeros BUT we
|
|
|
|
// will write
|
|
|
|
// it again in the future either on Close() OR when the current whole page
|
|
|
|
// fills out
|
2015-09-12 02:36:48 +02:00
|
|
|
size_t leftover_tail = buf_.CurrentSize() - file_advance;
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
// Round up and pad
|
|
|
|
buf_.PadToAlignmentWith(0);
|
|
|
|
|
2015-09-12 02:36:48 +02:00
|
|
|
const char* src = buf_.BufferStart();
|
2015-09-11 18:57:02 +02:00
|
|
|
uint64_t write_offset = next_write_offset_;
|
2015-09-12 02:36:48 +02:00
|
|
|
size_t left = buf_.CurrentSize();
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
while (left > 0) {
|
|
|
|
// Check how much is allowed
|
2017-06-13 23:51:22 +02:00
|
|
|
size_t size;
|
|
|
|
if (rate_limiter_ != nullptr) {
|
|
|
|
size = rate_limiter_->RequestToken(left, buf_.Alignment(),
|
|
|
|
writable_file_->GetIOPriority(),
|
|
|
|
stats_, RateLimiter::OpType::kWrite);
|
|
|
|
} else {
|
|
|
|
size = left;
|
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
{
|
|
|
|
IOSTATS_TIMER_GUARD(write_nanos);
|
|
|
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
2016-12-22 21:51:29 +01:00
|
|
|
// direct writes must be positional
|
2015-09-12 02:36:48 +02:00
|
|
|
s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
|
2015-09-11 18:57:02 +02:00
|
|
|
if (!s.ok()) {
|
2015-09-12 02:36:48 +02:00
|
|
|
buf_.Size(file_advance + leftover_tail);
|
2015-09-11 18:57:02 +02:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, size);
|
|
|
|
left -= size;
|
|
|
|
src += size;
|
|
|
|
write_offset += size;
|
|
|
|
assert((next_write_offset_ % alignment) == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// Move the tail to the beginning of the buffer
|
|
|
|
// This never happens during normal Append but rather during
|
|
|
|
// explicit call to Flush()/Sync() or Close()
|
|
|
|
buf_.RefitTail(file_advance, leftover_tail);
|
|
|
|
// This is where we start writing next time which may or not be
|
|
|
|
// the actual file size on disk. They match if the buffer size
|
|
|
|
// is a multiple of whole pages otherwise filesize_ is leftover_tail
|
|
|
|
// behind
|
|
|
|
next_write_offset_ += file_advance;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
2017-02-16 19:25:06 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
2015-09-11 18:57:02 +02:00
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
namespace {
|
|
|
|
class ReadaheadRandomAccessFile : public RandomAccessFile {
|
|
|
|
public:
|
2015-10-29 23:52:32 +01:00
|
|
|
ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file,
|
|
|
|
size_t readahead_size)
|
|
|
|
: file_(std::move(file)),
|
2017-02-18 20:54:49 +01:00
|
|
|
alignment_(file_->GetRequiredBufferAlignment()),
|
|
|
|
readahead_size_(Roundup(readahead_size, alignment_)),
|
2015-10-29 23:52:32 +01:00
|
|
|
buffer_(),
|
|
|
|
buffer_offset_(0),
|
|
|
|
buffer_len_(0) {
|
2017-04-27 21:19:55 +02:00
|
|
|
|
|
|
|
buffer_.Alignment(alignment_);
|
|
|
|
buffer_.AllocateNewBuffer(readahead_size_);
|
2015-10-29 23:52:32 +01:00
|
|
|
}
|
2015-09-11 18:57:02 +02:00
|
|
|
|
|
|
|
ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
|
|
|
|
|
|
|
|
ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete;
|
2015-08-27 00:25:59 +02:00
|
|
|
|
|
|
|
virtual Status Read(uint64_t offset, size_t n, Slice* result,
|
|
|
|
char* scratch) const override {
|
|
|
|
|
2017-04-27 21:19:55 +02:00
|
|
|
if (n + alignment_ >= readahead_size_) {
|
2015-09-11 18:57:02 +02:00
|
|
|
return file_->Read(offset, n, result, scratch);
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
std::unique_lock<std::mutex> lk(lock_);
|
|
|
|
|
2017-02-18 20:54:49 +01:00
|
|
|
size_t cached_len = 0;
|
|
|
|
// Check if there is a cache hit, means that [offset, offset + n) is either
|
2017-04-27 21:19:55 +02:00
|
|
|
// completely or partially in the buffer
|
2017-02-18 20:54:49 +01:00
|
|
|
// If it's completely cached, including end of file case when offset + n is
|
|
|
|
// greater than EOF, return
|
2017-04-15 03:43:32 +02:00
|
|
|
if (TryReadFromCache(offset, n, &cached_len, scratch) &&
|
2017-02-18 20:54:49 +01:00
|
|
|
(cached_len == n ||
|
|
|
|
// End of file
|
2017-04-12 01:59:50 +02:00
|
|
|
buffer_len_ < readahead_size_)) {
|
2017-02-18 20:54:49 +01:00
|
|
|
*result = Slice(scratch, cached_len);
|
|
|
|
return Status::OK();
|
2015-08-27 00:25:59 +02:00
|
|
|
}
|
2017-02-18 20:54:49 +01:00
|
|
|
size_t advanced_offset = offset + cached_len;
|
|
|
|
// In the case of cache hit advanced_offset is already aligned, means that
|
|
|
|
// chunk_offset equals to advanced_offset
|
|
|
|
size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset);
|
2015-08-27 00:25:59 +02:00
|
|
|
Slice readahead_result;
|
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
Status s = ReadIntoBuffer(chunk_offset, readahead_size_);
|
|
|
|
if (s.ok()) {
|
|
|
|
// In the case of cache miss, i.e. when cached_len equals 0, an offset can
|
|
|
|
// exceed the file end position, so the following check is required
|
|
|
|
if (advanced_offset < chunk_offset + buffer_len_) {
|
|
|
|
// In the case of cache miss, the first chunk_padding bytes in buffer_
|
|
|
|
// are
|
|
|
|
// stored for alignment only and must be skipped
|
|
|
|
size_t chunk_padding = advanced_offset - chunk_offset;
|
|
|
|
auto remaining_len =
|
|
|
|
std::min(buffer_len_ - chunk_padding, n - cached_len);
|
|
|
|
memcpy(scratch + cached_len, buffer_.BufferStart() + chunk_padding,
|
|
|
|
remaining_len);
|
|
|
|
*result = Slice(scratch, cached_len + remaining_len);
|
|
|
|
} else {
|
|
|
|
*result = Slice(scratch, cached_len);
|
|
|
|
}
|
2015-08-27 00:25:59 +02:00
|
|
|
}
|
2017-04-15 03:43:32 +02:00
|
|
|
return s;
|
|
|
|
}
|
2015-08-27 00:25:59 +02:00
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
virtual Status Prefetch(uint64_t offset, size_t n) override {
|
fix ReadaheadRandomAccessFile/iterator prefetch bug
Summary:
`ReadaheadRandomAccessFile` is used by iterators for file reads in several cases, like in compaction when `compaction_readahead_size > 0` or `use_direct_io_for_flush_and_compaction == true`, or in user iterator when `ReadOptions::readahead_size > 0`. `ReadaheadRandomAccessFile` maintains an internal buffer for readahead data. It assumes that, if the buffer's length is less than `ReadaheadRandomAccessFile::readahead_size_`, which is fixed in the constructor, then EOF has been reached so it doesn't try reading further.
Recently, d938226af405681c592f25310f41c0c933bcdb19 started calling `RandomAccessFile::Prefetch` with various lengths: 8KB, 16KB, etc. When the `RandomAccessFile` is a `ReadaheadRandomAccessFile`, it triggers the above condition and incorrectly determines EOF. If a block is partially in the readahead buffer and EOF is incorrectly decided, the result is a truncated data block.
The problem is reproducible:
```
TEST_TMPDIR=/data/compaction_bench ./db_bench -benchmarks=fillrandom -write_buffer_size=1048576 -target_file_size_base=1048576 -block_size=18384 -use_direct_io_for_flush_and_compaction=true
...
put error: Corruption: truncated block read from /data/compaction_bench/dbbench/000014.sst offset 20245, expected 10143 bytes, got 8427
```
Closes https://github.com/facebook/rocksdb/pull/3454
Differential Revision: D6869405
Pulled By: ajkr
fbshipit-source-id: 87001c299e7600a37c0dcccbd0368e0954c929cf
2018-02-01 18:36:01 +01:00
|
|
|
if (n < readahead_size_) {
|
|
|
|
// Don't allow smaller prefetches than the configured `readahead_size_`.
|
|
|
|
// `Read()` assumes a smaller prefetch buffer indicates EOF was reached.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2017-04-15 03:43:32 +02:00
|
|
|
size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset);
|
|
|
|
if (prefetch_offset == buffer_offset_) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2017-05-05 20:58:10 +02:00
|
|
|
return ReadIntoBuffer(prefetch_offset,
|
|
|
|
Roundup(offset + n, alignment_) - prefetch_offset);
|
2015-08-27 00:25:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
|
|
|
|
return file_->GetUniqueId(id, max_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
|
|
|
|
|
|
|
|
virtual Status InvalidateCache(size_t offset, size_t length) override {
|
|
|
|
return file_->InvalidateCache(offset, length);
|
|
|
|
}
|
|
|
|
|
2017-02-22 23:48:09 +01:00
|
|
|
virtual bool use_direct_io() const override {
|
|
|
|
return file_->use_direct_io();
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
private:
|
2017-04-15 03:43:32 +02:00
|
|
|
bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len,
|
2017-02-18 20:54:49 +01:00
|
|
|
char* scratch) const {
|
|
|
|
if (offset < buffer_offset_ || offset >= buffer_offset_ + buffer_len_) {
|
|
|
|
*cached_len = 0;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
uint64_t offset_in_buffer = offset - buffer_offset_;
|
|
|
|
*cached_len =
|
|
|
|
std::min(buffer_len_ - static_cast<size_t>(offset_in_buffer), n);
|
|
|
|
memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
Status ReadIntoBuffer(uint64_t offset, size_t n) const {
|
|
|
|
if (n > buffer_.Capacity()) {
|
|
|
|
n = buffer_.Capacity();
|
|
|
|
}
|
2017-05-18 03:46:03 +02:00
|
|
|
assert(IsFileSectorAligned(offset, alignment_));
|
|
|
|
assert(IsFileSectorAligned(n, alignment_));
|
2017-04-15 03:43:32 +02:00
|
|
|
Slice result;
|
|
|
|
Status s = file_->Read(offset, n, &result, buffer_.BufferStart());
|
|
|
|
if (s.ok()) {
|
|
|
|
buffer_offset_ = offset;
|
|
|
|
buffer_len_ = result.size();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> file_;
|
2017-02-18 20:54:49 +01:00
|
|
|
const size_t alignment_;
|
2015-09-11 18:57:02 +02:00
|
|
|
size_t readahead_size_;
|
2015-08-27 00:25:59 +02:00
|
|
|
|
2017-04-15 03:43:32 +02:00
|
|
|
mutable std::mutex lock_;
|
2017-02-18 20:54:49 +01:00
|
|
|
mutable AlignedBuffer buffer_;
|
2017-04-15 03:43:32 +02:00
|
|
|
mutable uint64_t buffer_offset_;
|
|
|
|
mutable size_t buffer_len_;
|
2015-08-27 00:25:59 +02:00
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
2017-08-11 20:59:13 +02:00
|
|
|
// Reads the aligned range covering [offset, offset + n) from `reader` into a
// freshly allocated buffer. The start is rounded down and the end rounded up
// to the file's required buffer alignment.
//
// @param reader file reader to fetch from; its file supplies the alignment.
// @param offset first byte the caller wants buffered.
// @param n      number of bytes the caller wants buffered.
// @return the status of the read; the buffered offset and length are only
//         updated when the read succeeds.
Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
                                    uint64_t offset, size_t n) {
  size_t alignment = reader->file()->GetRequiredBufferAlignment();

  uint64_t aligned_begin = Rounddown(offset, alignment);
  uint64_t aligned_end = Roundup(offset + n, alignment);
  uint64_t aligned_len = aligned_end - aligned_begin;
  assert(aligned_len >= alignment);
  assert(aligned_len % alignment == 0);

  buffer_.Alignment(alignment);
  buffer_.AllocateNewBuffer(aligned_len);

  Slice result;
  Status read_status = reader->Read(aligned_begin, aligned_len, &result,
                                    buffer_.BufferStart());
  if (!read_status.ok()) {
    return read_status;
  }
  buffer_offset_ = aligned_begin;
  buffer_len_ = result.size();
  return read_status;
}
|
|
|
|
|
|
|
|
bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
|
|
|
|
Slice* result) const {
|
|
|
|
if (offset < buffer_offset_ || offset + n > buffer_offset_ + buffer_len_) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
uint64_t offset_in_buffer = offset - buffer_offset_;
|
|
|
|
*result = Slice(buffer_.BufferStart() + offset_in_buffer, n);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-08-27 00:25:59 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
|
2015-09-11 18:57:02 +02:00
|
|
|
std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) {
|
2015-09-12 02:36:48 +02:00
|
|
|
std::unique_ptr<RandomAccessFile> result(
|
|
|
|
new ReadaheadRandomAccessFile(std::move(file), readahead_size));
|
|
|
|
return result;
|
2015-08-27 00:25:59 +02:00
|
|
|
}
|
|
|
|
|
2015-10-16 23:33:47 +02:00
|
|
|
// Creates a writable file through `env`, followed by a random-kill test
// hook.
//
// @param env     environment used to create the file.
// @param fname   name of the file to create.
// @param result  receives the created file, as filled in by the Env.
// @param options options forwarded to Env::NewWritableFile().
// @return the status reported by Env::NewWritableFile().
Status NewWritableFile(Env* env, const std::string& fname,
                       unique_ptr<WritableFile>* result,
                       const EnvOptions& options) {
  Status create_status = env->NewWritableFile(fname, result, options);
  TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2);
  return create_status;
}
|
|
|
|
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-18 01:16:11 +02:00
|
|
|
} // namespace rocksdb
|