RangeSync not to sync last 1MB of the file
Summary: From other ones' investigation: "sync_file_range() behavior highly depends on kernel version and filesystem. xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)." Test Plan: make all check Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony Reviewed By: anthony Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin Differential Revision: https://reviews.facebook.net/D15807
This commit is contained in:
parent
7219088cda
commit
3dbf4ba220
@ -306,6 +306,7 @@ set(TESTS
|
|||||||
util/env_test.cc
|
util/env_test.cc
|
||||||
util/event_logger_test.cc
|
util/event_logger_test.cc
|
||||||
util/filelock_test.cc
|
util/filelock_test.cc
|
||||||
|
util/file_reader_writer_test.cc
|
||||||
util/histogram_test.cc
|
util/histogram_test.cc
|
||||||
util/manual_compaction_test.cc
|
util/manual_compaction_test.cc
|
||||||
util/memenv_test.cc
|
util/memenv_test.cc
|
||||||
|
4
Makefile
4
Makefile
@ -248,6 +248,7 @@ TESTS = \
|
|||||||
fault_injection_test \
|
fault_injection_test \
|
||||||
filelock_test \
|
filelock_test \
|
||||||
filename_test \
|
filename_test \
|
||||||
|
file_reader_writer_test \
|
||||||
block_based_filter_block_test \
|
block_based_filter_block_test \
|
||||||
full_filter_block_test \
|
full_filter_block_test \
|
||||||
histogram_test \
|
histogram_test \
|
||||||
@ -774,6 +775,9 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
|||||||
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||||
$(AM_LINK)
|
$(AM_LINK)
|
||||||
|
|
||||||
|
file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||||
|
$(AM_LINK)
|
||||||
|
|
||||||
block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||||
$(AM_LINK)
|
$(AM_LINK)
|
||||||
|
|
||||||
|
@ -1014,7 +1014,9 @@ struct DBOptions {
|
|||||||
void Dump(Logger* log) const;
|
void Dump(Logger* log) const;
|
||||||
|
|
||||||
// Allows OS to incrementally sync files to disk while they are being
|
// Allows OS to incrementally sync files to disk while they are being
|
||||||
// written, asynchronously, in the background.
|
// written, asynchronously, in the background. This operation can be used
|
||||||
|
// to smooth out write I/Os over time. Users shouldn't reply on it for
|
||||||
|
// persistency guarantee.
|
||||||
// Issue one request for every bytes_per_sync written. 0 turns it off.
|
// Issue one request for every bytes_per_sync written. 0 turns it off.
|
||||||
// Default: 0
|
// Default: 0
|
||||||
//
|
//
|
||||||
|
@ -122,9 +122,26 @@ Status WritableFileWriter::Flush() {
|
|||||||
// TODO: give log file and sst file different options (log
|
// TODO: give log file and sst file different options (log
|
||||||
// files could be potentially cached in OS for their whole
|
// files could be potentially cached in OS for their whole
|
||||||
// life time, thus we might not want to flush at all).
|
// life time, thus we might not want to flush at all).
|
||||||
if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) {
|
|
||||||
writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_);
|
// We try to avoid sync to the last 1MB of data. For two reasons:
|
||||||
last_sync_size_ = filesize_;
|
// (1) avoid rewrite the same page that is modified later.
|
||||||
|
// (2) for older version of OS, write can block while writing out
|
||||||
|
// the page.
|
||||||
|
// Xfs does neighbor page flushing outside of the specified ranges. We
|
||||||
|
// need to make sure sync range is far from the write offset.
|
||||||
|
if (bytes_per_sync_) {
|
||||||
|
uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced.
|
||||||
|
uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
|
||||||
|
if (filesize_ > kBytesNotSyncRange) {
|
||||||
|
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
|
||||||
|
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
|
||||||
|
assert(offset_sync_to >= last_sync_size_);
|
||||||
|
if (offset_sync_to > 0 &&
|
||||||
|
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
|
||||||
|
RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
|
||||||
|
last_sync_size_ = offset_sync_to;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
89
util/file_reader_writer_test.cc
Normal file
89
util/file_reader_writer_test.cc
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
//
|
||||||
|
#include <vector>
|
||||||
|
#include "util/file_reader_writer.h"
|
||||||
|
#include "util/random.h"
|
||||||
|
#include "util/testharness.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
|
||||||
|
class WritableFileWriterTest : public testing::Test {};
|
||||||
|
|
||||||
|
const uint32_t kMb = 1 << 20;
|
||||||
|
|
||||||
|
TEST_F(WritableFileWriterTest, RangeSync) {
|
||||||
|
class FakeWF : public WritableFile {
|
||||||
|
public:
|
||||||
|
explicit FakeWF() : size_(0), last_synced_(0) {}
|
||||||
|
~FakeWF() {}
|
||||||
|
|
||||||
|
Status Append(const Slice& data) override {
|
||||||
|
size_ += data.size();
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status Close() override {
|
||||||
|
EXPECT_GE(size_, last_synced_ + kMb);
|
||||||
|
EXPECT_LT(size_, last_synced_ + 2 * kMb);
|
||||||
|
// Make sure random writes generated enough writes.
|
||||||
|
EXPECT_GT(size_, 10 * kMb);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
Status Flush() override { return Status::OK(); }
|
||||||
|
Status Sync() override { return Status::OK(); }
|
||||||
|
Status Fsync() override { return Status::OK(); }
|
||||||
|
void SetIOPriority(Env::IOPriority pri) override {}
|
||||||
|
uint64_t GetFileSize() override { return size_; }
|
||||||
|
void GetPreallocationStatus(size_t* block_size,
|
||||||
|
size_t* last_allocated_block) override {}
|
||||||
|
size_t GetUniqueId(char* id, size_t max_size) const override { return 0; }
|
||||||
|
Status InvalidateCache(size_t offset, size_t length) override {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
Status Allocate(off_t offset, off_t len) override { return Status::OK(); }
|
||||||
|
Status RangeSync(off_t offset, off_t nbytes) override {
|
||||||
|
EXPECT_EQ(offset % 4096, 0u);
|
||||||
|
EXPECT_EQ(nbytes % 4096, 0u);
|
||||||
|
|
||||||
|
EXPECT_EQ(offset, last_synced_);
|
||||||
|
last_synced_ = offset + nbytes;
|
||||||
|
EXPECT_GE(size_, last_synced_ + kMb);
|
||||||
|
if (size_ > 2 * kMb) {
|
||||||
|
EXPECT_LT(size_, last_synced_ + 2 * kMb);
|
||||||
|
}
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t size_;
|
||||||
|
uint64_t last_synced_;
|
||||||
|
};
|
||||||
|
|
||||||
|
EnvOptions env_options;
|
||||||
|
env_options.bytes_per_sync = kMb;
|
||||||
|
unique_ptr<FakeWF> wf(new FakeWF);
|
||||||
|
unique_ptr<WritableFileWriter> writer(
|
||||||
|
new WritableFileWriter(std::move(wf), env_options));
|
||||||
|
Random r(301);
|
||||||
|
std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
int skew_limit = (i < 700) ? 10 : 15;
|
||||||
|
uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
|
||||||
|
writer->Append(Slice(large_buf.get(), num));
|
||||||
|
|
||||||
|
// Flush in a chance of 1/10.
|
||||||
|
if (r.Uniform(10) == 0) {
|
||||||
|
writer->Flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer->Close();
|
||||||
|
}
|
||||||
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
|
return RUN_ALL_TESTS();
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user