Optimize fallocation
Summary: Based on my recent findings (posted in our internal group), if we use fallocate without the KEEP_SIZE flag, we get superior fdatasync() performance in append-only workloads. This diff provides an option for the user to not use the KEEP_SIZE flag, thus improving their sync performance by up to 2x-3x. At one point we also just called posix_fallocate instead of fallocate, which isn't very fast: http://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html (tl;dr it manually writes out zero bytes to allocate storage). This diff also fixes that, by first calling fallocate and then falling back to posix_fallocate if fallocate is not supported. Test Plan: make check Reviewers: dhruba, sdong, haobo, ljin Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16761
This commit is contained in:
parent
ae25742af9
commit
f26cb0f093
|
@ -456,8 +456,8 @@ Status DBImpl::NewDB() {
|
||||||
|
|
||||||
const std::string manifest = DescriptorFileName(dbname_, 1);
|
const std::string manifest = DescriptorFileName(dbname_, 1);
|
||||||
unique_ptr<WritableFile> file;
|
unique_ptr<WritableFile> file;
|
||||||
Status s = env_->NewWritableFile(manifest, &file,
|
Status s = env_->NewWritableFile(
|
||||||
storage_options_.AdaptForLogWrite());
|
manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
@ -3626,7 +3626,8 @@ Status DBImpl::MakeRoomForWrite(bool force,
|
||||||
{
|
{
|
||||||
DelayLoggingAndReset();
|
DelayLoggingAndReset();
|
||||||
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
|
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
|
||||||
&lfile, storage_options_.AdaptForLogWrite());
|
&lfile,
|
||||||
|
env_->OptimizeForLogWrite(storage_options_));
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
// Our final size should be less than write_buffer_size
|
// Our final size should be less than write_buffer_size
|
||||||
// (compression, etc) but err on the side of caution.
|
// (compression, etc) but err on the side of caution.
|
||||||
|
@ -3912,7 +3913,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
|
||||||
EnvOptions soptions(options);
|
EnvOptions soptions(options);
|
||||||
s = impl->options_.env->NewWritableFile(
|
s = impl->options_.env->NewWritableFile(
|
||||||
LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
|
LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
|
||||||
soptions.AdaptForLogWrite());
|
impl->options_.env->OptimizeForLogWrite(soptions));
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
|
lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
|
||||||
VersionEdit edit;
|
VersionEdit edit;
|
||||||
|
|
|
@ -306,8 +306,8 @@ class Repairer {
|
||||||
Status WriteDescriptor() {
|
Status WriteDescriptor() {
|
||||||
std::string tmp = TempFileName(dbname_, 1);
|
std::string tmp = TempFileName(dbname_, 1);
|
||||||
unique_ptr<WritableFile> file;
|
unique_ptr<WritableFile> file;
|
||||||
Status status =
|
Status status = env_->NewWritableFile(
|
||||||
env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite());
|
tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
|
||||||
if (!status.ok()) {
|
if (!status.ok()) {
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1564,7 +1564,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
|
||||||
unique_ptr<WritableFile> descriptor_file;
|
unique_ptr<WritableFile> descriptor_file;
|
||||||
s = env_->NewWritableFile(
|
s = env_->NewWritableFile(
|
||||||
DescriptorFileName(dbname_, pending_manifest_file_number_),
|
DescriptorFileName(dbname_, pending_manifest_file_number_),
|
||||||
&descriptor_file, storage_options_.AdaptForLogWrite());
|
&descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
|
descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
|
||||||
s = WriteSnapshot(descriptor_log_.get());
|
s = WriteSnapshot(descriptor_log_.get());
|
||||||
|
|
|
@ -49,8 +49,6 @@ struct EnvOptions {
|
||||||
// construct from Options
|
// construct from Options
|
||||||
explicit EnvOptions(const Options& options);
|
explicit EnvOptions(const Options& options);
|
||||||
|
|
||||||
EnvOptions AdaptForLogWrite() const;
|
|
||||||
|
|
||||||
// If true, then allow caching of data in environment buffers
|
// If true, then allow caching of data in environment buffers
|
||||||
bool use_os_buffer = true;
|
bool use_os_buffer = true;
|
||||||
|
|
||||||
|
@ -61,13 +59,21 @@ struct EnvOptions {
|
||||||
bool use_mmap_writes = true;
|
bool use_mmap_writes = true;
|
||||||
|
|
||||||
// If true, set the FD_CLOEXEC on open fd.
|
// If true, set the FD_CLOEXEC on open fd.
|
||||||
bool set_fd_cloexec= true;
|
bool set_fd_cloexec = true;
|
||||||
|
|
||||||
// Allows OS to incrementally sync files to disk while they are being
|
// Allows OS to incrementally sync files to disk while they are being
|
||||||
// written, in the background. Issue one request for every bytes_per_sync
|
// written, in the background. Issue one request for every bytes_per_sync
|
||||||
// written. 0 turns it off.
|
// written. 0 turns it off.
|
||||||
// Default: 0
|
// Default: 0
|
||||||
uint64_t bytes_per_sync = 0;
|
uint64_t bytes_per_sync = 0;
|
||||||
|
|
||||||
|
// If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
|
||||||
|
// means that file size won't change as part of preallocation.
|
||||||
|
// If false, preallocation will also change the file size. This option will
|
||||||
|
// improve the performance in workloads where you sync the data on every
|
||||||
|
// write. By default, we set it to true for MANIFEST writes and false for
|
||||||
|
// WAL writes
|
||||||
|
bool fallocate_with_keep_size = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Env {
|
class Env {
|
||||||
|
@ -260,6 +266,16 @@ class Env {
|
||||||
// Generates a unique id that can be used to identify a db
|
// Generates a unique id that can be used to identify a db
|
||||||
virtual std::string GenerateUniqueId();
|
virtual std::string GenerateUniqueId();
|
||||||
|
|
||||||
|
// OptimizeForLogWrite will create a new EnvOptions object that is a copy of
|
||||||
|
// the EnvOptions in the parameters, but is optimized for writing log files.
|
||||||
|
// Default implementation returns the copy of the same object.
|
||||||
|
virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
|
||||||
|
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
|
||||||
|
// of the EnvOptions in the parameters, but is optimized for writing manifest
|
||||||
|
// files. Default implementation returns the copy of the same object.
|
||||||
|
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
|
||||||
|
const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// No copying allowed
|
// No copying allowed
|
||||||
Env(const Env&);
|
Env(const Env&);
|
||||||
|
|
10
util/env.cc
10
util/env.cc
|
@ -241,10 +241,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EnvOptions EnvOptions::AdaptForLogWrite() const {
|
EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
|
||||||
EnvOptions adapted = *this;
|
return env_options;
|
||||||
adapted.use_mmap_writes = false;
|
}
|
||||||
return adapted;
|
|
||||||
|
EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
|
||||||
|
return env_options;
|
||||||
}
|
}
|
||||||
|
|
||||||
EnvOptions::EnvOptions(const Options& options) {
|
EnvOptions::EnvOptions(const Options& options) {
|
||||||
|
|
|
@ -354,9 +354,9 @@ class PosixMmapFile : public WritableFile {
|
||||||
char* dst_; // Where to write next (in range [base_,limit_])
|
char* dst_; // Where to write next (in range [base_,limit_])
|
||||||
char* last_sync_; // Where have we synced up to
|
char* last_sync_; // Where have we synced up to
|
||||||
uint64_t file_offset_; // Offset of base_ in file
|
uint64_t file_offset_; // Offset of base_ in file
|
||||||
|
|
||||||
// Have we done an munmap of unsynced data?
|
// Have we done an munmap of unsynced data?
|
||||||
bool pending_sync_;
|
bool pending_sync_;
|
||||||
|
bool fallocate_with_keep_size_;
|
||||||
|
|
||||||
// Roundup x to a multiple of y
|
// Roundup x to a multiple of y
|
||||||
static size_t Roundup(size_t x, size_t y) {
|
static size_t Roundup(size_t x, size_t y) {
|
||||||
|
@ -399,7 +399,12 @@ class PosixMmapFile : public WritableFile {
|
||||||
assert(base_ == nullptr);
|
assert(base_ == nullptr);
|
||||||
|
|
||||||
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
||||||
int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
|
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
|
||||||
|
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
|
||||||
|
if (alloc_status != 0) {
|
||||||
|
// fallback to posix_fallocate
|
||||||
|
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
|
||||||
|
}
|
||||||
if (alloc_status != 0) {
|
if (alloc_status != 0) {
|
||||||
return Status::IOError("Error allocating space to file : " + filename_ +
|
return Status::IOError("Error allocating space to file : " + filename_ +
|
||||||
"Error : " + strerror(alloc_status));
|
"Error : " + strerror(alloc_status));
|
||||||
|
@ -436,7 +441,8 @@ class PosixMmapFile : public WritableFile {
|
||||||
dst_(nullptr),
|
dst_(nullptr),
|
||||||
last_sync_(nullptr),
|
last_sync_(nullptr),
|
||||||
file_offset_(0),
|
file_offset_(0),
|
||||||
pending_sync_(false) {
|
pending_sync_(false),
|
||||||
|
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
|
||||||
assert((page_size & (page_size - 1)) == 0);
|
assert((page_size & (page_size - 1)) == 0);
|
||||||
assert(options.use_mmap_writes);
|
assert(options.use_mmap_writes);
|
||||||
}
|
}
|
||||||
|
@ -584,7 +590,9 @@ class PosixMmapFile : public WritableFile {
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
virtual Status Allocate(off_t offset, off_t len) {
|
virtual Status Allocate(off_t offset, off_t len) {
|
||||||
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
||||||
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
|
int alloc_status = fallocate(
|
||||||
|
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
|
||||||
|
if (alloc_status == 0) {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
} else {
|
} else {
|
||||||
return IOError(filename_, errno);
|
return IOError(filename_, errno);
|
||||||
|
@ -606,20 +614,22 @@ class PosixWritableFile : public WritableFile {
|
||||||
bool pending_fsync_;
|
bool pending_fsync_;
|
||||||
uint64_t last_sync_size_;
|
uint64_t last_sync_size_;
|
||||||
uint64_t bytes_per_sync_;
|
uint64_t bytes_per_sync_;
|
||||||
|
bool fallocate_with_keep_size_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PosixWritableFile(const std::string& fname, int fd, size_t capacity,
|
PosixWritableFile(const std::string& fname, int fd, size_t capacity,
|
||||||
const EnvOptions& options) :
|
const EnvOptions& options)
|
||||||
filename_(fname),
|
: filename_(fname),
|
||||||
fd_(fd),
|
fd_(fd),
|
||||||
cursize_(0),
|
cursize_(0),
|
||||||
capacity_(capacity),
|
capacity_(capacity),
|
||||||
buf_(new char[capacity]),
|
buf_(new char[capacity]),
|
||||||
filesize_(0),
|
filesize_(0),
|
||||||
pending_sync_(false),
|
pending_sync_(false),
|
||||||
pending_fsync_(false),
|
pending_fsync_(false),
|
||||||
last_sync_size_(0),
|
last_sync_size_(0),
|
||||||
bytes_per_sync_(options.bytes_per_sync) {
|
bytes_per_sync_(options.bytes_per_sync),
|
||||||
|
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
|
||||||
assert(!options.use_mmap_writes);
|
assert(!options.use_mmap_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -771,7 +781,9 @@ class PosixWritableFile : public WritableFile {
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
virtual Status Allocate(off_t offset, off_t len) {
|
virtual Status Allocate(off_t offset, off_t len) {
|
||||||
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
||||||
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
|
int alloc_status = fallocate(
|
||||||
|
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
|
||||||
|
if (alloc_status == 0) {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
} else {
|
} else {
|
||||||
return IOError(filename_, errno);
|
return IOError(filename_, errno);
|
||||||
|
@ -797,14 +809,15 @@ class PosixRandomRWFile : public RandomRWFile {
|
||||||
int fd_;
|
int fd_;
|
||||||
bool pending_sync_;
|
bool pending_sync_;
|
||||||
bool pending_fsync_;
|
bool pending_fsync_;
|
||||||
|
bool fallocate_with_keep_size_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PosixRandomRWFile(const std::string& fname, int fd,
|
PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
|
||||||
const EnvOptions& options) :
|
: filename_(fname),
|
||||||
filename_(fname),
|
fd_(fd),
|
||||||
fd_(fd),
|
pending_sync_(false),
|
||||||
pending_sync_(false),
|
pending_fsync_(false),
|
||||||
pending_fsync_(false) {
|
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
|
||||||
assert(!options.use_mmap_writes && !options.use_mmap_reads);
|
assert(!options.use_mmap_writes && !options.use_mmap_reads);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -874,7 +887,10 @@ class PosixRandomRWFile : public RandomRWFile {
|
||||||
|
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
virtual Status Allocate(off_t offset, off_t len) {
|
virtual Status Allocate(off_t offset, off_t len) {
|
||||||
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) {
|
TEST_KILL_RANDOM(rocksdb_kill_odds);
|
||||||
|
int alloc_status = fallocate(
|
||||||
|
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
|
||||||
|
if (alloc_status == 0) {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
} else {
|
} else {
|
||||||
return IOError(filename_, errno);
|
return IOError(filename_, errno);
|
||||||
|
@ -1332,6 +1348,20 @@ class PosixEnv : public Env {
|
||||||
return dummy;
|
return dummy;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns a copy of env_options tuned for WAL (log) files.
//  - mmap writes are turned off.
//  - preallocation is done without FALLOC_FL_KEEP_SIZE, so preallocating
//    also extends the file size. EnvOptions documents this setting as the
//    one that improves performance when data is synced on every write, and
//    as the intended default for WAL writes.
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
  EnvOptions optimized = env_options;
  optimized.use_mmap_writes = false;
  // Must be false here; true is the MANIFEST setting and would make this
  // function indistinguishable from OptimizeForManifestWrite.
  optimized.fallocate_with_keep_size = false;
  return optimized;
}
|
||||||
|
|
||||||
|
// Builds the EnvOptions used for MANIFEST files from a copy of env_options:
// mmap writes are switched off and preallocation keeps using
// FALLOC_FL_KEEP_SIZE, i.e. preallocating does not change the file size.
EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
  EnvOptions manifest_options(env_options);
  manifest_options.fallocate_with_keep_size = true;
  manifest_options.use_mmap_writes = false;
  return manifest_options;
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool checkedDiskForMmap_;
|
bool checkedDiskForMmap_;
|
||||||
bool forceMmapOff; // do we override Env options?
|
bool forceMmapOff; // do we override Env options?
|
||||||
|
|
Loading…
Reference in New Issue
Block a user