Optimize fallocation

Summary:
Based on my recent findings (posted in our internal group), if we use fallocate without KEEP_SIZE flag, we get superior performance of fdatasync() in append-only workloads.

This diff provides an option for user to not use KEEP_SIZE flag, thus optimizing his sync performance by up to 2x-3x.

At one point we also just called posix_fallocate instead of fallocate, which isn't very fast: http://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html (tl;dr it manually writes out zero bytes to allocate storage). This diff also fixes that, by first calling fallocate and then posix_fallocate if fallocate is not supported.

Test Plan: make check

Reviewers: dhruba, sdong, haobo, ljin

Reviewed By: dhruba

CC: leveldb

Differential Revision: https://reviews.facebook.net/D16761
This commit is contained in:
Igor Canadi 2014-03-17 21:52:14 -07:00
parent ae25742af9
commit f26cb0f093
6 changed files with 86 additions and 37 deletions

View File

@ -456,8 +456,8 @@ Status DBImpl::NewDB() {
const std::string manifest = DescriptorFileName(dbname_, 1); const std::string manifest = DescriptorFileName(dbname_, 1);
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
Status s = env_->NewWritableFile(manifest, &file, Status s = env_->NewWritableFile(
storage_options_.AdaptForLogWrite()); manifest, &file, env_->OptimizeForManifestWrite(storage_options_));
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -3626,7 +3626,8 @@ Status DBImpl::MakeRoomForWrite(bool force,
{ {
DelayLoggingAndReset(); DelayLoggingAndReset();
s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
&lfile, storage_options_.AdaptForLogWrite()); &lfile,
env_->OptimizeForLogWrite(storage_options_));
if (s.ok()) { if (s.ok()) {
// Our final size should be less than write_buffer_size // Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution. // (compression, etc) but err on the side of caution.
@ -3912,7 +3913,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
EnvOptions soptions(options); EnvOptions soptions(options);
s = impl->options_.env->NewWritableFile( s = impl->options_.env->NewWritableFile(
LogFileName(impl->options_.wal_dir, new_log_number), &lfile, LogFileName(impl->options_.wal_dir, new_log_number), &lfile,
soptions.AdaptForLogWrite()); impl->options_.env->OptimizeForLogWrite(soptions));
if (s.ok()) { if (s.ok()) {
lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
VersionEdit edit; VersionEdit edit;

View File

@ -306,8 +306,8 @@ class Repairer {
Status WriteDescriptor() { Status WriteDescriptor() {
std::string tmp = TempFileName(dbname_, 1); std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
Status status = Status status = env_->NewWritableFile(
env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite()); tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }

View File

@ -1564,7 +1564,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
unique_ptr<WritableFile> descriptor_file; unique_ptr<WritableFile> descriptor_file;
s = env_->NewWritableFile( s = env_->NewWritableFile(
DescriptorFileName(dbname_, pending_manifest_file_number_), DescriptorFileName(dbname_, pending_manifest_file_number_),
&descriptor_file, storage_options_.AdaptForLogWrite()); &descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
if (s.ok()) { if (s.ok()) {
descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
s = WriteSnapshot(descriptor_log_.get()); s = WriteSnapshot(descriptor_log_.get());

View File

@ -49,8 +49,6 @@ struct EnvOptions {
// construct from Options // construct from Options
explicit EnvOptions(const Options& options); explicit EnvOptions(const Options& options);
EnvOptions AdaptForLogWrite() const;
// If true, then allow caching of data in environment buffers // If true, then allow caching of data in environment buffers
bool use_os_buffer = true; bool use_os_buffer = true;
@ -61,13 +59,21 @@ struct EnvOptions {
bool use_mmap_writes = true; bool use_mmap_writes = true;
// If true, set the FD_CLOEXEC on open fd. // If true, set the FD_CLOEXEC on open fd.
bool set_fd_cloexec= true; bool set_fd_cloexec = true;
// Allows OS to incrementally sync files to disk while they are being // Allows OS to incrementally sync files to disk while they are being
// written, in the background. Issue one request for every bytes_per_sync // written, in the background. Issue one request for every bytes_per_sync
// written. 0 turns it off. // written. 0 turns it off.
// Default: 0 // Default: 0
uint64_t bytes_per_sync = 0; uint64_t bytes_per_sync = 0;
// If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
// means that file size won't change as part of preallocation.
// If false, preallocation will also change the file size. This option will
// improve the performance in workloads where you sync the data on every
// write. By default, we set it to true for MANIFEST writes and false for
// WAL writes
bool fallocate_with_keep_size = true;
}; };
class Env { class Env {
@ -260,6 +266,16 @@ class Env {
// Generates a unique id that can be used to identify a db // Generates a unique id that can be used to identify a db
virtual std::string GenerateUniqueId(); virtual std::string GenerateUniqueId();
// OptimizeForLogWrite will create a new EnvOptions object that is a copy of
// the EnvOptions in the parameters, but is optimized for writing log files.
// Default implementation returns the copy of the same object.
virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const;
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
// of the EnvOptions in the parameters, but is optimized for writing manifest
// files. Default implementation returns the copy of the same object.
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
const;
private: private:
// No copying allowed // No copying allowed
Env(const Env&); Env(const Env&);

View File

@ -241,10 +241,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) {
} }
EnvOptions EnvOptions::AdaptForLogWrite() const { EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const {
EnvOptions adapted = *this; return env_options;
adapted.use_mmap_writes = false; }
return adapted;
EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const {
return env_options;
} }
EnvOptions::EnvOptions(const Options& options) { EnvOptions::EnvOptions(const Options& options) {

View File

@ -354,9 +354,9 @@ class PosixMmapFile : public WritableFile {
char* dst_; // Where to write next (in range [base_,limit_]) char* dst_; // Where to write next (in range [base_,limit_])
char* last_sync_; // Where have we synced up to char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of base_ in file uint64_t file_offset_; // Offset of base_ in file
// Have we done an munmap of unsynced data? // Have we done an munmap of unsynced data?
bool pending_sync_; bool pending_sync_;
bool fallocate_with_keep_size_;
// Roundup x to a multiple of y // Roundup x to a multiple of y
static size_t Roundup(size_t x, size_t y) { static size_t Roundup(size_t x, size_t y) {
@ -399,7 +399,12 @@ class PosixMmapFile : public WritableFile {
assert(base_ == nullptr); assert(base_ == nullptr);
TEST_KILL_RANDOM(rocksdb_kill_odds); TEST_KILL_RANDOM(rocksdb_kill_odds);
int alloc_status = posix_fallocate(fd_, file_offset_, map_size_); // we can't fallocate with FALLOC_FL_KEEP_SIZE here
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) { if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ + return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status)); "Error : " + strerror(alloc_status));
@ -436,7 +441,8 @@ class PosixMmapFile : public WritableFile {
dst_(nullptr), dst_(nullptr),
last_sync_(nullptr), last_sync_(nullptr),
file_offset_(0), file_offset_(0),
pending_sync_(false) { pending_sync_(false),
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
assert((page_size & (page_size - 1)) == 0); assert((page_size & (page_size - 1)) == 0);
assert(options.use_mmap_writes); assert(options.use_mmap_writes);
} }
@ -584,7 +590,9 @@ class PosixMmapFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) { virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(rocksdb_kill_odds); TEST_KILL_RANDOM(rocksdb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
return Status::OK(); return Status::OK();
} else { } else {
return IOError(filename_, errno); return IOError(filename_, errno);
@ -606,20 +614,22 @@ class PosixWritableFile : public WritableFile {
bool pending_fsync_; bool pending_fsync_;
uint64_t last_sync_size_; uint64_t last_sync_size_;
uint64_t bytes_per_sync_; uint64_t bytes_per_sync_;
bool fallocate_with_keep_size_;
public: public:
PosixWritableFile(const std::string& fname, int fd, size_t capacity, PosixWritableFile(const std::string& fname, int fd, size_t capacity,
const EnvOptions& options) : const EnvOptions& options)
filename_(fname), : filename_(fname),
fd_(fd), fd_(fd),
cursize_(0), cursize_(0),
capacity_(capacity), capacity_(capacity),
buf_(new char[capacity]), buf_(new char[capacity]),
filesize_(0), filesize_(0),
pending_sync_(false), pending_sync_(false),
pending_fsync_(false), pending_fsync_(false),
last_sync_size_(0), last_sync_size_(0),
bytes_per_sync_(options.bytes_per_sync) { bytes_per_sync_(options.bytes_per_sync),
fallocate_with_keep_size_(options.fallocate_with_keep_size) {
assert(!options.use_mmap_writes); assert(!options.use_mmap_writes);
} }
@ -771,7 +781,9 @@ class PosixWritableFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) { virtual Status Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM(rocksdb_kill_odds); TEST_KILL_RANDOM(rocksdb_kill_odds);
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
return Status::OK(); return Status::OK();
} else { } else {
return IOError(filename_, errno); return IOError(filename_, errno);
@ -797,14 +809,15 @@ class PosixRandomRWFile : public RandomRWFile {
int fd_; int fd_;
bool pending_sync_; bool pending_sync_;
bool pending_fsync_; bool pending_fsync_;
bool fallocate_with_keep_size_;
public: public:
PosixRandomRWFile(const std::string& fname, int fd, PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
const EnvOptions& options) : : filename_(fname),
filename_(fname), fd_(fd),
fd_(fd), pending_sync_(false),
pending_sync_(false), pending_fsync_(false),
pending_fsync_(false) { fallocate_with_keep_size_(options.fallocate_with_keep_size) {
assert(!options.use_mmap_writes && !options.use_mmap_reads); assert(!options.use_mmap_writes && !options.use_mmap_reads);
} }
@ -874,7 +887,10 @@ class PosixRandomRWFile : public RandomRWFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) { virtual Status Allocate(off_t offset, off_t len) {
if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { TEST_KILL_RANDOM(rocksdb_kill_odds);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
if (alloc_status == 0) {
return Status::OK(); return Status::OK();
} else { } else {
return IOError(filename_, errno); return IOError(filename_, errno);
@ -1332,6 +1348,20 @@ class PosixEnv : public Env {
return dummy; return dummy;
} }
EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const {
EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.fallocate_with_keep_size = true;
return optimized;
}
EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const {
EnvOptions optimized = env_options;
optimized.use_mmap_writes = false;
optimized.fallocate_with_keep_size = true;
return optimized;
}
private: private:
bool checkedDiskForMmap_; bool checkedDiskForMmap_;
bool forceMmapOff; // do we override Env options? bool forceMmapOff; // do we override Env options?