From 3dafdfb2c49d879c457e726b664ba97b2d73d7a4 Mon Sep 17 00:00:00 2001 From: Chip Turner Date: Tue, 15 Jan 2013 14:05:42 -0800 Subject: [PATCH] Use fallocate to prevent excessive allocation of sst files and logs Summary: On some filesystems, pre-allocation can be a considerable amount of space. xfs in our production environment pre-allocates by 1GB, for instance. By using fallocate to inform the kernel of our expected file sizes, we eliminate this wasteage (that isn't recovered until the file is closed which, in the case of LOG files, can be a considerable amount of time). Test Plan: created an xfs loopback filesystem, mounted with allocsize=4M, and ran db_stress. LOG file without this change was 4M, and with it it was 128k then grew to normal size. Reviewers: dhruba Reviewed By: dhruba CC: adsharma, leveldb Differential Revision: https://reviews.facebook.net/D7953 --- db/db_impl.cc | 11 ++++++++ db/db_test.cc | 29 +++++++++++++++++++++ db/version_set.h | 5 ++-- include/leveldb/env.h | 53 ++++++++++++++++++++++++++++++++++++++- include/leveldb/options.h | 6 +++++ util/env_posix.cc | 18 +++++++++++++ util/options.cc | 8 ++++-- util/posix_logger.h | 28 ++++++++++++++++++--- 8 files changed, 150 insertions(+), 8 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 4c34f3b49..4d508517f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -295,6 +295,7 @@ Status DBImpl::NewDB() { if (!s.ok()) { return s; } + file->SetPreallocationBlockSize(options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -1380,6 +1381,12 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { // Make the output file std::string fname = TableFileName(dbname_, file_number); Status s = env_->NewWritableFile(fname, &compact->outfile); + + // Over-estimate slightly so we don't end up just barely crossing + // the threshold. + compact->outfile->SetPreallocationBlockSize( + 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->level() + 1)); + if (s.ok()) { compact->builder.reset(new TableBuilder(options_, compact->outfile.get(), compact->compaction->level() + 1)); @@ -2105,6 +2112,9 @@ Status DBImpl::MakeRoomForWrite(bool force) { versions_->ReuseFileNumber(new_log_number); break; } + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); logfile_number_ = new_log_number; log_.reset(new log::Writer(std::move(lfile))); imm_.Add(mem_); @@ -2292,6 +2302,7 @@ Status DB::Open(const Options& options, const std::string& dbname, s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), &lfile); if (s.ok()) { + lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); diff --git a/db/db_test.cc b/db/db_test.cc index 8e3fa5ae8..55ecbb292 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -524,6 +524,35 @@ TEST(DBTest, ReadWrite) { } while (ChangeOptions()); } +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + ASSERT_OK(env_->NewWritableFile(src, &srcfile)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + TEST(DBTest, PutDeleteGet) { do { ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); diff --git a/db/version_set.h b/db/version_set.h index 0950342ef..0c86e6a70 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -369,6 +369,9 @@ class VersionSet { // record results in files_by_size_. The largest files are listed first. void UpdateFilesBySize(Version *v); + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level); + private: class Builder; struct ManifestWriter; @@ -400,8 +403,6 @@ class VersionSet { double MaxBytesForLevel(int level); - uint64_t MaxFileSizeForLevel(int level); - int64_t ExpandedCompactionByteSizeLimit(int level); int64_t MaxGrandParentOverlapBytes(int level); diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 811c7eb29..ad770305a 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -230,7 +230,8 @@ class RandomAccessFile { // at a time to the file. class WritableFile { public: - WritableFile() { } + WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) { + } virtual ~WritableFile(); virtual Status Append(const Slice& data) = 0; @@ -255,7 +256,57 @@ class WritableFile { return 0; } + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + protected: + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necesessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + /* + * Pre-allocate space for a file. + */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; // No copying allowed WritableFile(const WritableFile&); void operator=(const WritableFile&); diff --git a/include/leveldb/options.h b/include/leveldb/options.h index f30325fd7..6a090b6b3 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -363,6 +363,12 @@ struct Options { // deleted. // Default : 0 uint64_t WAL_ttl_seconds; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size; }; // Options that control read operations diff --git a/util/env_posix.cc b/util/env_posix.cc index 0247dbf6d..68cabff69 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -232,6 +232,7 @@ class PosixMmapFile : public WritableFile { virtual Status Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); + PrepareWrite(GetFileSize(), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -330,6 +331,14 @@ class PosixMmapFile : public WritableFile { size_t used = dst_ - base_; return file_offset_ + used; } + + virtual Status Allocate(off_t offset, off_t len) { + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } }; // Use posix write to write data to a file. @@ -371,6 +380,7 @@ class PosixWritableFile : public WritableFile { pending_sync_ = true; pending_fsync_ = true; + PrepareWrite(GetFileSize(), left); // if there is no space in the cache, then flush if (cursize_ + left > capacity_) { s = Flush(); @@ -455,6 +465,14 @@ class PosixWritableFile : public WritableFile { virtual uint64_t GetFileSize() { return filesize_; } + + virtual Status Allocate(off_t offset, off_t len) { + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } }; static int LockOrUnlock(const std::string& fname, int fd, bool lock) { diff --git a/util/options.cc b/util/options.cc index 0789f446f..f4a4e27bb 100644 --- a/util/options.cc +++ b/util/options.cc @@ -56,7 +56,9 @@ Options::Options() compaction_filter_args(NULL), CompactionFilter(NULL), disable_auto_compactions(false), - WAL_ttl_seconds(0){ + WAL_ttl_seconds(0), + manifest_preallocation_size(4 * 1024 * 1024) { + } void @@ -144,8 +146,10 @@ Options::Dump(Logger* log) const CompactionFilter); Log(log," Options.disable_auto_compactions: %d", disable_auto_compactions); - Log(log," Options.WAL_ttl_seconds: %ld", + Log(log," Options.WAL_ttl_seconds: %ld", WAL_ttl_seconds); + Log(log," Options.manifest_preallocation_size: %ld", + manifest_preallocation_size); } // Options::Dump } // namespace leveldb diff --git a/util/posix_logger.h b/util/posix_logger.h index 513528314..bb2518ae0 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -12,19 +12,25 @@ #include #include #include +#include +#include +#include #include "leveldb/env.h" namespace leveldb { +const int kDebugLogChunkSize = 128 * 1024; + class PosixLogger : public Logger { private: FILE* file_; uint64_t (*gettid_)(); // Return the thread id for the current thread size_t log_size_; + int fd_; public: PosixLogger(FILE* f, uint64_t (*gettid)()) : - file_(f), gettid_(gettid), log_size_(0) { } + file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)) { } virtual ~PosixLogger() { fclose(file_); } @@ -86,9 +92,25 @@ class PosixLogger : public Logger { } assert(p <= limit); - fwrite(base, 1, p - base, file_); + + // If this write would cross a boundary of kDebugLogChunkSize + // space, pre-allocate more space to avoid overly large + // allocations from filesystem allocsize options. + const size_t write_size = p - base; + const int last_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size_) / kDebugLogChunkSize); + const int desired_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size_ + write_size) / + kDebugLogChunkSize); + if (last_allocation_chunk != desired_allocation_chunk) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, + desired_allocation_chunk * kDebugLogChunkSize); + } + + fwrite(base, 1, write_size, file_); fflush(file_); - log_size_ += (p - base); + + log_size_ += write_size; if (base != buffer) { delete[] base;