diff --git a/db/db_impl.cc b/db/db_impl.cc index 4c34f3b49..4d508517f 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -295,6 +295,7 @@ Status DBImpl::NewDB() { if (!s.ok()) { return s; } + file->SetPreallocationBlockSize(options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; @@ -1380,6 +1381,12 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { // Make the output file std::string fname = TableFileName(dbname_, file_number); Status s = env_->NewWritableFile(fname, &compact->outfile); + + // Over-estimate slightly so we don't end up just barely crossing + // the threshold. + compact->outfile->SetPreallocationBlockSize( + 1.1 * versions_->MaxFileSizeForLevel(compact->compaction->level() + 1)); + if (s.ok()) { compact->builder.reset(new TableBuilder(options_, compact->outfile.get(), compact->compaction->level() + 1)); @@ -2105,6 +2112,9 @@ Status DBImpl::MakeRoomForWrite(bool force) { versions_->ReuseFileNumber(new_log_number); break; } + // Our final size should be less than write_buffer_size + // (compression, etc) but err on the side of caution. + lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); logfile_number_ = new_log_number; log_.reset(new log::Writer(std::move(lfile))); imm_.Add(mem_); @@ -2292,6 +2302,7 @@ Status DB::Open(const Options& options, const std::string& dbname, s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), &lfile); if (s.ok()) { + lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); diff --git a/db/db_test.cc b/db/db_test.cc index 8e3fa5ae8..55ecbb292 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -524,6 +524,35 @@ TEST(DBTest, ReadWrite) { } while (ChangeOptions()); } +TEST(DBTest, Preallocation) { + const std::string src = dbname_ + "/alloc_test"; + unique_ptr srcfile; + ASSERT_OK(env_->NewWritableFile(src, &srcfile)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + TEST(DBTest, PutDeleteGet) { do { ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); diff --git a/db/version_set.h b/db/version_set.h index 0950342ef..0c86e6a70 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -369,6 +369,9 @@ class VersionSet { // record results in files_by_size_. The largest files are listed first. void UpdateFilesBySize(Version *v); + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level); + private: class Builder; struct ManifestWriter; @@ -400,8 +403,6 @@ class VersionSet { double MaxBytesForLevel(int level); - uint64_t MaxFileSizeForLevel(int level); - int64_t ExpandedCompactionByteSizeLimit(int level); int64_t MaxGrandParentOverlapBytes(int level); diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 811c7eb29..ad770305a 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -230,7 +230,8 @@ class RandomAccessFile { // at a time to the file. class WritableFile { public: - WritableFile() { } + WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) { + } virtual ~WritableFile(); virtual Status Append(const Slice& data) = 0; @@ -255,7 +256,57 @@ class WritableFile { return 0; } + /* + * Get and set the default pre-allocation block size for writes to + * this file. If non-zero, then Allocate will be used to extend the + * underlying storage of a file (generally via fallocate) if the Env + * instance supports it. + */ + void SetPreallocationBlockSize(size_t size) { + preallocation_block_size_ = size; + } + + virtual void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) { + *last_allocated_block = last_preallocated_block_; + *block_size = preallocation_block_size_; + } + + protected: + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + void PrepareWrite(size_t offset, size_t len) { + if (preallocation_block_size_ == 0) { + return; + } + // If this write would cross one or more preallocation blocks, + // determine what the last preallocation block necesessary to + // cover this write would be and Allocate to that point. + const auto block_size = preallocation_block_size_; + size_t new_last_preallocated_block = + (offset + len + block_size - 1) / block_size; + if (new_last_preallocated_block > last_preallocated_block_) { + size_t num_spanned_blocks = + new_last_preallocated_block - last_preallocated_block_; + Allocate(block_size * last_preallocated_block_, + block_size * num_spanned_blocks); + last_preallocated_block_ = new_last_preallocated_block; + } + } + + /* + * Pre-allocate space for a file. + */ + virtual Status Allocate(off_t offset, off_t len) { + return Status::OK(); + } + private: + size_t last_preallocated_block_; + size_t preallocation_block_size_; // No copying allowed WritableFile(const WritableFile&); void operator=(const WritableFile&); diff --git a/include/leveldb/options.h b/include/leveldb/options.h index f30325fd7..6a090b6b3 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -363,6 +363,12 @@ struct Options { // deleted. // Default : 0 uint64_t WAL_ttl_seconds; + + // Number of bytes to preallocate (via fallocate) the manifest + // files. Default is 4mb, which is reasonable to reduce random IO + // as well as prevent overallocation for mounts that preallocate + // large amounts of data (such as xfs's allocsize option). + size_t manifest_preallocation_size; }; // Options that control read operations diff --git a/util/env_posix.cc b/util/env_posix.cc index 0247dbf6d..68cabff69 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -232,6 +232,7 @@ class PosixMmapFile : public WritableFile { virtual Status Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); + PrepareWrite(GetFileSize(), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -330,6 +331,14 @@ class PosixMmapFile : public WritableFile { size_t used = dst_ - base_; return file_offset_ + used; } + + virtual Status Allocate(off_t offset, off_t len) { + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } }; // Use posix write to write data to a file. @@ -371,6 +380,7 @@ class PosixWritableFile : public WritableFile { pending_sync_ = true; pending_fsync_ = true; + PrepareWrite(GetFileSize(), left); // if there is no space in the cache, then flush if (cursize_ + left > capacity_) { s = Flush(); @@ -455,6 +465,14 @@ class PosixWritableFile : public WritableFile { virtual uint64_t GetFileSize() { return filesize_; } + + virtual Status Allocate(off_t offset, off_t len) { + if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } + } }; static int LockOrUnlock(const std::string& fname, int fd, bool lock) { diff --git a/util/options.cc b/util/options.cc index 0789f446f..f4a4e27bb 100644 --- a/util/options.cc +++ b/util/options.cc @@ -56,7 +56,9 @@ Options::Options() compaction_filter_args(NULL), CompactionFilter(NULL), disable_auto_compactions(false), - WAL_ttl_seconds(0){ + WAL_ttl_seconds(0), + manifest_preallocation_size(4 * 1024 * 1024) { + } void @@ -144,8 +146,10 @@ Options::Dump(Logger* log) const CompactionFilter); Log(log," Options.disable_auto_compactions: %d", disable_auto_compactions); - Log(log," Options.WAL_ttl_seconds: %ld", + Log(log," Options.WAL_ttl_seconds: %ld", WAL_ttl_seconds); + Log(log," Options.manifest_preallocation_size: %ld", + manifest_preallocation_size); } // Options::Dump } // namespace leveldb diff --git a/util/posix_logger.h b/util/posix_logger.h index 513528314..bb2518ae0 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -12,19 +12,25 @@ #include #include #include +#include +#include +#include #include "leveldb/env.h" namespace leveldb { +const int kDebugLogChunkSize = 128 * 1024; + class PosixLogger : public Logger { private: FILE* file_; uint64_t (*gettid_)(); // Return the thread id for the current thread size_t log_size_; + int fd_; public: PosixLogger(FILE* f, uint64_t (*gettid)()) : - file_(f), gettid_(gettid), log_size_(0) { } + file_(f), gettid_(gettid), log_size_(0), fd_(fileno(f)) { } virtual ~PosixLogger() { fclose(file_); } @@ -86,9 +92,25 @@ class PosixLogger : public Logger { } assert(p <= limit); - fwrite(base, 1, p - base, file_); + + // If this write would cross a boundary of kDebugLogChunkSize + // space, pre-allocate more space to avoid overly large + // allocations from filesystem allocsize options. + const size_t write_size = p - base; + const int last_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size_) / kDebugLogChunkSize); + const int desired_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size_ + write_size) / + kDebugLogChunkSize); + if (last_allocation_chunk != desired_allocation_chunk) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, + desired_allocation_chunk * kDebugLogChunkSize); + } + + fwrite(base, 1, write_size, file_); fflush(file_); - log_size_ += (p - base); + + log_size_ += write_size; if (base != buffer) { delete[] base;