From 1ba5abca97b4728b2b6b0d2f9395ebf387f9a2ae Mon Sep 17 00:00:00 2001 From: Abhishek Kona Date: Wed, 13 Mar 2013 13:50:26 -0700 Subject: [PATCH] Use posix_fallocate as default. Summary: ftruncate does not report an error when the disk is full. This causes a SIGBUS when the database tries to issue a Put call on a full disk. Use posix_fallocate for allocation instead of ftruncate. Add a check to use mmapped files only on ext4, xfs and tmpfs, as posix_fallocate is very slow on ext3 and older. Test Plan: make all check Reviewers: dhruba, chip Reviewed By: dhruba CC: adsharma, leveldb Differential Revision: https://reviews.facebook.net/D9291 --- README | 2 +- util/env_posix.cc | 63 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/README b/README index 783332c44..4ac8f93cb 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ rocksdb: A persistent key-value store for flash storage Authors: The Facebook Database Engineering Team -This code is a library that forms the core building block for a fast +This code is a library that forms the core building block for a fast key value server, especially suited for storing data on flash drives. 
It has an Log-Stuctured-Merge-Database (LSM) design with flexible tradeoffs between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF) diff --git a/util/env_posix.cc b/util/env_posix.cc index e9b561ab9..103cadcf5 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -14,8 +14,10 @@ #include #include #include +#include #include #include +#include #include #include #if defined(OS_LINUX) @@ -31,6 +33,16 @@ #include "util/logging.h" #include "util/posix_logger.h" +#if !defined(TMPFS_MAGIC) +#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + bool useOsBuffer = 1; // cache data in OS buffers bool useFsReadAhead = 1; // allow filesystem to do readaheads bool useMmapRead = 0; // do not use mmaps for reading files @@ -224,21 +236,26 @@ class PosixMmapFile : public WritableFile { return result; } - bool MapNewRegion() { + Status MapNewRegion() { assert(base_ == nullptr); - if (ftruncate(fd_, file_offset_ + map_size_) < 0) { - return false; + + int alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + if (alloc_status != 0) { + return Status::IOError("Error allocating space to file : " + filename_ + + "Error : " + strerror(alloc_status)); } + + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, file_offset_); if (ptr == MAP_FAILED) { - return false; + return Status::IOError("MMap failed on " + filename_); } base_ = reinterpret_cast(ptr); limit_ = base_ + map_size_; dst_ = base_; last_sync_ = base_; - return true; + return Status::OK(); } public: @@ -272,9 +289,11 @@ class PosixMmapFile : public WritableFile { assert(dst_ <= limit_); size_t avail = limit_ - dst_; if (avail == 0) { - if (!UnmapCurrentRegion() || - !MapNewRegion()) { - return IOError(filename_, errno); + if (UnmapCurrentRegion()) { + Status s = MapNewRegion(); + if (!s.ok()) { + return s; + } } } @@ -614,6 +633,15 @@ class 
PosixEnv : public Env { if (fd < 0) { s = IOError(fname, errno); } else { + if (!checkedDiskForMmap_) { + // this will be executed once in the program's lifetime. + if (useMmapWrite) { + // do not use mmapWrite on filesystems other than ext4/xfs/tmpfs. + useMmapWrite = SupportsFastAllocate(fname); + } + checkedDiskForMmap_ = true; + } + if (useMmapWrite) { result->reset(new PosixMmapFile(fname, fd, page_size_)); } else { @@ -851,6 +879,8 @@ class PosixEnv : public Env { } private: + bool checkedDiskForMmap_ = false; + void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); @@ -875,6 +905,23 @@ class PosixEnv : public Env { return nullptr; } + bool SupportsFastAllocate(const std::string& path) { + struct statfs s; + if (statfs(path.c_str(), &s)){ + return false; + } + switch (s.f_type) { + case EXT4_SUPER_MAGIC: + return true; + case XFS_SUPER_MAGIC: + return true; + case TMPFS_MAGIC: + return true; + default: + return false; + } + } + size_t page_size_; pthread_mutex_t mu_; pthread_cond_t bgsignal_;