Use posix_fallocate as default.

Summary:
ftruncate does not report an error when the disk is full. This causes a
SIGBUS when the database issues a Put call on a full disk.

Use posix_fallocate for allocation instead of truncate.
Add a check to use mmapped files only on ext4, xfs and tmpfs, as
posix_fallocate is very slow on ext3 and older.

Test Plan: make all check

Reviewers: dhruba, chip

Reviewed By: dhruba

CC: adsharma, leveldb

Differential Revision: https://reviews.facebook.net/D9291
This commit is contained in:
Abhishek Kona 2013-03-13 13:50:26 -07:00
parent 4e581c6ab4
commit 1ba5abca97
2 changed files with 56 additions and 9 deletions

2
README
View File

@ -1,7 +1,7 @@
rocksdb: A persistent key-value store for flash storage rocksdb: A persistent key-value store for flash storage
Authors: The Facebook Database Engineering Team Authors: The Facebook Database Engineering Team
This code is a library that forms the core building block for a fast This code is a library that forms the core building block for a fast
key value server, especially suited for storing data on flash drives. key value server, especially suited for storing data on flash drives.
It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF) between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)

View File

@ -14,8 +14,10 @@
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/vfs.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#if defined(OS_LINUX) #if defined(OS_LINUX)
@ -31,6 +33,16 @@
#include "util/logging.h" #include "util/logging.h"
#include "util/posix_logger.h" #include "util/posix_logger.h"
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
#endif
#if !defined(XFS_SUPER_MAGIC)
#define XFS_SUPER_MAGIC 0x58465342
#endif
#if !defined(EXT4_SUPER_MAGIC)
#define EXT4_SUPER_MAGIC 0xEF53
#endif
bool useOsBuffer = 1; // cache data in OS buffers bool useOsBuffer = 1; // cache data in OS buffers
bool useFsReadAhead = 1; // allow filesystem to do readaheads bool useFsReadAhead = 1; // allow filesystem to do readaheads
bool useMmapRead = 0; // do not use mmaps for reading files bool useMmapRead = 0; // do not use mmaps for reading files
@ -224,21 +236,26 @@ class PosixMmapFile : public WritableFile {
return result; return result;
} }
bool MapNewRegion() { Status MapNewRegion() {
assert(base_ == nullptr); assert(base_ == nullptr);
if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
return false; int alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
} }
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
fd_, file_offset_); fd_, file_offset_);
if (ptr == MAP_FAILED) { if (ptr == MAP_FAILED) {
return false; return Status::IOError("MMap failed on " + filename_);
} }
base_ = reinterpret_cast<char*>(ptr); base_ = reinterpret_cast<char*>(ptr);
limit_ = base_ + map_size_; limit_ = base_ + map_size_;
dst_ = base_; dst_ = base_;
last_sync_ = base_; last_sync_ = base_;
return true; return Status::OK();
} }
public: public:
@ -272,9 +289,11 @@ class PosixMmapFile : public WritableFile {
assert(dst_ <= limit_); assert(dst_ <= limit_);
size_t avail = limit_ - dst_; size_t avail = limit_ - dst_;
if (avail == 0) { if (avail == 0) {
if (!UnmapCurrentRegion() || if (UnmapCurrentRegion()) {
!MapNewRegion()) { Status s = MapNewRegion();
return IOError(filename_, errno); if (!s.ok()) {
return s;
}
} }
} }
@ -614,6 +633,15 @@ class PosixEnv : public Env {
if (fd < 0) { if (fd < 0) {
s = IOError(fname, errno); s = IOError(fname, errno);
} else { } else {
if (!checkedDiskForMmap_) {
// this will be executed once in the program's lifetime.
if (useMmapWrite) {
// do not use mmapWrite on non ext4/xfs/tmpfs systems.
useMmapWrite = SupportsFastAllocate(fname);
}
checkedDiskForMmap_ = true;
}
if (useMmapWrite) { if (useMmapWrite) {
result->reset(new PosixMmapFile(fname, fd, page_size_)); result->reset(new PosixMmapFile(fname, fd, page_size_));
} else { } else {
@ -851,6 +879,8 @@ class PosixEnv : public Env {
} }
private: private:
bool checkedDiskForMmap_ = false;
void PthreadCall(const char* label, int result) { void PthreadCall(const char* label, int result) {
if (result != 0) { if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
@ -875,6 +905,23 @@ class PosixEnv : public Env {
return nullptr; return nullptr;
} }
// Reports whether the filesystem containing `path` has one of the type
// magics this file accepts: EXT4_SUPER_MAGIC, XFS_SUPER_MAGIC or TMPFS_MAGIC.
//
// path: a file or directory on the filesystem to inspect (passed to statfs).
// Returns true only if statfs() succeeds and f_type matches one of the three
// magics; returns false if statfs() fails or the type is anything else.
//
// NOTE(review): on Linux the ext2/ext3/ext4 superblock magics appear to share
// the same value (0xEF53), so this test may not exclude ext3 -- confirm.
bool SupportsFastAllocate(const std::string& path) {
  struct statfs fs_info;
  const bool stat_ok = (statfs(path.c_str(), &fs_info) == 0);
  if (!stat_ok) {
    // Could not query the filesystem; treat it as unsupported.
    return false;
  }
  const auto fs_type = fs_info.f_type;
  return fs_type == EXT4_SUPER_MAGIC ||
         fs_type == XFS_SUPER_MAGIC ||
         fs_type == TMPFS_MAGIC;
}
size_t page_size_; size_t page_size_;
pthread_mutex_t mu_; pthread_mutex_t mu_;
pthread_cond_t bgsignal_; pthread_cond_t bgsignal_;