Direct IO capability for RocksDB

Summary:
This patch adds direct IO capability to RocksDB Env.

The direct IO capability is required for persistent cache since NVM is best
accessed as 4K direct IO. SSDs can leverage direct IO for reading.

Direct IO requires the offset and size be sector size aligned, and memory to
be kernel page aligned. Since neither RocksDB/Persistent read cache data
layout is aligned to sector size, the code can accommodate reading unaligned IO size
(or unaligned memory) at the cost of an alloc/copy.

The write code path expects the size and memory to be aligned.

Test Plan: Run RocksDB unit tests

Reviewers: sdong

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D57393
This commit is contained in:
krad 2016-04-21 10:37:27 -07:00
parent 8f1214531e
commit f89caa127b
5 changed files with 605 additions and 247 deletions

View File

@ -68,6 +68,12 @@ struct EnvOptions {
// If true, then use mmap to write data // If true, then use mmap to write data
bool use_mmap_writes = true; bool use_mmap_writes = true;
// If true, then use O_DIRECT for reading data
bool use_direct_reads = false;
// If true, then use O_DIRECT for writing data
bool use_direct_writes = false;
// If false, fallocate() calls are bypassed // If false, fallocate() calls are bypassed
bool allow_fallocate = true; bool allow_fallocate = true;

View File

@ -152,6 +152,17 @@ class PosixEnv : public Env {
if (f == nullptr) { if (f == nullptr) {
*result = nullptr; *result = nullptr;
return IOError(fname, errno); return IOError(fname, errno);
} else if (options.use_direct_reads && !options.use_mmap_writes) {
int flags = O_RDONLY | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags);
int fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
return IOError(fname, errno);
}
std::unique_ptr<PosixDirectIOSequentialFile> file(
new PosixDirectIOSequentialFile(fname, fd));
*result = std::move(file);
return Status::OK();
} else { } else {
int fd = fileno(f); int fd = fileno(f);
SetFD_CLOEXEC(fd, &options); SetFD_CLOEXEC(fd, &options);
@ -189,6 +200,18 @@ class PosixEnv : public Env {
} }
} }
close(fd); close(fd);
} else if (options.use_direct_reads) {
int flags = O_RDONLY | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
s = IOError(fname, errno);
} else {
std::unique_ptr<PosixDirectIORandomAccessFile> file(
new PosixDirectIORandomAccessFile(fname, fd));
*result = std::move(file);
s = Status::OK();
}
} else { } else {
result->reset(new PosixRandomAccessFile(fname, fd, options)); result->reset(new PosixRandomAccessFile(fname, fd, options));
} }
@ -221,6 +244,18 @@ class PosixEnv : public Env {
} }
if (options.use_mmap_writes && !forceMmapOff) { if (options.use_mmap_writes && !forceMmapOff) {
result->reset(new PosixMmapFile(fname, fd, page_size_, options)); result->reset(new PosixMmapFile(fname, fd, page_size_, options));
} else if (options.use_direct_writes) {
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT | O_DIRECT;
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
s = IOError(fname, errno);
} else {
std::unique_ptr<PosixDirectIOWritableFile> file(
new PosixDirectIOWritableFile(fname, fd));
*result = std::move(file);
s = Status::OK();
}
} else { } else {
// disable mmap writes // disable mmap writes
EnvOptions no_mmap_writes_options = options; EnvOptions no_mmap_writes_options = options;
@ -763,6 +798,9 @@ std::string Env::GenerateUniqueId() {
return uuid2; return uuid2;
} }
//
// Default Posix Env
//
Env* Env::Default() { Env* Env::Default() {
// The following function call initializes the singletons of ThreadLocalPtr // The following function call initializes the singletons of ThreadLocalPtr
// right before the static default_env. This guarantees default_env will // right before the static default_env. This guarantees default_env will

View File

@ -36,6 +36,7 @@
#include "util/log_buffer.h" #include "util/log_buffer.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/string_util.h" #include "util/string_util.h"
#include "util/sync_point.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
@ -43,6 +44,17 @@ namespace rocksdb {
static const int kDelayMicros = 100000; static const int kDelayMicros = 100000;
std::unique_ptr<char, void (&)(void*)> NewAligned(const size_t size,
const char ch) {
char* ptr = nullptr;
if (posix_memalign(reinterpret_cast<void**>(&ptr), 4 * 1024, size) != 0) {
return std::unique_ptr<char, void (&)(void*)>(nullptr, free);
}
std::unique_ptr<char, void (&)(void*)> uptr(ptr, free);
memset(uptr.get(), ch, size);
return uptr;
}
class EnvPosixTest : public testing::Test { class EnvPosixTest : public testing::Test {
private: private:
port::Mutex mu_; port::Mutex mu_;
@ -553,8 +565,10 @@ class IoctlFriendlyTmpdir {
// Only works in linux platforms // Only works in linux platforms
TEST_F(EnvPosixTest, RandomAccessUniqueID) { TEST_F(EnvPosixTest, RandomAccessUniqueID) {
for (bool directio : {true, false}) {
// Create file. // Create file.
const EnvOptions soptions; EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
IoctlFriendlyTmpdir ift; IoctlFriendlyTmpdir ift;
std::string fname = ift.name() + "/testfile"; std::string fname = ift.name() + "/testfile";
unique_ptr<WritableFile> wfile; unique_ptr<WritableFile> wfile;
@ -590,15 +604,18 @@ TEST_F(EnvPosixTest, RandomAccessUniqueID) {
// Delete the file // Delete the file
env_->DeleteFile(fname); env_->DeleteFile(fname);
}
} }
// only works in linux platforms // only works in linux platforms
#ifdef ROCKSDB_FALLOCATE_PRESENT #ifdef ROCKSDB_FALLOCATE_PRESENT
TEST_F(EnvPosixTest, AllocateTest) { TEST_F(EnvPosixTest, AllocateTest) {
for (bool directio : {true, false}) {
IoctlFriendlyTmpdir ift; IoctlFriendlyTmpdir ift;
std::string fname = ift.name() + "/preallocate_testfile"; std::string fname = ift.name() + "/preallocate_testfile";
// Try fallocate in a file to see whether the target file system supports it. // Try fallocate in a file to see whether the target file system supports
// it.
// Skip the test if fallocate is not supported. // Skip the test if fallocate is not supported.
std::string fname_test_fallocate = ift.name() + "/preallocate_testfile_2"; std::string fname_test_fallocate = ift.name() + "/preallocate_testfile_2";
int fd = -1; int fd = -1;
@ -623,6 +640,7 @@ TEST_F(EnvPosixTest, AllocateTest) {
EnvOptions soptions; EnvOptions soptions;
soptions.use_mmap_writes = false; soptions.use_mmap_writes = false;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
unique_ptr<WritableFile> wfile; unique_ptr<WritableFile> wfile;
ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
@ -630,18 +648,21 @@ TEST_F(EnvPosixTest, AllocateTest) {
size_t kPreallocateSize = 100 * 1024 * 1024; size_t kPreallocateSize = 100 * 1024 * 1024;
size_t kBlockSize = 512; size_t kBlockSize = 512;
size_t kPageSize = 4096; size_t kPageSize = 4096;
std::string data(1024 * 1024, 'a'); size_t kDataSize = 1024 * 1024;
auto data_ptr = NewAligned(kDataSize, 'A');
Slice data(data_ptr.get(), kDataSize);
wfile->SetPreallocationBlockSize(kPreallocateSize); wfile->SetPreallocationBlockSize(kPreallocateSize);
wfile->PrepareWrite(wfile->GetFileSize(), data.size()); wfile->PrepareWrite(wfile->GetFileSize(), kDataSize);
ASSERT_OK(wfile->Append(Slice(data))); ASSERT_OK(wfile->Append(data));
ASSERT_OK(wfile->Flush()); ASSERT_OK(wfile->Flush());
struct stat f_stat; struct stat f_stat;
stat(fname.c_str(), &f_stat); ASSERT_EQ(stat(fname.c_str(), &f_stat), 0);
ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
// verify that blocks are preallocated // verify that blocks are preallocated
// Note here that we don't check the exact number of blocks preallocated -- // Note here that we don't check the exact number of blocks preallocated --
// we only require that number of allocated blocks is at least what we expect. // we only require that number of allocated blocks is at least what we
// expect.
// It looks like some FS give us more blocks that we asked for. That's fine. // It looks like some FS give us more blocks that we asked for. That's fine.
// It might be worth investigating further. // It might be worth investigating further.
ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks); ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks);
@ -650,11 +671,13 @@ TEST_F(EnvPosixTest, AllocateTest) {
wfile.reset(); wfile.reset();
stat(fname.c_str(), &f_stat); stat(fname.c_str(), &f_stat);
ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); ASSERT_EQ((unsigned int)kDataSize, f_stat.st_size);
// verify that preallocated blocks were deallocated on file close // verify that preallocated blocks were deallocated on file close
// Because the FS might give us more blocks, we add a full page to the size // Because the FS might give us more blocks, we add a full page to the size
// and expect the number of blocks to be less or equal to that. // and expect the number of blocks to be less or equal to that.
ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks); ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize,
(unsigned int)f_stat.st_blocks);
}
} }
#endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_FALLOCATE_PRESENT
@ -675,8 +698,10 @@ bool HasPrefix(const std::unordered_set<std::string>& ss) {
// Only works in linux platforms // Only works in linux platforms
TEST_F(EnvPosixTest, RandomAccessUniqueIDConcurrent) { TEST_F(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
for (bool directio : {true, false}) {
// Check whether a bunch of concurrently existing files have unique IDs. // Check whether a bunch of concurrently existing files have unique IDs.
const EnvOptions soptions; EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
// Create the files // Create the files
IoctlFriendlyTmpdir ift; IoctlFriendlyTmpdir ift;
@ -691,7 +716,7 @@ TEST_F(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
// Collect and check whether the IDs are unique. // Collect and check whether the IDs are unique.
std::unordered_set<std::string> ids; std::unordered_set<std::string> ids;
for (const std::string fname: fnames) { for (const std::string fname : fnames) {
unique_ptr<RandomAccessFile> file; unique_ptr<RandomAccessFile> file;
std::string unique_id; std::string unique_id;
ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
@ -705,21 +730,25 @@ TEST_F(EnvPosixTest, RandomAccessUniqueIDConcurrent) {
} }
// Delete the files // Delete the files
for (const std::string fname: fnames) { for (const std::string fname : fnames) {
ASSERT_OK(env_->DeleteFile(fname)); ASSERT_OK(env_->DeleteFile(fname));
} }
ASSERT_TRUE(!HasPrefix(ids)); ASSERT_TRUE(!HasPrefix(ids));
}
} }
// Only works in linux platforms // Only works in linux platforms
TEST_F(EnvPosixTest, RandomAccessUniqueIDDeletes) { TEST_F(EnvPosixTest, RandomAccessUniqueIDDeletes) {
const EnvOptions soptions; for (bool directio : {true, false}) {
EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
IoctlFriendlyTmpdir ift; IoctlFriendlyTmpdir ift;
std::string fname = ift.name() + "/" + "testfile"; std::string fname = ift.name() + "/" + "testfile";
// Check that after file is deleted we don't get same ID again in a new file. // Check that after file is deleted we don't get same ID again in a new
// file.
std::unordered_set<std::string> ids; std::unordered_set<std::string> ids;
for (int i = 0; i < 1000; ++i) { for (int i = 0; i < 1000; ++i) {
// Create file. // Create file.
@ -747,18 +776,34 @@ TEST_F(EnvPosixTest, RandomAccessUniqueIDDeletes) {
} }
ASSERT_TRUE(!HasPrefix(ids)); ASSERT_TRUE(!HasPrefix(ids));
}
} }
// Only works in linux platforms // Only works in linux platforms
TEST_P(EnvPosixTestWithParam, InvalidateCache) { TEST_P(EnvPosixTestWithParam, InvalidateCache) {
const EnvOptions soptions; rocksdb::SyncPoint::GetInstance()->EnableProcessing();
for (bool directio : {true, false}) {
EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
std::string fname = test::TmpDir(env_) + "/" + "testfile"; std::string fname = test::TmpDir(env_) + "/" + "testfile";
const size_t kSectorSize = 512;
auto data = NewAligned(kSectorSize, 'A');
Slice slice(data.get(), kSectorSize);
// Create file. // Create file.
{ {
unique_ptr<WritableFile> wfile; unique_ptr<WritableFile> wfile;
if (soptions.use_direct_writes) {
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewWritableFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
}
ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions));
ASSERT_OK(wfile.get()->Append(Slice("Hello world"))); ASSERT_OK(wfile.get()->Append(slice));
ASSERT_OK(wfile.get()->InvalidateCache(0, 0)); ASSERT_OK(wfile.get()->InvalidateCache(0, 0));
ASSERT_OK(wfile.get()->Close()); ASSERT_OK(wfile.get()->Close());
} }
@ -766,11 +811,19 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) {
// Random Read // Random Read
{ {
unique_ptr<RandomAccessFile> file; unique_ptr<RandomAccessFile> file;
char scratch[100]; char scratch[kSectorSize];
Slice result; Slice result;
if (soptions.use_direct_reads) {
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewRandomAccessFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
}
ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
ASSERT_OK(file.get()->Read(0, 11, &result, scratch)); ASSERT_OK(file.get()->Read(0, kSectorSize, &result, scratch));
ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); ASSERT_EQ(memcmp(scratch, data.get(), kSectorSize), 0);
ASSERT_OK(file.get()->InvalidateCache(0, 11)); ASSERT_OK(file.get()->InvalidateCache(0, 11));
ASSERT_OK(file.get()->InvalidateCache(0, 0)); ASSERT_OK(file.get()->InvalidateCache(0, 0));
} }
@ -778,16 +831,26 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) {
// Sequential Read // Sequential Read
{ {
unique_ptr<SequentialFile> file; unique_ptr<SequentialFile> file;
char scratch[100]; char scratch[kSectorSize];
Slice result; Slice result;
if (soptions.use_direct_reads) {
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewSequentialFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
}
ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions)); ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions));
ASSERT_OK(file.get()->Read(11, &result, scratch)); ASSERT_OK(file.get()->Read(kSectorSize, &result, scratch));
ASSERT_EQ(memcmp(scratch, "Hello world", 11), 0); ASSERT_EQ(memcmp(scratch, data.get(), kSectorSize), 0);
ASSERT_OK(file.get()->InvalidateCache(0, 11)); ASSERT_OK(file.get()->InvalidateCache(0, 11));
ASSERT_OK(file.get()->InvalidateCache(0, 0)); ASSERT_OK(file.get()->InvalidateCache(0, 0));
} }
// Delete the file // Delete the file
ASSERT_OK(env_->DeleteFile(fname)); ASSERT_OK(env_->DeleteFile(fname));
}
rocksdb::SyncPoint::GetInstance()->ClearTrace();
} }
#endif // not TRAVIS #endif // not TRAVIS
#endif // OS_LINUX #endif // OS_LINUX
@ -909,9 +972,20 @@ TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) {
} }
TEST_P(EnvPosixTestWithParam, Preallocation) { TEST_P(EnvPosixTestWithParam, Preallocation) {
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
for (bool directio : {true, false}) {
const std::string src = test::TmpDir(env_) + "/" + "testfile"; const std::string src = test::TmpDir(env_) + "/" + "testfile";
unique_ptr<WritableFile> srcfile; unique_ptr<WritableFile> srcfile;
const EnvOptions soptions; EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
if (soptions.use_direct_writes) {
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewWritableFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
}
ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions));
srcfile->SetPreallocationBlockSize(1024 * 1024); srcfile->SetPreallocationBlockSize(1024 * 1024);
@ -921,31 +995,44 @@ TEST_P(EnvPosixTestWithParam, Preallocation) {
ASSERT_EQ(last_allocated_block, 0UL); ASSERT_EQ(last_allocated_block, 0UL);
// Small write should preallocate one block // Small write should preallocate one block
std::string str = "test"; size_t kStrSize = 512;
srcfile->PrepareWrite(srcfile->GetFileSize(), str.size()); auto data = NewAligned(kStrSize, 'A');
Slice str(data.get(), kStrSize);
srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize);
srcfile->Append(str); srcfile->Append(str);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 1UL); ASSERT_EQ(last_allocated_block, 1UL);
// Write an entire preallocation block, make sure we increased by two. // Write an entire preallocation block, make sure we increased by two.
std::string buf(block_size, ' '); {
srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); auto buf_ptr = NewAligned(block_size, ' ');
Slice buf(buf_ptr.get(), block_size);
srcfile->PrepareWrite(srcfile->GetFileSize(), block_size);
srcfile->Append(buf); srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 2UL); ASSERT_EQ(last_allocated_block, 2UL);
}
// Write five more blocks at once, ensure we're where we need to be. // Write five more blocks at once, ensure we're where we need to be.
buf = std::string(block_size * 5, ' '); {
auto buf_ptr = NewAligned(block_size * 5, ' ');
Slice buf = Slice(buf_ptr.get(), block_size * 5);
srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
srcfile->Append(buf); srcfile->Append(buf);
srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
ASSERT_EQ(last_allocated_block, 7UL); ASSERT_EQ(last_allocated_block, 7UL);
}
}
rocksdb::SyncPoint::GetInstance()->ClearTrace();
} }
// Test that the two ways to get children file attributes (in bulk or // Test that the two ways to get children file attributes (in bulk or
// individually) behave consistently. // individually) behave consistently.
TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
const EnvOptions soptions; rocksdb::SyncPoint::GetInstance()->EnableProcessing();
for (bool directio : {true, false}) {
EnvOptions soptions;
soptions.use_direct_reads = soptions.use_direct_writes = directio;
const int kNumChildren = 10; const int kNumChildren = 10;
std::string data; std::string data;
@ -954,9 +1041,19 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
oss << test::TmpDir(env_) << "/testfile_" << i; oss << test::TmpDir(env_) << "/testfile_" << i;
const std::string path = oss.str(); const std::string path = oss.str();
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
if (soptions.use_direct_writes) {
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewWritableFile:O_DIRECT", [&](void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
});
}
ASSERT_OK(env_->NewWritableFile(path, &file, soptions)); ASSERT_OK(env_->NewWritableFile(path, &file, soptions));
file->Append(data); auto buf_ptr = NewAligned(data.size(), 'T');
data.append("test"); Slice buf(buf_ptr.get(), data.size());
file->Append(buf);
data.append(std::string(512, 'T'));
} }
std::vector<Env::FileAttributes> file_attrs; std::vector<Env::FileAttributes> file_attrs;
@ -973,9 +1070,11 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
ASSERT_TRUE(file_attrs_iter != file_attrs.end()); ASSERT_TRUE(file_attrs_iter != file_attrs.end());
uint64_t size; uint64_t size;
ASSERT_OK(env_->GetFileSize(path, &size)); ASSERT_OK(env_->GetFileSize(path, &size));
ASSERT_EQ(size, 4 * i); ASSERT_EQ(size, 512 * i);
ASSERT_EQ(size, file_attrs_iter->size_bytes); ASSERT_EQ(size, file_attrs_iter->size_bytes);
} }
}
rocksdb::SyncPoint::GetInstance()->ClearTrace();
} }
// Test that all WritableFileWrapper forwards all calls to WritableFile. // Test that all WritableFileWrapper forwards all calls to WritableFile.

View File

@ -12,6 +12,7 @@
#include "util/io_posix.h" #include "util/io_posix.h"
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
#include <algorithm>
#if defined(OS_LINUX) #if defined(OS_LINUX)
#include <linux/fs.h> #include <linux/fs.h>
#endif #endif
@ -46,6 +47,112 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
#endif #endif
} }
/*
* DirectIOHelper
*/
namespace {
const size_t kSectorSize = 512;
#ifdef OS_LINUX
const size_t kPageSize = sysconf(_SC_PAGESIZE);
#else
const size_t kPageSize = 4 * 1024;
#endif
std::unique_ptr<void, void (&)(void*)> NewAligned(const size_t size) {
void* ptr = nullptr;
if (posix_memalign(&ptr, 4 * 1024, size) != 0) {
return std::unique_ptr<char, void (&)(void*)>(nullptr, free);
}
std::unique_ptr<void, void (&)(void*)> uptr(ptr, free);
return uptr;
}
size_t Upper(const size_t size, const size_t fac) {
if (size % fac == 0) {
return size;
}
return size + (fac - size % fac);
}
size_t Lower(const size_t size, const size_t fac) {
if (size % fac == 0) {
return size;
}
return size - (size % fac);
}
bool IsSectorAligned(const size_t off) { return off % kSectorSize == 0; }
static bool IsPageAligned(const void* ptr) {
return uintptr_t(ptr) % (kPageSize) == 0;
}
Status ReadAligned(int fd, Slice* data, const uint64_t offset,
const size_t size, char* scratch) {
assert(IsSectorAligned(offset));
assert(IsSectorAligned(size));
assert(IsPageAligned(scratch));
size_t bytes_read = 0;
ssize_t status = -1;
while (bytes_read < size) {
status =
pread(fd, scratch + bytes_read, size - bytes_read, offset + bytes_read);
if (status <= 0) {
if (errno == EINTR) {
continue;
}
break;
}
bytes_read += status;
}
*data = Slice(scratch, bytes_read);
return status < 0 ? Status::IOError(strerror(errno)) : Status::OK();
}
Status ReadUnaligned(int fd, Slice* data, const uint64_t offset,
const size_t size, char* scratch) {
assert(scratch);
assert(!IsSectorAligned(offset) || !IsSectorAligned(size) ||
!IsPageAligned(scratch));
const uint64_t aligned_off = Lower(offset, kSectorSize);
const size_t aligned_size = Upper(size + (offset - aligned_off), kSectorSize);
auto aligned_scratch = NewAligned(aligned_size);
assert(aligned_scratch);
if (!aligned_scratch) {
return Status::IOError("Unable to allocate");
}
assert(IsSectorAligned(aligned_off));
assert(IsSectorAligned(aligned_size));
assert(aligned_scratch);
assert(IsPageAligned(aligned_scratch.get()));
assert(offset + size <= aligned_off + aligned_size);
Slice scratch_slice;
Status s = ReadAligned(fd, &scratch_slice, aligned_off, aligned_size,
reinterpret_cast<char*>(aligned_scratch.get()));
// copy data upto min(size, what was read)
memcpy(scratch, reinterpret_cast<char*>(aligned_scratch.get()) +
(offset % kSectorSize),
std::min(size, scratch_slice.size()));
*data = Slice(scratch, std::min(size, scratch_slice.size()));
return s;
}
Status DirectIORead(int fd, Slice* result, size_t off, size_t n,
char* scratch) {
if (IsSectorAligned(off) && IsSectorAligned(n) &&
IsPageAligned(result->data())) {
return ReadAligned(fd, result, off, n, scratch);
}
return ReadUnaligned(fd, result, off, n, scratch);
}
} // namespace
/* /*
* PosixSequentialFile * PosixSequentialFile
*/ */
@ -104,15 +211,37 @@ Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#endif #endif
} }
/*
* PosixDirectIOSequentialFile
*/
Status PosixDirectIOSequentialFile::Read(size_t n, Slice* result,
char* scratch) {
const size_t off = off_.fetch_add(n);
return DirectIORead(fd_, result, off, n, scratch);
}
Status PosixDirectIOSequentialFile::Skip(uint64_t n) {
off_ += n;
return Status::OK();
}
Status PosixDirectIOSequentialFile::InvalidateCache(size_t /*offset*/,
size_t /*length*/) {
return Status::OK();
}
/*
* PosixRandomAccessFile
*/
#if defined(OS_LINUX) #if defined(OS_LINUX)
namespace { size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length * 3) { if (max_size < kMaxVarint64Length * 3) {
return 0; return 0;
} }
struct stat buf; struct stat buf;
int result = fstat(fd, &buf); int result = fstat(fd, &buf);
assert(result != -1);
if (result == -1) { if (result == -1) {
return 0; return 0;
} }
@ -132,12 +261,10 @@ static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
assert(rid >= id); assert(rid >= id);
return static_cast<size_t>(rid - id); return static_cast<size_t>(rid - id);
} }
} // namespace
#endif #endif
#if defined(OS_MACOSX) #if defined(OS_MACOSX)
namespace { size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length * 3) { if (max_size < kMaxVarint64Length * 3) {
return 0; return 0;
} }
@ -155,9 +282,7 @@ static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
assert(rid >= id); assert(rid >= id);
return static_cast<size_t>(rid - id); return static_cast<size_t>(rid - id);
} }
} // namespace
#endif #endif
/* /*
* PosixRandomAccessFile * PosixRandomAccessFile
* *
@ -206,7 +331,7 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
#if defined(OS_LINUX) || defined(OS_MACOSX) #if defined(OS_LINUX) || defined(OS_MACOSX)
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(fd_, id, max_size); return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
} }
#endif #endif
@ -246,6 +371,15 @@ Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
#endif #endif
} }
/*
* PosixDirectIORandomAccessFile
*/
Status PosixDirectIORandomAccessFile::Read(uint64_t offset, size_t n,
Slice* result, char* scratch) const {
Status s = DirectIORead(fd_, result, offset, n, scratch);
return s;
}
/* /*
* PosixMmapReadableFile * PosixMmapReadableFile
* *
@ -663,10 +797,37 @@ Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
} }
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(fd_, id, max_size); return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
} }
#endif #endif
/*
* PosixDirectIOWritableFile
*/
Status PosixDirectIOWritableFile::Append(const Slice& data) {
assert(IsSectorAligned(data.size()) && IsPageAligned(data.data()));
if (!IsSectorAligned(data.size()) || !IsPageAligned(data.data())) {
return Status::IOError("Unaligned buffer for direct IO");
}
return PosixWritableFile::Append(data);
}
Status PosixDirectIOWritableFile::PositionedAppend(const Slice& data,
uint64_t offset) {
assert(IsSectorAligned(offset));
assert(IsSectorAligned(data.size()));
assert(IsPageAligned(data.data()));
if (!IsSectorAligned(offset) || !IsSectorAligned(data.size()) ||
!IsPageAligned(data.data())) {
return Status::IOError("offset or size is not aligned");
}
return PosixWritableFile::PositionedAppend(data, offset);
}
/*
* PosixDirectory
*/
PosixDirectory::~PosixDirectory() { close(fd_); } PosixDirectory::~PosixDirectory() { close(fd_); }
Status PosixDirectory::Fsync() { Status PosixDirectory::Fsync() {

View File

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once #pragma once
#include <unistd.h> #include <unistd.h>
#include <atomic>
#include "rocksdb/env.h" #include "rocksdb/env.h"
// For non linux platform, the following macros are used only as place // For non linux platform, the following macros are used only as place
@ -26,6 +27,11 @@ static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number)); return Status::IOError(context, strerror(err_number));
} }
class PosixHelper {
public:
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size);
};
class PosixSequentialFile : public SequentialFile { class PosixSequentialFile : public SequentialFile {
private: private:
std::string filename_; std::string filename_;
@ -43,8 +49,25 @@ class PosixSequentialFile : public SequentialFile {
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
}; };
class PosixRandomAccessFile : public RandomAccessFile { class PosixDirectIOSequentialFile : public SequentialFile {
public:
explicit PosixDirectIOSequentialFile(const std::string& filename, int fd)
: filename_(filename), fd_(fd) {}
virtual ~PosixDirectIOSequentialFile() {}
Status Read(size_t n, Slice* result, char* scratch) override;
Status Skip(uint64_t n) override;
Status InvalidateCache(size_t offset, size_t length) override;
private: private:
const std::string filename_;
int fd_ = -1;
std::atomic<size_t> off_{0}; // read offset
};
class PosixRandomAccessFile : public RandomAccessFile {
protected:
std::string filename_; std::string filename_;
int fd_; int fd_;
bool use_os_buffer_; bool use_os_buffer_;
@ -63,8 +86,23 @@ class PosixRandomAccessFile : public RandomAccessFile {
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
}; };
// Direct IO random access file direct IO implementation
class PosixDirectIORandomAccessFile : public PosixRandomAccessFile {
public:
explicit PosixDirectIORandomAccessFile(const std::string& filename, int fd)
: PosixRandomAccessFile(filename, fd, EnvOptions()) {}
virtual ~PosixDirectIORandomAccessFile() {}
Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override;
virtual void Hint(AccessPattern pattern) override {}
Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
class PosixWritableFile : public WritableFile { class PosixWritableFile : public WritableFile {
private: protected:
const std::string filename_; const std::string filename_;
int fd_; int fd_;
uint64_t filesize_; uint64_t filesize_;
@ -74,9 +112,9 @@ class PosixWritableFile : public WritableFile {
#endif #endif
public: public:
PosixWritableFile(const std::string& fname, int fd, explicit PosixWritableFile(const std::string& fname, int fd,
const EnvOptions& options); const EnvOptions& options);
~PosixWritableFile(); virtual ~PosixWritableFile();
// Means Close() will properly take care of truncate // Means Close() will properly take care of truncate
// and it does not need any additional information // and it does not need any additional information
@ -96,6 +134,22 @@ class PosixWritableFile : public WritableFile {
#endif #endif
}; };
class PosixDirectIOWritableFile : public PosixWritableFile {
public:
explicit PosixDirectIOWritableFile(const std::string& filename, int fd)
: PosixWritableFile(filename, fd, EnvOptions()) {}
virtual ~PosixDirectIOWritableFile() {}
bool UseOSBuffer() const override { return false; }
size_t GetRequiredBufferAlignment() const override { return 4 * 1024; }
Status Append(const Slice& data) override;
Status PositionedAppend(const Slice& data, uint64_t offset) override;
bool UseDirectIO() const override { return true; }
Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
};
class PosixMmapReadableFile : public RandomAccessFile { class PosixMmapReadableFile : public RandomAccessFile {
private: private:
int fd_; int fd_;