direct io write support
Summary: rocksdb direct io support ``` [gzh@dev11575.prn2 ~/rocksdb] ./db_bench -benchmarks=fillseq --num=1000000 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 5.0 Date: Wed Nov 23 13:17:43 2016 CPU: 40 * Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz CPUCache: 25600 KB Keys: 16 bytes each Values: 100 bytes each (50 bytes after compression) Entries: 1000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 110.6 MB (estimated) FileSize: 62.9 MB (estimated) Write rate: 0 bytes/second Compression: Snappy Memtablerep: skip_list Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags DB path: [/tmp/rocksdbtest-112628/dbbench] fillseq : 4.393 micros/op 227639 ops/sec; 25.2 MB/s [gzh@dev11575.prn2 ~/roc Closes https://github.com/facebook/rocksdb/pull/1564 Differential Revision: D4241093 Pulled By: lightmark fbshipit-source-id: 98c29e3
This commit is contained in:
parent
989e644ed8
commit
972f96b3fb
9
db/c.cc
9
db/c.cc
@ -1719,9 +1719,14 @@ void rocksdb_options_set_manifest_preallocation_size(
|
|||||||
void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt,
|
void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt,
|
||||||
unsigned char v) {}
|
unsigned char v) {}
|
||||||
|
|
||||||
void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt,
|
void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
|
||||||
unsigned char v) {
|
unsigned char v) {
|
||||||
opt->rep.allow_os_buffer = v;
|
opt->rep.use_direct_reads = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
void rocksdb_options_set_use_direct_writes(rocksdb_options_t* opt,
|
||||||
|
unsigned char v) {
|
||||||
|
opt->rep.use_direct_writes = v;
|
||||||
}
|
}
|
||||||
|
|
||||||
void rocksdb_options_set_allow_mmap_reads(
|
void rocksdb_options_set_allow_mmap_reads(
|
||||||
|
@ -255,11 +255,17 @@ static Status ValidateOptions(
|
|||||||
"More than four DB paths are not supported yet. ");
|
"More than four DB paths are not supported yet. ");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (db_options.allow_mmap_reads && !db_options.allow_os_buffer) {
|
if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
|
||||||
// Protect against assert in PosixMMapReadableFile constructor
|
// Protect against assert in PosixMMapReadableFile constructor
|
||||||
return Status::NotSupported(
|
return Status::NotSupported(
|
||||||
"If memory mapped reads (allow_mmap_reads) are enabled "
|
"If memory mapped reads (allow_mmap_reads) are enabled "
|
||||||
"then os caching (allow_os_buffer) must also be enabled. ");
|
"then direct I/O reads (use_direct_reads) must be disabled. ");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (db_options.allow_mmap_writes && db_options.use_direct_writes) {
|
||||||
|
return Status::NotSupported(
|
||||||
|
"If memory mapped writes (allow_mmap_writes) are enabled "
|
||||||
|
"then direct I/O writes (use_direct_writes) must be disabled. ");
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
@ -3396,19 +3396,21 @@ TEST_F(DBTest, TableOptionsSanitizeTest) {
|
|||||||
TEST_F(DBTest, MmapAndBufferOptions) {
|
TEST_F(DBTest, MmapAndBufferOptions) {
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
|
|
||||||
options.allow_os_buffer = false;
|
options.use_direct_reads = true;
|
||||||
options.allow_mmap_reads = true;
|
options.allow_mmap_reads = true;
|
||||||
ASSERT_NOK(TryReopen(options));
|
ASSERT_NOK(TryReopen(options));
|
||||||
|
|
||||||
// All other combinations are acceptable
|
// All other combinations are acceptable
|
||||||
options.allow_os_buffer = true;
|
options.use_direct_reads = false;
|
||||||
ASSERT_OK(TryReopen(options));
|
ASSERT_OK(TryReopen(options));
|
||||||
|
|
||||||
options.allow_os_buffer = false;
|
if (IsDirectIOSupported()) {
|
||||||
|
options.use_direct_reads = true;
|
||||||
options.allow_mmap_reads = false;
|
options.allow_mmap_reads = false;
|
||||||
ASSERT_OK(TryReopen(options));
|
ASSERT_OK(TryReopen(options));
|
||||||
|
}
|
||||||
|
|
||||||
options.allow_os_buffer = true;
|
options.use_direct_reads = false;
|
||||||
ASSERT_OK(TryReopen(options));
|
ASSERT_OK(TryReopen(options));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -478,6 +478,22 @@ Status DBTestBase::TryReopen(const Options& options) {
|
|||||||
return DB::Open(options, dbname_, &db_);
|
return DB::Open(options, dbname_, &db_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool DBTestBase::IsDirectIOSupported() {
|
||||||
|
EnvOptions env_options;
|
||||||
|
env_options.use_mmap_writes = false;
|
||||||
|
env_options.use_direct_writes = true;
|
||||||
|
std::string tmp = TempFileName(dbname_, 999);
|
||||||
|
Status s;
|
||||||
|
{
|
||||||
|
unique_ptr<WritableFile> file;
|
||||||
|
s = env_->NewWritableFile(tmp, &file, env_options);
|
||||||
|
}
|
||||||
|
if (s.ok()) {
|
||||||
|
s = env_->DeleteFile(tmp);
|
||||||
|
}
|
||||||
|
return s.ok();
|
||||||
|
}
|
||||||
|
|
||||||
Status DBTestBase::Flush(int cf) {
|
Status DBTestBase::Flush(int cf) {
|
||||||
if (cf == 0) {
|
if (cf == 0) {
|
||||||
return db_->Flush(FlushOptions());
|
return db_->Flush(FlushOptions());
|
||||||
|
@ -692,6 +692,8 @@ class DBTestBase : public testing::Test {
|
|||||||
|
|
||||||
Status TryReopen(const Options& options);
|
Status TryReopen(const Options& options);
|
||||||
|
|
||||||
|
bool IsDirectIOSupported();
|
||||||
|
|
||||||
Status Flush(int cf = 0);
|
Status Flush(int cf = 0);
|
||||||
|
|
||||||
Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
|
Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
|
||||||
|
@ -330,7 +330,7 @@ int main(int argc, char** argv) {
|
|||||||
options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
|
options.compaction_style = rocksdb::CompactionStyle::kCompactionStyleNone;
|
||||||
options.level0_slowdown_writes_trigger = 99999;
|
options.level0_slowdown_writes_trigger = 99999;
|
||||||
options.level0_stop_writes_trigger = 99999;
|
options.level0_stop_writes_trigger = 99999;
|
||||||
options.allow_os_buffer = false;
|
options.use_direct_writes = true;
|
||||||
options.write_buffer_size = FLAGS_memtable_size;
|
options.write_buffer_size = FLAGS_memtable_size;
|
||||||
rocksdb::BlockBasedTableOptions table_options;
|
rocksdb::BlockBasedTableOptions table_options;
|
||||||
table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);
|
table_options.block_cache = rocksdb::NewLRUCache(FLAGS_block_cache_size);
|
||||||
|
@ -73,11 +73,12 @@
|
|||||||
error_if_exists=false
|
error_if_exists=false
|
||||||
recycle_log_file_num=0
|
recycle_log_file_num=0
|
||||||
skip_log_error_on_recovery=false
|
skip_log_error_on_recovery=false
|
||||||
allow_mmap_reads=false
|
|
||||||
allow_os_buffer=true
|
|
||||||
db_log_dir=
|
db_log_dir=
|
||||||
new_table_reader_for_compaction_inputs=true
|
new_table_reader_for_compaction_inputs=true
|
||||||
|
allow_mmap_reads=false
|
||||||
allow_mmap_writes=false
|
allow_mmap_writes=false
|
||||||
|
use_direct_reads=false
|
||||||
|
use_direct_writes=false
|
||||||
|
|
||||||
|
|
||||||
[CFOptions "default"]
|
[CFOptions "default"]
|
||||||
|
@ -657,12 +657,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
|
|||||||
extern ROCKSDB_LIBRARY_API void
|
extern ROCKSDB_LIBRARY_API void
|
||||||
rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*,
|
rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*,
|
||||||
unsigned char);
|
unsigned char);
|
||||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_os_buffer(
|
|
||||||
rocksdb_options_t*, unsigned char);
|
|
||||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
|
||||||
rocksdb_options_t*, unsigned char);
|
rocksdb_options_t*, unsigned char);
|
||||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
|
||||||
rocksdb_options_t*, unsigned char);
|
rocksdb_options_t*, unsigned char);
|
||||||
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_reads(
|
||||||
|
rocksdb_options_t*, unsigned char);
|
||||||
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_direct_writes(
|
||||||
|
rocksdb_options_t*, unsigned char);
|
||||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
|
||||||
rocksdb_options_t*, unsigned char);
|
rocksdb_options_t*, unsigned char);
|
||||||
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery(
|
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery(
|
||||||
|
@ -61,9 +61,6 @@ struct EnvOptions {
|
|||||||
// construct from Options
|
// construct from Options
|
||||||
explicit EnvOptions(const DBOptions& options);
|
explicit EnvOptions(const DBOptions& options);
|
||||||
|
|
||||||
// If true, then allow caching of data in environment buffers
|
|
||||||
bool use_os_buffer = true;
|
|
||||||
|
|
||||||
// If true, then use mmap to read data
|
// If true, then use mmap to read data
|
||||||
bool use_mmap_reads = false;
|
bool use_mmap_reads = false;
|
||||||
|
|
||||||
@ -373,8 +370,8 @@ class Env {
|
|||||||
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
|
// OptimizeForManifestWrite will create a new EnvOptions object that is a copy
|
||||||
// of the EnvOptions in the parameters, but is optimized for writing manifest
|
// of the EnvOptions in the parameters, but is optimized for writing manifest
|
||||||
// files. Default implementation returns the copy of the same object.
|
// files. Default implementation returns the copy of the same object.
|
||||||
virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options)
|
virtual EnvOptions OptimizeForManifestWrite(
|
||||||
const;
|
const EnvOptions& env_options) const;
|
||||||
|
|
||||||
// Returns the status of all threads that belong to the current Env.
|
// Returns the status of all threads that belong to the current Env.
|
||||||
virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) {
|
virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list) {
|
||||||
@ -512,17 +509,15 @@ class WritableFile {
|
|||||||
}
|
}
|
||||||
virtual ~WritableFile();
|
virtual ~WritableFile();
|
||||||
|
|
||||||
// Indicates if the class makes use of unbuffered I/O
|
// Indicates if the class makes use of direct IO
|
||||||
// If false you must pass aligned buffer to Write()
|
// If true you must pass aligned buffer to Write()
|
||||||
virtual bool UseOSBuffer() const {
|
virtual bool UseDirectIO() const { return false; }
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t c_DefaultPageSize = 4 * 1024;
|
const size_t c_DefaultPageSize = 4 * 1024;
|
||||||
|
|
||||||
// Use the returned alignment value to allocate
|
// Use the returned alignment value to allocate
|
||||||
// aligned buffer for Write() when UseOSBuffer()
|
// aligned buffer for Write() when UseDirectIO()
|
||||||
// returns false
|
// returns true
|
||||||
virtual size_t GetRequiredBufferAlignment() const {
|
virtual size_t GetRequiredBufferAlignment() const {
|
||||||
return c_DefaultPageSize;
|
return c_DefaultPageSize;
|
||||||
}
|
}
|
||||||
@ -538,7 +533,7 @@ class WritableFile {
|
|||||||
// the sector. The implementation thus needs to also rewrite the last
|
// the sector. The implementation thus needs to also rewrite the last
|
||||||
// partial sector.
|
// partial sector.
|
||||||
// Note: PositionAppend does not guarantee moving the file offset after the
|
// Note: PositionAppend does not guarantee moving the file offset after the
|
||||||
// write. A WriteabelFile object must support either Append or
|
// write. A WritableFile object must support either Append or
|
||||||
// PositionedAppend, so the users cannot mix the two.
|
// PositionedAppend, so the users cannot mix the two.
|
||||||
//
|
//
|
||||||
// PositionedAppend() can only happen on the page/sector boundaries. For that
|
// PositionedAppend() can only happen on the page/sector boundaries. For that
|
||||||
@ -583,10 +578,6 @@ class WritableFile {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Indicates the upper layers if the current WritableFile implementation
|
|
||||||
// uses direct IO.
|
|
||||||
virtual bool UseDirectIO() const { return false; }
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Change the priority in rate limiter if rate limiting is enabled.
|
* Change the priority in rate limiter if rate limiting is enabled.
|
||||||
* If rate limiting is not enabled, this call has no effect.
|
* If rate limiting is not enabled, this call has no effect.
|
||||||
@ -695,17 +686,14 @@ class RandomRWFile {
|
|||||||
RandomRWFile() {}
|
RandomRWFile() {}
|
||||||
virtual ~RandomRWFile() {}
|
virtual ~RandomRWFile() {}
|
||||||
|
|
||||||
// Indicates if the class makes use of unbuffered I/O
|
// Indicates if the class makes use of direct I/O
|
||||||
// If false you must pass aligned buffer to Write()
|
// If false you must pass aligned buffer to Write()
|
||||||
virtual bool UseOSBuffer() const {
|
virtual bool UseDirectIO() const { return false; }
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t c_DefaultPageSize = 4 * 1024;
|
const size_t c_DefaultPageSize = 4 * 1024;
|
||||||
|
|
||||||
// Use the returned alignment value to allocate
|
// Use the returned alignment value to allocate aligned
|
||||||
// aligned buffer for Write() when UseOSBuffer()
|
// buffer for Write() when UseDirectIO() returns true
|
||||||
// returns false
|
|
||||||
virtual size_t GetRequiredBufferAlignment() const {
|
virtual size_t GetRequiredBufferAlignment() const {
|
||||||
return c_DefaultPageSize;
|
return c_DefaultPageSize;
|
||||||
}
|
}
|
||||||
@ -722,7 +710,7 @@ class RandomRWFile {
|
|||||||
virtual void EnableReadAhead() {}
|
virtual void EnableReadAhead() {}
|
||||||
|
|
||||||
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
|
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
|
||||||
// Pass aligned buffer when UseOSBuffer() returns false.
|
// Pass aligned buffer when UseDirectIO() returns true.
|
||||||
virtual Status Write(uint64_t offset, const Slice& data) = 0;
|
virtual Status Write(uint64_t offset, const Slice& data) = 0;
|
||||||
|
|
||||||
// Read up to `n` bytes starting from offset `offset` and store them in
|
// Read up to `n` bytes starting from offset `offset` and store them in
|
||||||
|
@ -1100,26 +1100,6 @@ struct DBOptions {
|
|||||||
// large amounts of data (such as xfs's allocsize option).
|
// large amounts of data (such as xfs's allocsize option).
|
||||||
size_t manifest_preallocation_size;
|
size_t manifest_preallocation_size;
|
||||||
|
|
||||||
// Hint the OS that it should not buffer disk I/O. Enabling this
|
|
||||||
// parameter may improve performance but increases pressure on the
|
|
||||||
// system cache.
|
|
||||||
//
|
|
||||||
// The exact behavior of this parameter is platform dependent.
|
|
||||||
//
|
|
||||||
// On POSIX systems, after RocksDB reads data from disk it will
|
|
||||||
// mark the pages as "unneeded". The operating system may - or may not
|
|
||||||
// - evict these pages from memory, reducing pressure on the system
|
|
||||||
// cache. If the disk block is requested again this can result in
|
|
||||||
// additional disk I/O.
|
|
||||||
//
|
|
||||||
// On WINDOWS system, files will be opened in "unbuffered I/O" mode
|
|
||||||
// which means that data read from the disk will not be cached or
|
|
||||||
// bufferized. The hardware buffer of the devices may however still
|
|
||||||
// be used. Memory mapped files are not impacted by this parameter.
|
|
||||||
//
|
|
||||||
// Default: true
|
|
||||||
bool allow_os_buffer;
|
|
||||||
|
|
||||||
// Allow the OS to mmap file for reading sst tables. Default: false
|
// Allow the OS to mmap file for reading sst tables. Default: false
|
||||||
bool allow_mmap_reads;
|
bool allow_mmap_reads;
|
||||||
|
|
||||||
@ -1128,10 +1108,22 @@ struct DBOptions {
|
|||||||
// Default: false
|
// Default: false
|
||||||
bool allow_mmap_writes;
|
bool allow_mmap_writes;
|
||||||
|
|
||||||
|
// Enable direct I/O mode for read/write
|
||||||
|
// they may or may not improve performance depending on the use case
|
||||||
|
//
|
||||||
|
// Files will be opened in "direct I/O" mode
|
||||||
|
// which means that data r/w from the disk will not be cached or
|
||||||
|
// bufferized. The hardware buffer of the devices may however still
|
||||||
|
// be used. Memory mapped files are not impacted by these parameters.
|
||||||
|
|
||||||
// Use O_DIRECT for reading file
|
// Use O_DIRECT for reading file
|
||||||
// Default: false
|
// Default: false
|
||||||
bool use_direct_reads;
|
bool use_direct_reads;
|
||||||
|
|
||||||
|
// Use O_DIRECT for writing file
|
||||||
|
// Default: false
|
||||||
|
bool use_direct_writes;
|
||||||
|
|
||||||
// If false, fallocate() calls are bypassed
|
// If false, fallocate() calls are bypassed
|
||||||
bool allow_fallocate;
|
bool allow_fallocate;
|
||||||
|
|
||||||
|
@ -157,9 +157,9 @@ struct BlockBasedTableOptions {
|
|||||||
// a SstTable. Instead, buffer in WritableFileWriter will take
|
// a SstTable. Instead, buffer in WritableFileWriter will take
|
||||||
// care of the flushing when it is full.
|
// care of the flushing when it is full.
|
||||||
//
|
//
|
||||||
// On Windows, this option helps a lot when unbuffered I/O
|
// This option helps a lot when direct I/O writes
|
||||||
// (allow_os_buffer = false) is used, since it avoids small
|
// (use_direct_writes = true) is used, since it avoids small
|
||||||
// unbuffered disk write.
|
// direct disk write.
|
||||||
//
|
//
|
||||||
// User may also adjust writable_file_max_buffer_size to optimize disk I/O
|
// User may also adjust writable_file_max_buffer_size to optimize disk I/O
|
||||||
// size.
|
// size.
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
// vim: ts=8 sw=2 smarttab
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H
|
#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H
|
||||||
#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H
|
#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H
|
||||||
|
|
||||||
@ -22,10 +24,9 @@ public:
|
|||||||
// not exist, returns a non-OK status.
|
// not exist, returns a non-OK status.
|
||||||
//
|
//
|
||||||
// The returned file will only be accessed by one thread at a time.
|
// The returned file will only be accessed by one thread at a time.
|
||||||
Status NewSequentialFile(
|
Status NewSequentialFile(const std::string& fname,
|
||||||
const std::string& fname,
|
|
||||||
std::unique_ptr<SequentialFile>* result,
|
std::unique_ptr<SequentialFile>* result,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options) override;
|
||||||
|
|
||||||
// Create a brand new random access read-only file with the
|
// Create a brand new random access read-only file with the
|
||||||
// specified name. On success, stores a pointer to the new file in
|
// specified name. On success, stores a pointer to the new file in
|
||||||
@ -34,10 +35,9 @@ public:
|
|||||||
// status.
|
// status.
|
||||||
//
|
//
|
||||||
// The returned file may be concurrently accessed by multiple threads.
|
// The returned file may be concurrently accessed by multiple threads.
|
||||||
Status NewRandomAccessFile(
|
Status NewRandomAccessFile(const std::string& fname,
|
||||||
const std::string& fname,
|
|
||||||
std::unique_ptr<RandomAccessFile>* result,
|
std::unique_ptr<RandomAccessFile>* result,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options) override;
|
||||||
|
|
||||||
// Create an object that writes to a new file with the specified
|
// Create an object that writes to a new file with the specified
|
||||||
// name. Deletes any existing file with the same name and creates a
|
// name. Deletes any existing file with the same name and creates a
|
||||||
@ -46,17 +46,15 @@ public:
|
|||||||
// returns non-OK.
|
// returns non-OK.
|
||||||
//
|
//
|
||||||
// The returned file will only be accessed by one thread at a time.
|
// The returned file will only be accessed by one thread at a time.
|
||||||
Status NewWritableFile(
|
Status NewWritableFile(const std::string& fname,
|
||||||
const std::string& fname,
|
|
||||||
std::unique_ptr<WritableFile>* result,
|
std::unique_ptr<WritableFile>* result,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options) override;
|
||||||
|
|
||||||
// Reuse an existing file by renaming it and opening it as writable.
|
// Reuse an existing file by renaming it and opening it as writable.
|
||||||
Status ReuseWritableFile(
|
Status ReuseWritableFile(const std::string& fname,
|
||||||
const std::string& fname,
|
|
||||||
const std::string& old_fname,
|
const std::string& old_fname,
|
||||||
std::unique_ptr<WritableFile>* result,
|
std::unique_ptr<WritableFile>* result,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options) override;
|
||||||
|
|
||||||
// Create an object that represents a directory. Will fail if directory
|
// Create an object that represents a directory. Will fail if directory
|
||||||
// doesn't exist. If the directory exists, it will open the directory
|
// doesn't exist. If the directory exists, it will open the directory
|
||||||
@ -65,47 +63,44 @@ public:
|
|||||||
// On success, stores a pointer to the new Directory in
|
// On success, stores a pointer to the new Directory in
|
||||||
// *result and returns OK. On failure stores nullptr in *result and
|
// *result and returns OK. On failure stores nullptr in *result and
|
||||||
// returns non-OK.
|
// returns non-OK.
|
||||||
Status NewDirectory(
|
Status NewDirectory(const std::string& name,
|
||||||
const std::string& name,
|
std::unique_ptr<Directory>* result) override;
|
||||||
std::unique_ptr<Directory>* result);
|
|
||||||
|
|
||||||
// Returns OK if the named file exists.
|
// Returns OK if the named file exists.
|
||||||
// NotFound if the named file does not exist,
|
// NotFound if the named file does not exist,
|
||||||
// the calling process does not have permission to determine
|
// the calling process does not have permission to determine
|
||||||
// whether this file exists, or if the path is invalid.
|
// whether this file exists, or if the path is invalid.
|
||||||
// IOError if an IO Error was encountered
|
// IOError if an IO Error was encountered
|
||||||
Status FileExists(const std::string& fname);
|
Status FileExists(const std::string& fname) overrdie;
|
||||||
|
|
||||||
// Store in *result the names of the children of the specified directory.
|
// Store in *result the names of the children of the specified directory.
|
||||||
// The names are relative to "dir".
|
// The names are relative to "dir".
|
||||||
// Original contents of *results are dropped.
|
// Original contents of *results are dropped.
|
||||||
Status GetChildren(const std::string& dir,
|
Status GetChildren(const std::string& dir, std::vector<std::string>* result);
|
||||||
std::vector<std::string>* result);
|
|
||||||
|
|
||||||
// Delete the named file.
|
// Delete the named file.
|
||||||
Status DeleteFile(const std::string& fname);
|
Status DeleteFile(const std::string& fname) override;
|
||||||
|
|
||||||
// Create the specified directory. Returns error if directory exists.
|
// Create the specified directory. Returns error if directory exists.
|
||||||
Status CreateDir(const std::string& dirname);
|
Status CreateDir(const std::string& dirname) override;
|
||||||
|
|
||||||
// Creates directory if missing. Return Ok if it exists, or successful in
|
// Creates directory if missing. Return Ok if it exists, or successful in
|
||||||
// Creating.
|
// Creating.
|
||||||
Status CreateDirIfMissing(const std::string& dirname);
|
Status CreateDirIfMissing(const std::string& dirname) override;
|
||||||
|
|
||||||
// Delete the specified directory.
|
// Delete the specified directory.
|
||||||
Status DeleteDir(const std::string& dirname);
|
Status DeleteDir(const std::string& dirname) override;
|
||||||
|
|
||||||
// Store the size of fname in *file_size.
|
// Store the size of fname in *file_size.
|
||||||
Status GetFileSize(const std::string& fname, uint64_t* file_size);
|
Status GetFileSize(const std::string& fname, uint64_t* file_size) override;
|
||||||
|
|
||||||
// Store the last modification time of fname in *file_mtime.
|
// Store the last modification time of fname in *file_mtime.
|
||||||
Status GetFileModificationTime(const std::string& fname,
|
Status GetFileModificationTime(const std::string& fname,
|
||||||
uint64_t* file_mtime);
|
uint64_t* file_mtime) override;
|
||||||
// Rename file src to target.
|
// Rename file src to target.
|
||||||
Status RenameFile(const std::string& src,
|
Status RenameFile(const std::string& src, const std::string& target) override;
|
||||||
const std::string& target);
|
|
||||||
// Hard Link file src to target.
|
// Hard Link file src to target.
|
||||||
Status LinkFile(const std::string& src, const std::string& target);
|
Status LinkFile(const std::string& src, const std::string& target) override;
|
||||||
|
|
||||||
// Lock the specified file. Used to prevent concurrent access to
|
// Lock the specified file. Used to prevent concurrent access to
|
||||||
// the same db by multiple processes. On failure, stores nullptr in
|
// the same db by multiple processes. On failure, stores nullptr in
|
||||||
@ -129,8 +124,7 @@ public:
|
|||||||
Status UnlockFile(FileLock* lock);
|
Status UnlockFile(FileLock* lock);
|
||||||
|
|
||||||
// Get full directory name for this db.
|
// Get full directory name for this db.
|
||||||
Status GetAbsolutePath(const std::string& db_path,
|
Status GetAbsolutePath(const std::string& db_path, std::string* output_path);
|
||||||
std::string* output_path);
|
|
||||||
|
|
||||||
// Generate unique id
|
// Generate unique id
|
||||||
std::string GenerateUniqueId();
|
std::string GenerateUniqueId();
|
||||||
@ -142,23 +136,21 @@ public:
|
|||||||
const std::string& config_path,
|
const std::string& config_path,
|
||||||
const std::string& db_pool);
|
const std::string& db_pool);
|
||||||
|
|
||||||
explicit EnvLibrados(const std::string& client_name, // first 3 parameters are for RADOS client init
|
explicit EnvLibrados(
|
||||||
const std::string& cluster_name,
|
const std::string& client_name, // first 3 parameters are
|
||||||
const uint64_t flags,
|
// for RADOS client init
|
||||||
const std::string& db_name,
|
const std::string& cluster_name, const uint64_t flags,
|
||||||
const std::string& config_path,
|
const std::string& db_name, const std::string& config_path,
|
||||||
const std::string& db_pool,
|
const std::string& db_pool, const std::string& wal_dir,
|
||||||
const std::string& wal_dir,
|
const std::string& wal_pool, const uint64_t write_buffer_size);
|
||||||
const std::string& wal_pool,
|
~EnvLibrados() { _rados.shutdown(); }
|
||||||
const uint64_t write_buffer_size);
|
|
||||||
~EnvLibrados() {
|
|
||||||
_rados.shutdown();
|
|
||||||
}
|
|
||||||
private:
|
private:
|
||||||
std::string _client_name;
|
std::string _client_name;
|
||||||
std::string _cluster_name;
|
std::string _cluster_name;
|
||||||
uint64_t _flags;
|
uint64_t _flags;
|
||||||
std::string _db_name; // get from user, readable string; Also used as db_id for db metadata
|
std::string _db_name; // get from user, readable string; Also used as db_id
|
||||||
|
// for db metadata
|
||||||
std::string _config_path;
|
std::string _config_path;
|
||||||
librados::Rados _rados; // RADOS client
|
librados::Rados _rados; // RADOS client
|
||||||
std::string _db_pool_name;
|
std::string _db_pool_name;
|
||||||
@ -175,10 +167,8 @@ private:
|
|||||||
Status _RenameFid(const std::string& old_fname, const std::string& new_fname);
|
Status _RenameFid(const std::string& old_fname, const std::string& new_fname);
|
||||||
Status _AddFid(const std::string& fname, const std::string& fid);
|
Status _AddFid(const std::string& fname, const std::string& fid);
|
||||||
Status _DelFid(const std::string& fname);
|
Status _DelFid(const std::string& fname);
|
||||||
Status _GetSubFnames(
|
Status _GetSubFnames(const std::string& dirname,
|
||||||
const std::string& dirname,
|
std::vector<std::string>* result);
|
||||||
std::vector<std::string> * result
|
|
||||||
);
|
|
||||||
librados::IoCtx* _GetIoctx(const std::string& prefix);
|
librados::IoCtx* _GetIoctx(const std::string& prefix);
|
||||||
friend class LibradosWritableFile;
|
friend class LibradosWritableFile;
|
||||||
};
|
};
|
||||||
|
@ -15,10 +15,9 @@
|
|||||||
// semantics and behavior are correct (in that they match that of an
|
// semantics and behavior are correct (in that they match that of an
|
||||||
// existing, stable Env, like the default POSIX one).
|
// existing, stable Env, like the default POSIX one).
|
||||||
|
|
||||||
#ifndef ROCKSDB_LITE
|
#pragma once
|
||||||
|
|
||||||
#ifndef STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_
|
#ifndef ROCKSDB_LITE
|
||||||
#define STORAGE_ROCKSDB_INCLUDE_UTLIITIES_ENVMIRROR_H_
|
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -174,6 +173,4 @@ class EnvMirror : public EnvWrapper {
|
|||||||
|
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
#endif // STORAGE_ROCKSDB_INCLUDE_UTILITIES_ENVMIRROR_H_
|
|
||||||
|
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
|
@ -1417,8 +1417,14 @@ public class DbBenchmark {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
/* TODO(yhchiang): enable the following
|
/* TODO(yhchiang): enable the following
|
||||||
bufferedio(rocksdb::EnvOptions().use_os_buffer,
|
direct_reads(rocksdb::EnvOptions().use_direct_reads,
|
||||||
"Allow buffered io using OS buffers.") {
|
"Allow direct I/O reads.") {
|
||||||
|
@Override public Object parseValue(String value) {
|
||||||
|
return parseBoolean(value);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
direct_writes(rocksdb::EnvOptions().use_direct_reads,
|
||||||
|
"Allow direct I/O reads.") {
|
||||||
@Override public Object parseValue(String value) {
|
@Override public Object parseValue(String value) {
|
||||||
return parseBoolean(value);
|
return parseBoolean(value);
|
||||||
}
|
}
|
||||||
|
@ -49,23 +49,43 @@ void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *env, jobject jobj,
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_EnvOptions
|
* Class: org_rocksdb_EnvOptions
|
||||||
* Method: setUseOsBuffer
|
* Method: setUseDirectReads
|
||||||
* Signature: (JZ)V
|
* Signature: (JZ)V
|
||||||
*/
|
*/
|
||||||
void Java_org_rocksdb_EnvOptions_setUseOsBuffer(JNIEnv *env, jobject jobj,
|
void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *env, jobject jobj,
|
||||||
jlong jhandle,
|
jlong jhandle,
|
||||||
jboolean use_os_buffer) {
|
jboolean use_direct_reads) {
|
||||||
ENV_OPTIONS_SET_BOOL(jhandle, use_os_buffer);
|
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_EnvOptions
|
* Class: org_rocksdb_EnvOptions
|
||||||
* Method: useOsBuffer
|
* Method: useDirectReads
|
||||||
* Signature: (J)Z
|
* Signature: (J)Z
|
||||||
*/
|
*/
|
||||||
jboolean Java_org_rocksdb_EnvOptions_useOsBuffer(JNIEnv *env, jobject jobj,
|
jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *env, jobject jobj,
|
||||||
jlong jhandle) {
|
jlong jhandle) {
|
||||||
return ENV_OPTIONS_GET(jhandle, use_os_buffer);
|
return ENV_OPTIONS_GET(jhandle, use_direct_reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_EnvOptions
|
||||||
|
* Method: setUseDirectWrites
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
|
||||||
|
JNIEnv *env, jobject jobj, jlong jhandle, jboolean use_direct_writes) {
|
||||||
|
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_EnvOptions
|
||||||
|
* Method: useDirectWrites
|
||||||
|
* Signature: (J)Z
|
||||||
|
*/
|
||||||
|
jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *env, jobject jobj,
|
||||||
|
jlong jhandle) {
|
||||||
|
return ENV_OPTIONS_GET(jhandle, use_direct_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -110,47 +130,6 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *env, jobject jobj,
|
|||||||
return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
|
return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_EnvOptions
|
|
||||||
* Method: setUseDirectReads
|
|
||||||
* Signature: (JZ)V
|
|
||||||
*/
|
|
||||||
void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *env, jobject jobj,
|
|
||||||
jlong jhandle,
|
|
||||||
jboolean use_direct_reads) {
|
|
||||||
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_EnvOptions
|
|
||||||
* Method: useDirectReads
|
|
||||||
* Signature: (J)Z
|
|
||||||
*/
|
|
||||||
jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *env, jobject jobj,
|
|
||||||
jlong jhandle) {
|
|
||||||
return ENV_OPTIONS_GET(jhandle, use_direct_reads);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_EnvOptions
|
|
||||||
* Method: setUseDirectWrites
|
|
||||||
* Signature: (JZ)V
|
|
||||||
*/
|
|
||||||
void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
|
|
||||||
JNIEnv *env, jobject jobj, jlong jhandle, jboolean use_direct_writes) {
|
|
||||||
ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_EnvOptions
|
|
||||||
* Method: useDirectWrites
|
|
||||||
* Signature: (J)Z
|
|
||||||
*/
|
|
||||||
jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *env, jobject jobj,
|
|
||||||
jlong jhandle) {
|
|
||||||
return ENV_OPTIONS_GET(jhandle, use_direct_writes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_EnvOptions
|
* Class: org_rocksdb_EnvOptions
|
||||||
* Method: setAllowFallocate
|
* Method: setAllowFallocate
|
||||||
|
@ -870,27 +870,6 @@ void Java_org_rocksdb_Options_setManifestPreallocationSize(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_Options
|
|
||||||
* Method: allowOsBuffer
|
|
||||||
* Signature: (J)Z
|
|
||||||
*/
|
|
||||||
jboolean Java_org_rocksdb_Options_allowOsBuffer(
|
|
||||||
JNIEnv* env, jobject jobj, jlong jhandle) {
|
|
||||||
return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Class: org_rocksdb_Options
|
|
||||||
* Method: setAllowOsBuffer
|
|
||||||
* Signature: (JZ)V
|
|
||||||
*/
|
|
||||||
void Java_org_rocksdb_Options_setAllowOsBuffer(
|
|
||||||
JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
|
|
||||||
reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer =
|
|
||||||
static_cast<bool>(allow_os_buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Method: setTableFactory
|
* Method: setTableFactory
|
||||||
* Signature: (JJ)V
|
* Signature: (JJ)V
|
||||||
@ -943,6 +922,50 @@ void Java_org_rocksdb_Options_setAllowMmapWrites(
|
|||||||
static_cast<bool>(allow_mmap_writes);
|
static_cast<bool>(allow_mmap_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: useDirectReads
|
||||||
|
* Signature: (J)Z
|
||||||
|
*/
|
||||||
|
jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle) {
|
||||||
|
return reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_reads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: setUseDirectReads
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle,
|
||||||
|
jboolean use_direct_reads) {
|
||||||
|
reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_reads =
|
||||||
|
static_cast<bool>(use_direct_reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: useDirectWrites
|
||||||
|
* Signature: (J)Z
|
||||||
|
*/
|
||||||
|
jboolean Java_org_rocksdb_Options_useDirectWrites(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle) {
|
||||||
|
return reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_writes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_Options
|
||||||
|
* Method: setUseDirectReads
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_Options_setUseDirectWrites(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle,
|
||||||
|
jboolean use_direct_writes) {
|
||||||
|
reinterpret_cast<rocksdb::Options*>(jhandle)->use_direct_writes =
|
||||||
|
static_cast<bool>(use_direct_writes);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_Options
|
* Class: org_rocksdb_Options
|
||||||
* Method: isFdCloseOnExec
|
* Method: isFdCloseOnExec
|
||||||
@ -4144,23 +4167,46 @@ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_DBOptions
|
* Class: org_rocksdb_DBOptions
|
||||||
* Method: setAllowOsBuffer
|
* Method: useDirectReads
|
||||||
* Signature: (JZ)V
|
* Signature: (J)Z
|
||||||
*/
|
*/
|
||||||
void Java_org_rocksdb_DBOptions_setAllowOsBuffer(
|
jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv* env, jobject jobj,
|
||||||
JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
|
jlong jhandle) {
|
||||||
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer =
|
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads;
|
||||||
static_cast<bool>(allow_os_buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Class: org_rocksdb_DBOptions
|
* Class: org_rocksdb_DBOptions
|
||||||
* Method: allowOsBuffer
|
* Method: setUseDirectReads
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle,
|
||||||
|
jboolean use_direct_reads) {
|
||||||
|
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads =
|
||||||
|
static_cast<bool>(use_direct_reads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_DBOptions
|
||||||
|
* Method: useDirectWrites
|
||||||
* Signature: (J)Z
|
* Signature: (J)Z
|
||||||
*/
|
*/
|
||||||
jboolean Java_org_rocksdb_DBOptions_allowOsBuffer(
|
jboolean Java_org_rocksdb_DBOptions_useDirectWrites(JNIEnv* env, jobject jobj,
|
||||||
JNIEnv* env, jobject jobj, jlong jhandle) {
|
jlong jhandle) {
|
||||||
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer;
|
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_writes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Class: org_rocksdb_DBOptions
|
||||||
|
* Method: setUseDirectReads
|
||||||
|
* Signature: (JZ)V
|
||||||
|
*/
|
||||||
|
void Java_org_rocksdb_DBOptions_setUseDirectWrites(JNIEnv* env, jobject jobj,
|
||||||
|
jlong jhandle,
|
||||||
|
jboolean use_direct_writes) {
|
||||||
|
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_writes =
|
||||||
|
static_cast<bool>(use_direct_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -457,17 +457,31 @@ public class DBOptions extends RocksObject implements DBOptionsInterface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public DBOptions setAllowOsBuffer(
|
public DBOptions setUseDirectReads(
|
||||||
final boolean allowOsBuffer) {
|
final boolean useDirectReads) {
|
||||||
assert(isOwningHandle());
|
assert(isOwningHandle());
|
||||||
setAllowOsBuffer(nativeHandle_, allowOsBuffer);
|
setUseDirectReads(nativeHandle_, useDirectReads);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean allowOsBuffer() {
|
public boolean useDirectReads() {
|
||||||
assert(isOwningHandle());
|
assert(isOwningHandle());
|
||||||
return allowOsBuffer(nativeHandle_);
|
return useDirectReads(nativeHandle_);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DBOptions setUseDirectWrites(
|
||||||
|
final boolean useDirectWrites) {
|
||||||
|
assert(isOwningHandle());
|
||||||
|
setUseDirectWrites(nativeHandle_, useDirectWrites);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean useDirectWrites() {
|
||||||
|
assert(isOwningHandle());
|
||||||
|
return useDirectWrites(nativeHandle_);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -710,9 +724,10 @@ public long delayedWriteRate(){
|
|||||||
private native void setManifestPreallocationSize(
|
private native void setManifestPreallocationSize(
|
||||||
long handle, long size) throws IllegalArgumentException;
|
long handle, long size) throws IllegalArgumentException;
|
||||||
private native long manifestPreallocationSize(long handle);
|
private native long manifestPreallocationSize(long handle);
|
||||||
private native void setAllowOsBuffer(
|
private native void setUseDirectReads(long handle, boolean useDirectReads);
|
||||||
long handle, boolean allowOsBuffer);
|
private native boolean useDirectReads(long handle);
|
||||||
private native boolean allowOsBuffer(long handle);
|
private native void setUseDirectWrites(long handle, boolean useDirectWrites);
|
||||||
|
private native boolean useDirectWrites(long handle);
|
||||||
private native void setAllowMmapReads(
|
private native void setAllowMmapReads(
|
||||||
long handle, boolean allowMmapReads);
|
long handle, boolean allowMmapReads);
|
||||||
private native boolean allowMmapReads(long handle);
|
private native boolean allowMmapReads(long handle);
|
||||||
|
@ -673,21 +673,38 @@ public interface DBOptionsInterface {
|
|||||||
long manifestPreallocationSize();
|
long manifestPreallocationSize();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Data being read from file storage may be buffered in the OS
|
* Enable the OS to use direct I/O for reading sst tables.
|
||||||
* Default: true
|
* Default: false
|
||||||
*
|
*
|
||||||
* @param allowOsBuffer if true, then OS buffering is allowed.
|
* @param useDirectReads if true, then direct read is enabled
|
||||||
* @return the instance of the current Object.
|
* @return the instance of the current Object.
|
||||||
*/
|
*/
|
||||||
Object setAllowOsBuffer(boolean allowOsBuffer);
|
Object setUseDirectReads(boolean useDirectReads);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Data being read from file storage may be buffered in the OS
|
* Enable the OS to use direct I/O for reading sst tables.
|
||||||
* Default: true
|
* Default: false
|
||||||
*
|
*
|
||||||
* @return if true, then OS buffering is allowed.
|
* @return if true, then direct reads are enabled
|
||||||
*/
|
*/
|
||||||
boolean allowOsBuffer();
|
boolean useDirectReads();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable the OS to use direct I/O for writing sst tables.
|
||||||
|
* Default: false
|
||||||
|
*
|
||||||
|
* @param useDirectWrites if true, then direct write is enabled
|
||||||
|
* @return the instance of the current Object.
|
||||||
|
*/
|
||||||
|
Object setUseDirectWrites(boolean useDirectWrites);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable the OS to use direct I/O for writing sst tables.
|
||||||
|
* Default: false
|
||||||
|
*
|
||||||
|
* @return if true, then direct writes are enabled
|
||||||
|
*/
|
||||||
|
boolean useDirectWrites();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allow the OS to mmap file for reading sst tables.
|
* Allow the OS to mmap file for reading sst tables.
|
||||||
|
@ -530,18 +530,32 @@ public class Options extends RocksObject
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean allowOsBuffer() {
|
public Options setUseDirectReads(final boolean useDirectReads) {
|
||||||
assert(isOwningHandle());
|
assert(isOwningHandle());
|
||||||
return allowOsBuffer(nativeHandle_);
|
setUseDirectReads(nativeHandle_, useDirectReads);
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Options setAllowOsBuffer(final boolean allowOsBuffer) {
|
public boolean useDirectReads() {
|
||||||
assert(isOwningHandle());
|
assert(isOwningHandle());
|
||||||
setAllowOsBuffer(nativeHandle_, allowOsBuffer);
|
return useDirectReads(nativeHandle_);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Options setUseDirectWrites(final boolean useDirectWrites) {
|
||||||
|
assert(isOwningHandle());
|
||||||
|
setUseDirectWrites(nativeHandle_, useDirectWrites);
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean useDirectWrites() {
|
||||||
|
assert(isOwningHandle());
|
||||||
|
return useDirectWrites(nativeHandle_);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean allowMmapReads() {
|
public boolean allowMmapReads() {
|
||||||
assert(isOwningHandle());
|
assert(isOwningHandle());
|
||||||
@ -1289,9 +1303,10 @@ public class Options extends RocksObject
|
|||||||
private native void setManifestPreallocationSize(
|
private native void setManifestPreallocationSize(
|
||||||
long handle, long size) throws IllegalArgumentException;
|
long handle, long size) throws IllegalArgumentException;
|
||||||
private native long manifestPreallocationSize(long handle);
|
private native long manifestPreallocationSize(long handle);
|
||||||
private native void setAllowOsBuffer(
|
private native void setUseDirectReads(long handle, boolean useDirectReads);
|
||||||
long handle, boolean allowOsBuffer);
|
private native boolean useDirectReads(long handle);
|
||||||
private native boolean allowOsBuffer(long handle);
|
private native void setUseDirectWrites(long handle, boolean useDirectWrites);
|
||||||
|
private native boolean useDirectWrites(long handle);
|
||||||
private native void setAllowMmapReads(
|
private native void setAllowMmapReads(
|
||||||
long handle, boolean allowMmapReads);
|
long handle, boolean allowMmapReads);
|
||||||
private native boolean allowMmapReads(long handle);
|
private native boolean allowMmapReads(long handle);
|
||||||
|
@ -281,11 +281,20 @@ public class DBOptionsTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void allowOsBuffer() {
|
public void useDirectReads() {
|
||||||
try(final DBOptions opt = new DBOptions()) {
|
try(final DBOptions opt = new DBOptions()) {
|
||||||
final boolean boolValue = rand.nextBoolean();
|
final boolean boolValue = rand.nextBoolean();
|
||||||
opt.setAllowOsBuffer(boolValue);
|
opt.setUseDirectReads(boolValue);
|
||||||
assertThat(opt.allowOsBuffer()).isEqualTo(boolValue);
|
assertThat(opt.useDirectReads()).isEqualTo(boolValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void useDirectWrites() {
|
||||||
|
try(final DBOptions opt = new DBOptions()) {
|
||||||
|
final boolean boolValue = rand.nextBoolean();
|
||||||
|
opt.setUseDirectWrites(boolValue);
|
||||||
|
assertThat(opt.useDirectWrites()).isEqualTo(boolValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,15 +18,6 @@ public class EnvOptionsTest {
|
|||||||
|
|
||||||
public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory();
|
public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory();
|
||||||
|
|
||||||
@Test
|
|
||||||
public void useOsBuffer() {
|
|
||||||
try (final EnvOptions envOptions = new EnvOptions()) {
|
|
||||||
final boolean boolValue = rand.nextBoolean();
|
|
||||||
envOptions.setUseOsBuffer(boolValue);
|
|
||||||
assertThat(envOptions.useOsBuffer()).isEqualTo(boolValue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void useMmapReads() {
|
public void useMmapReads() {
|
||||||
try (final EnvOptions envOptions = new EnvOptions()) {
|
try (final EnvOptions envOptions = new EnvOptions()) {
|
||||||
|
@ -572,11 +572,20 @@ public class OptionsTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void allowOsBuffer() {
|
public void useDirectReads() {
|
||||||
try(final Options opt = new Options()) {
|
try(final Options opt = new Options()) {
|
||||||
final boolean boolValue = rand.nextBoolean();
|
final boolean boolValue = rand.nextBoolean();
|
||||||
opt.setAllowOsBuffer(boolValue);
|
opt.setUseDirectReads(boolValue);
|
||||||
assertThat(opt.allowOsBuffer()).isEqualTo(boolValue);
|
assertThat(opt.useDirectReads()).isEqualTo(boolValue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void useDirectWrites() {
|
||||||
|
try(final Options opt = new Options()) {
|
||||||
|
final boolean boolValue = rand.nextBoolean();
|
||||||
|
opt.setUseDirectWrites(boolValue);
|
||||||
|
assertThat(opt.useDirectWrites()).isEqualTo(boolValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,9 +7,10 @@
|
|||||||
// Use of this source code is governed by a BSD-style license that can be
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#include "port/win/env_win.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <thread>
|
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <process.h> // _getpid
|
#include <process.h> // _getpid
|
||||||
@ -25,7 +26,6 @@
|
|||||||
#include "port/dirent.h"
|
#include "port/dirent.h"
|
||||||
#include "port/win/win_logger.h"
|
#include "port/win/win_logger.h"
|
||||||
#include "port/win/io_win.h"
|
#include "port/win/io_win.h"
|
||||||
#include "port/win/env_win.h"
|
|
||||||
|
|
||||||
#include "util/iostats_context_imp.h"
|
#include "util/iostats_context_imp.h"
|
||||||
|
|
||||||
@ -148,7 +148,7 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname,
|
|||||||
// Random access is to disable read-ahead as the system reads too much data
|
// Random access is to disable read-ahead as the system reads too much data
|
||||||
DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
|
DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
|
||||||
|
|
||||||
if (!options.use_os_buffer && !options.use_mmap_reads) {
|
if (options.use_direct_reads && !options.use_mmap_reads) {
|
||||||
fileFlags |= FILE_FLAG_NO_BUFFERING;
|
fileFlags |= FILE_FLAG_NO_BUFFERING;
|
||||||
} else {
|
} else {
|
||||||
fileFlags |= FILE_FLAG_RANDOM_ACCESS;
|
fileFlags |= FILE_FLAG_RANDOM_ACCESS;
|
||||||
@ -240,7 +240,7 @@ Status WinEnvIO::NewWritableFile(const std::string& fname,
|
|||||||
|
|
||||||
DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
|
DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
|
||||||
|
|
||||||
if (!local_options.use_os_buffer && !local_options.use_mmap_writes) {
|
if (local_options.use_direct_writes && !local_options.use_mmap_writes) {
|
||||||
fileFlags = FILE_FLAG_NO_BUFFERING;
|
fileFlags = FILE_FLAG_NO_BUFFERING;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -305,7 +305,7 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname,
|
|||||||
DWORD creation_disposition = OPEN_ALWAYS; // Create if necessary or open existing
|
DWORD creation_disposition = OPEN_ALWAYS; // Create if necessary or open existing
|
||||||
DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
|
DWORD file_flags = FILE_FLAG_RANDOM_ACCESS;
|
||||||
|
|
||||||
if (!options.use_os_buffer) {
|
if (options.use_direct_reads && options.use_direct_writes) {
|
||||||
file_flags |= FILE_FLAG_NO_BUFFERING;
|
file_flags |= FILE_FLAG_NO_BUFFERING;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -744,11 +744,11 @@ std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) {
|
|||||||
EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
|
EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
|
||||||
const DBOptions& db_options) const {
|
const DBOptions& db_options) const {
|
||||||
EnvOptions optimized = env_options;
|
EnvOptions optimized = env_options;
|
||||||
optimized.use_mmap_writes = false;
|
|
||||||
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
|
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
|
||||||
optimized.use_os_buffer =
|
optimized.use_mmap_writes = false;
|
||||||
true; // This is because we flush only whole pages on unbuffered io and
|
// This is because we flush only whole pages on unbuffered io and
|
||||||
// the last records are not guaranteed to be flushed.
|
// the last records are not guaranteed to be flushed.
|
||||||
|
optimized.use_direct_writes = false;
|
||||||
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
|
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
|
||||||
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
|
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
|
||||||
// test and make this false
|
// test and make this false
|
||||||
@ -760,7 +760,7 @@ EnvOptions WinEnvIO::OptimizeForManifestWrite(
|
|||||||
const EnvOptions& env_options) const {
|
const EnvOptions& env_options) const {
|
||||||
EnvOptions optimized = env_options;
|
EnvOptions optimized = env_options;
|
||||||
optimized.use_mmap_writes = false;
|
optimized.use_mmap_writes = false;
|
||||||
optimized.use_os_buffer = true;
|
optimized.use_direct_writes = false;
|
||||||
optimized.fallocate_with_keep_size = true;
|
optimized.fallocate_with_keep_size = true;
|
||||||
return optimized;
|
return optimized;
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,6 @@
|
|||||||
#include "util/sync_point.h"
|
#include "util/sync_point.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
#include "util/iostats_context_imp.h"
|
#include "util/iostats_context_imp.h"
|
||||||
#include "util/sync_point.h"
|
|
||||||
#include "util/aligned_buffer.h"
|
#include "util/aligned_buffer.h"
|
||||||
|
|
||||||
|
|
||||||
@ -158,9 +157,11 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// WinMmapReadableFile
|
// WinMmapReadableFile
|
||||||
|
|
||||||
WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
|
WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
|
||||||
const void* mapped_region, size_t length)
|
HANDLE hFile, HANDLE hMap,
|
||||||
: WinFileData(fileName, hFile, false),
|
const void* mapped_region,
|
||||||
|
size_t length)
|
||||||
|
: WinFileData(fileName, hFile, false /* use_direct_io */),
|
||||||
hMap_(hMap),
|
hMap_(hMap),
|
||||||
mapped_region_(mapped_region),
|
mapped_region_(mapped_region),
|
||||||
length_(length) {}
|
length_(length) {}
|
||||||
@ -522,8 +523,7 @@ size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
|
|||||||
|
|
||||||
WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
|
WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
|
||||||
const EnvOptions& options)
|
const EnvOptions& options)
|
||||||
: WinFileData(fname, f, options.use_os_buffer)
|
: WinFileData(fname, f, options.use_direct_reads) {}
|
||||||
{}
|
|
||||||
|
|
||||||
WinSequentialFile::~WinSequentialFile() {
|
WinSequentialFile::~WinSequentialFile() {
|
||||||
assert(hFile_ != INVALID_HANDLE_VALUE);
|
assert(hFile_ != INVALID_HANDLE_VALUE);
|
||||||
@ -661,8 +661,8 @@ WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
|
|||||||
|
|
||||||
assert(!options.use_mmap_reads);
|
assert(!options.use_mmap_reads);
|
||||||
|
|
||||||
// Unbuffered access, use internal buffer for reads
|
// Direct access, use internal buffer for reads
|
||||||
if (!file_base_->UseOSBuffer()) {
|
if (file_base_->UseDirectIO()) {
|
||||||
// Do not allocate the buffer either until the first request or
|
// Do not allocate the buffer either until the first request or
|
||||||
// until there is a call to allocate a read-ahead buffer
|
// until there is a call to allocate a read-ahead buffer
|
||||||
buffer_.Alignment(alignment);
|
buffer_.Alignment(alignment);
|
||||||
@ -683,11 +683,10 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
// When in unbuffered mode we need to do the following changes:
|
// When in direct I/O mode we need to do the following changes:
|
||||||
// - use our own aligned buffer
|
// - use our own aligned buffer
|
||||||
// - always read at the offset of that is a multiple of alignment
|
// - always read at the offset of that is a multiple of alignment
|
||||||
if (!file_base_->UseOSBuffer()) {
|
if (file_base_->UseDirectIO()) {
|
||||||
|
|
||||||
uint64_t first_page_start = 0;
|
uint64_t first_page_start = 0;
|
||||||
size_t actual_bytes_toread = 0;
|
size_t actual_bytes_toread = 0;
|
||||||
size_t bytes_requested = left;
|
size_t bytes_requested = left;
|
||||||
@ -778,9 +777,7 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
|
|||||||
|
|
||||||
inline
|
inline
|
||||||
void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
|
void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
|
||||||
|
if (pattern == RandomAccessFile::SEQUENTIAL && file_base_->UseDirectIO() &&
|
||||||
if (pattern == RandomAccessFile::SEQUENTIAL &&
|
|
||||||
!file_base_->UseOSBuffer() &&
|
|
||||||
compaction_readahead_size_ > 0) {
|
compaction_readahead_size_ > 0) {
|
||||||
std::lock_guard<std::mutex> lg(buffer_mut_);
|
std::lock_guard<std::mutex> lg(buffer_mut_);
|
||||||
if (!read_ahead_) {
|
if (!read_ahead_) {
|
||||||
@ -798,11 +795,11 @@ void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
/// WinRandomAccessFile
|
/// WinRandomAccessFile
|
||||||
|
|
||||||
WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
|
||||||
const EnvOptions& options) :
|
size_t alignment,
|
||||||
WinFileData(fname, hFile, options.use_os_buffer),
|
const EnvOptions& options)
|
||||||
WinRandomAccessImpl(this, alignment, options) {
|
: WinFileData(fname, hFile, options.use_direct_reads),
|
||||||
}
|
WinRandomAccessImpl(this, alignment, options) {}
|
||||||
|
|
||||||
WinRandomAccessFile::~WinRandomAccessFile() {
|
WinRandomAccessFile::~WinRandomAccessFile() {
|
||||||
}
|
}
|
||||||
@ -851,7 +848,7 @@ WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
|
|||||||
Status WinWritableImpl::AppendImpl(const Slice& data) {
|
Status WinWritableImpl::AppendImpl(const Slice& data) {
|
||||||
|
|
||||||
// Used for buffered access ONLY
|
// Used for buffered access ONLY
|
||||||
assert(file_data_->UseOSBuffer());
|
assert(!file_data_->UseDirectIO());
|
||||||
assert(data.size() < std::numeric_limits<DWORD>::max());
|
assert(data.size() < std::numeric_limits<DWORD>::max());
|
||||||
|
|
||||||
Status s;
|
Status s;
|
||||||
@ -934,9 +931,8 @@ Status WinWritableImpl::SyncImpl() {
|
|||||||
// Calls flush buffers
|
// Calls flush buffers
|
||||||
if (fsync(file_data_->GetFileHandle()) < 0) {
|
if (fsync(file_data_->GetFileHandle()) < 0) {
|
||||||
auto lastError = GetLastError();
|
auto lastError = GetLastError();
|
||||||
s = IOErrorFromWindowsError("fsync failed at Sync() for: " +
|
s = IOErrorFromWindowsError(
|
||||||
file_data_->GetName(),
|
"fsync failed at Sync() for: " + file_data_->GetName(), lastError);
|
||||||
lastError);
|
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
@ -967,21 +963,19 @@ Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
/// WinWritableFile
|
/// WinWritableFile
|
||||||
|
|
||||||
WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
|
||||||
size_t /* capacity */, const EnvOptions& options)
|
size_t alignment, size_t /* capacity */,
|
||||||
: WinFileData(fname, hFile, options.use_os_buffer),
|
const EnvOptions& options)
|
||||||
|
: WinFileData(fname, hFile, options.use_direct_writes),
|
||||||
WinWritableImpl(this, alignment) {
|
WinWritableImpl(this, alignment) {
|
||||||
|
|
||||||
assert(!options.use_mmap_writes);
|
assert(!options.use_mmap_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
WinWritableFile::~WinWritableFile() {
|
WinWritableFile::~WinWritableFile() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Indicates if the class makes use of unbuffered I/O
|
// Indicates if the class makes use of direct I/O
|
||||||
bool WinWritableFile::UseOSBuffer() const {
|
bool WinWritableFile::UseDirectIO() const { return WinFileData::UseDirectIO(); }
|
||||||
return WinFileData::UseOSBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t WinWritableFile::GetRequiredBufferAlignment() const {
|
size_t WinWritableFile::GetRequiredBufferAlignment() const {
|
||||||
return GetAlignement();
|
return GetAlignement();
|
||||||
@ -1015,9 +1009,7 @@ Status WinWritableFile::Sync() {
|
|||||||
return SyncImpl();
|
return SyncImpl();
|
||||||
}
|
}
|
||||||
|
|
||||||
Status WinWritableFile::Fsync() {
|
Status WinWritableFile::Fsync() { return SyncImpl(); }
|
||||||
return SyncImpl();
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t WinWritableFile::GetFileSize() {
|
uint64_t WinWritableFile::GetFileSize() {
|
||||||
return GetFileSizeImpl();
|
return GetFileSizeImpl();
|
||||||
@ -1034,17 +1026,14 @@ size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
|
|||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
/// WinRandomRWFile
|
/// WinRandomRWFile
|
||||||
|
|
||||||
WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
|
||||||
const EnvOptions& options) :
|
size_t alignment, const EnvOptions& options)
|
||||||
WinFileData(fname, hFile, options.use_os_buffer),
|
: WinFileData(fname, hFile,
|
||||||
|
options.use_direct_reads && options.use_direct_writes),
|
||||||
WinRandomAccessImpl(this, alignment, options),
|
WinRandomAccessImpl(this, alignment, options),
|
||||||
WinWritableImpl(this, alignment) {
|
WinWritableImpl(this, alignment) {}
|
||||||
|
|
||||||
}
|
bool WinRandomRWFile::UseDirectIO() const { return WinFileData::UseDirectIO(); }
|
||||||
|
|
||||||
bool WinRandomRWFile::UseOSBuffer() const {
|
|
||||||
return WinFileData::UseOSBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
|
size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
|
||||||
return GetAlignement();
|
return GetAlignement();
|
||||||
@ -1094,4 +1083,3 @@ WinFileLock::~WinFileLock() {
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8,17 +8,16 @@
|
|||||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <rocksdb/Status.h>
|
|
||||||
#include <rocksdb/env.h>
|
|
||||||
|
|
||||||
#include "util/aligned_buffer.h"
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "rocksdb/Status.h"
|
||||||
|
#include "rocksdb/env.h"
|
||||||
|
#include "util/aligned_buffer.h"
|
||||||
|
|
||||||
#include <Windows.h>
|
#include <Windows.h>
|
||||||
|
|
||||||
#include <mutex>
|
|
||||||
|
|
||||||
namespace rocksdb {
|
namespace rocksdb {
|
||||||
namespace port {
|
namespace port {
|
||||||
@ -26,9 +25,9 @@ namespace port {
|
|||||||
std::string GetWindowsErrSz(DWORD err);
|
std::string GetWindowsErrSz(DWORD err);
|
||||||
|
|
||||||
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
|
inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
|
||||||
return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ?
|
return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL))
|
||||||
Status::NoSpace(context, GetWindowsErrSz(err)) :
|
? Status::NoSpace(context, GetWindowsErrSz(err))
|
||||||
Status::IOError(context, GetWindowsErrSz(err));
|
: Status::IOError(context, GetWindowsErrSz(err));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline Status IOErrorFromLastWindowsError(const std::string& context) {
|
inline Status IOErrorFromLastWindowsError(const std::string& context) {
|
||||||
@ -36,9 +35,9 @@ inline Status IOErrorFromLastWindowsError(const std::string& context) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline Status IOError(const std::string& context, int err_number) {
|
inline Status IOError(const std::string& context, int err_number) {
|
||||||
return (err_number == ENOSPC) ?
|
return (err_number == ENOSPC)
|
||||||
Status::NoSpace(context, strerror(err_number)) :
|
? Status::NoSpace(context, strerror(err_number))
|
||||||
Status::IOError(context, strerror(err_number));
|
: Status::IOError(context, strerror(err_number));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note the below two do not set errno because they are used only here in this
|
// Note the below two do not set errno because they are used only here in this
|
||||||
@ -54,49 +53,34 @@ inline int fsync(HANDLE hFile) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
|
SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, uint64_t offset);
|
||||||
uint64_t offset);
|
|
||||||
|
|
||||||
SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset);
|
SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset);
|
||||||
|
|
||||||
Status fallocate(const std::string& filename, HANDLE hFile,
|
Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size);
|
||||||
uint64_t to_size);
|
|
||||||
|
|
||||||
Status ftruncate(const std::string& filename, HANDLE hFile,
|
|
||||||
uint64_t toSize);
|
|
||||||
|
|
||||||
|
Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
|
||||||
|
|
||||||
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
|
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
|
||||||
|
|
||||||
class WinFileData {
|
class WinFileData {
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
const std::string filename_;
|
const std::string filename_;
|
||||||
HANDLE hFile_;
|
HANDLE hFile_;
|
||||||
// There is no equivalent of advising away buffered pages as in posix.
|
// If ture, the I/O issued would be direct I/O which the buffer
|
||||||
// To implement this flag we would need to do unbuffered reads which
|
|
||||||
// will need to be aligned (not sure there is a guarantee that the buffer
|
// will need to be aligned (not sure there is a guarantee that the buffer
|
||||||
// passed in is aligned).
|
// passed in is aligned).
|
||||||
// Hence we currently ignore this flag. It is used only in a few cases
|
const bool use_direct_io_;
|
||||||
// which should not be perf critical.
|
|
||||||
// If perf evaluation finds this to be a problem, we can look into
|
|
||||||
// implementing this.
|
|
||||||
const bool use_os_buffer_;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
// We want this class be usable both for inheritance (prive
|
// We want this class be usable both for inheritance (prive
|
||||||
// or protected) and for containment so __ctor and __dtor public
|
// or protected) and for containment so __ctor and __dtor public
|
||||||
WinFileData(const std::string& filename, HANDLE hFile, bool use_os_buffer) :
|
WinFileData(const std::string& filename, HANDLE hFile, bool use_direct_io)
|
||||||
filename_(filename), hFile_(hFile), use_os_buffer_(use_os_buffer)
|
: filename_(filename), hFile_(hFile), use_direct_io_(use_direct_io) {}
|
||||||
{}
|
|
||||||
|
|
||||||
virtual ~WinFileData() {
|
virtual ~WinFileData() { this->CloseFile(); }
|
||||||
this->CloseFile();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool CloseFile() {
|
bool CloseFile() {
|
||||||
|
|
||||||
bool result = true;
|
bool result = true;
|
||||||
|
|
||||||
if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
|
if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
|
||||||
@ -111,13 +95,12 @@ public:
|
|||||||
|
|
||||||
HANDLE GetFileHandle() const { return hFile_; }
|
HANDLE GetFileHandle() const { return hFile_; }
|
||||||
|
|
||||||
bool UseOSBuffer() const { return use_os_buffer_; }
|
bool UseDirectIO() const { return use_direct_io_; }
|
||||||
|
|
||||||
WinFileData(const WinFileData&) = delete;
|
WinFileData(const WinFileData&) = delete;
|
||||||
WinFileData& operator=(const WinFileData&) = delete;
|
WinFileData& operator=(const WinFileData&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// mmap() based random-access
|
// mmap() based random-access
|
||||||
class WinMmapReadableFile : private WinFileData, public RandomAccessFile {
|
class WinMmapReadableFile : private WinFileData, public RandomAccessFile {
|
||||||
HANDLE hMap_;
|
HANDLE hMap_;
|
||||||
@ -185,7 +168,6 @@ private:
|
|||||||
virtual Status PreallocateInternal(uint64_t spaceToReserve);
|
virtual Status PreallocateInternal(uint64_t spaceToReserve);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
|
WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
|
||||||
size_t allocation_granularity, const EnvOptions& options);
|
size_t allocation_granularity, const EnvOptions& options);
|
||||||
|
|
||||||
@ -245,7 +227,6 @@ public:
|
|||||||
|
|
||||||
class WinRandomAccessImpl {
|
class WinRandomAccessImpl {
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
WinFileData* file_base_;
|
WinFileData* file_base_;
|
||||||
bool read_ahead_;
|
bool read_ahead_;
|
||||||
const size_t compaction_readahead_size_;
|
const size_t compaction_readahead_size_;
|
||||||
@ -301,11 +282,9 @@ protected:
|
|||||||
virtual ~WinRandomAccessImpl() {}
|
virtual ~WinRandomAccessImpl() {}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
|
WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
|
||||||
WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
|
WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
|
||||||
|
|
||||||
|
|
||||||
Status ReadImpl(uint64_t offset, size_t n, Slice* result,
|
Status ReadImpl(uint64_t offset, size_t n, Slice* result,
|
||||||
char* scratch) const;
|
char* scratch) const;
|
||||||
|
|
||||||
@ -313,10 +292,11 @@ public:
|
|||||||
};
|
};
|
||||||
|
|
||||||
// pread() based random-access
|
// pread() based random-access
|
||||||
class WinRandomAccessFile : private WinFileData,
|
class WinRandomAccessFile
|
||||||
protected WinRandomAccessImpl, // Want to be able to override PositionedReadInternal
|
: private WinFileData,
|
||||||
|
protected WinRandomAccessImpl, // Want to be able to override
|
||||||
|
// PositionedReadInternal
|
||||||
public RandomAccessFile {
|
public RandomAccessFile {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options);
|
||||||
@ -337,7 +317,6 @@ public:
|
|||||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// This is a sequential write class. It has been mimicked (as others) after
|
// This is a sequential write class. It has been mimicked (as others) after
|
||||||
// the original Posix class. We add support for unbuffered I/O on windows as
|
// the original Posix class. We add support for unbuffered I/O on windows as
|
||||||
// well
|
// well
|
||||||
@ -352,7 +331,6 @@ public:
|
|||||||
// buffered access.
|
// buffered access.
|
||||||
class WinWritableImpl {
|
class WinWritableImpl {
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
WinFileData* file_data_;
|
WinFileData* file_data_;
|
||||||
const uint64_t alignment_;
|
const uint64_t alignment_;
|
||||||
uint64_t filesize_; // How much data is actually written disk
|
uint64_t filesize_; // How much data is actually written disk
|
||||||
@ -368,7 +346,8 @@ protected:
|
|||||||
|
|
||||||
Status AppendImpl(const Slice& data);
|
Status AppendImpl(const Slice& data);
|
||||||
|
|
||||||
// Requires that the data is aligned as specified by GetRequiredBufferAlignment()
|
// Requires that the data is aligned as specified by
|
||||||
|
// GetRequiredBufferAlignment()
|
||||||
Status PositionedAppendImpl(const Slice& data, uint64_t offset);
|
Status PositionedAppendImpl(const Slice& data, uint64_t offset);
|
||||||
|
|
||||||
Status TruncateImpl(uint64_t size);
|
Status TruncateImpl(uint64_t size);
|
||||||
@ -380,7 +359,8 @@ protected:
|
|||||||
uint64_t GetFileSizeImpl() {
|
uint64_t GetFileSizeImpl() {
|
||||||
// Double accounting now here with WritableFileWriter
|
// Double accounting now here with WritableFileWriter
|
||||||
// and this size will be wrong when unbuffered access is used
|
// and this size will be wrong when unbuffered access is used
|
||||||
// but tests implement their own writable files and do not use WritableFileWrapper
|
// but tests implement their own writable files and do not use
|
||||||
|
// WritableFileWrapper
|
||||||
// so we need to squeeze a square peg through
|
// so we need to squeeze a square peg through
|
||||||
// a round hole here.
|
// a round hole here.
|
||||||
return filesize_;
|
return filesize_;
|
||||||
@ -389,31 +369,29 @@ protected:
|
|||||||
Status AllocateImpl(uint64_t offset, uint64_t len);
|
Status AllocateImpl(uint64_t offset, uint64_t len);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
WinWritableImpl(const WinWritableImpl&) = delete;
|
WinWritableImpl(const WinWritableImpl&) = delete;
|
||||||
WinWritableImpl& operator=(const WinWritableImpl&) = delete;
|
WinWritableImpl& operator=(const WinWritableImpl&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
class WinWritableFile : private WinFileData,
|
class WinWritableFile : private WinFileData,
|
||||||
protected WinWritableImpl,
|
protected WinWritableImpl,
|
||||||
public WritableFile {
|
public WritableFile {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
||||||
size_t capacity, const EnvOptions& options);
|
size_t capacity, const EnvOptions& options);
|
||||||
|
|
||||||
~WinWritableFile();
|
~WinWritableFile();
|
||||||
|
|
||||||
// Indicates if the class makes use of unbuffered I/O
|
// Indicates if the class makes use of direct I/O
|
||||||
// Use PositionedAppend
|
// Use PositionedAppend
|
||||||
virtual bool UseOSBuffer() const override;
|
virtual bool UseDirectIO() const override;
|
||||||
|
|
||||||
virtual size_t GetRequiredBufferAlignment() const override;
|
virtual size_t GetRequiredBufferAlignment() const override;
|
||||||
|
|
||||||
virtual Status Append(const Slice& data) override;
|
virtual Status Append(const Slice& data) override;
|
||||||
|
|
||||||
// Requires that the data is aligned as specified by GetRequiredBufferAlignment()
|
// Requires that the data is aligned as specified by
|
||||||
|
// GetRequiredBufferAlignment()
|
||||||
virtual Status PositionedAppend(const Slice& data, uint64_t offset) override;
|
virtual Status PositionedAppend(const Slice& data, uint64_t offset) override;
|
||||||
|
|
||||||
// Need to implement this so the file is truncated correctly
|
// Need to implement this so the file is truncated correctly
|
||||||
@ -437,30 +415,27 @@ public:
|
|||||||
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
class WinRandomRWFile : private WinFileData,
|
class WinRandomRWFile : private WinFileData,
|
||||||
protected WinRandomAccessImpl,
|
protected WinRandomAccessImpl,
|
||||||
protected WinWritableImpl,
|
protected WinWritableImpl,
|
||||||
public RandomRWFile {
|
public RandomRWFile {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
||||||
const EnvOptions& options);
|
const EnvOptions& options);
|
||||||
|
|
||||||
~WinRandomRWFile() {}
|
~WinRandomRWFile() {}
|
||||||
|
|
||||||
// Indicates if the class makes use of unbuffered I/O
|
// Indicates if the class makes use of direct I/O
|
||||||
// If false you must pass aligned buffer to Write()
|
// If false you must pass aligned buffer to Write()
|
||||||
virtual bool UseOSBuffer() const override;
|
virtual bool UseDirectIO() const override;
|
||||||
|
|
||||||
// Use the returned alignment value to allocate
|
// Use the returned alignment value to allocate aligned
|
||||||
// aligned buffer for Write() when UseOSBuffer()
|
// buffer for Write() when UseDirectIO() returns true
|
||||||
// returns false
|
|
||||||
virtual size_t GetRequiredBufferAlignment() const override;
|
virtual size_t GetRequiredBufferAlignment() const override;
|
||||||
|
|
||||||
// Used by the file_reader_writer to decide if the ReadAhead wrapper
|
// Used by the file_reader_writer to decide if the ReadAhead wrapper
|
||||||
// should simply forward the call and do not enact read_ahead buffering or locking.
|
// should simply forward the call and do not enact read_ahead buffering or
|
||||||
|
// locking.
|
||||||
// The implementation below takes care of reading ahead
|
// The implementation below takes care of reading ahead
|
||||||
virtual bool ShouldForwardRawRequest() const override;
|
virtual bool ShouldForwardRawRequest() const override;
|
||||||
|
|
||||||
@ -469,7 +444,7 @@ public:
|
|||||||
virtual void EnableReadAhead() override;
|
virtual void EnableReadAhead() override;
|
||||||
|
|
||||||
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
|
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
|
||||||
// Pass aligned buffer when UseOSBuffer() returns false.
|
// Pass aligned buffer when UseDirectIO() returns true.
|
||||||
virtual Status Write(uint64_t offset, const Slice& data) override;
|
virtual Status Write(uint64_t offset, const Slice& data) override;
|
||||||
|
|
||||||
// Read up to `n` bytes starting from offset `offset` and store them in
|
// Read up to `n` bytes starting from offset `offset` and store them in
|
||||||
@ -487,7 +462,6 @@ public:
|
|||||||
virtual Status Close() override;
|
virtual Status Close() override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
class WinDirectory : public Directory {
|
class WinDirectory : public Directory {
|
||||||
public:
|
public:
|
||||||
WinDirectory() {}
|
WinDirectory() {}
|
||||||
@ -507,6 +481,5 @@ public:
|
|||||||
private:
|
private:
|
||||||
HANDLE hFile_;
|
HANDLE hFile_;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -771,9 +771,6 @@ DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files"
|
|||||||
" in MB.");
|
" in MB.");
|
||||||
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
|
DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size");
|
||||||
|
|
||||||
DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer,
|
|
||||||
"Allow buffered io using OS buffers");
|
|
||||||
|
|
||||||
DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
|
DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
|
||||||
"Allow reads to occur via mmap-ing files");
|
"Allow reads to occur via mmap-ing files");
|
||||||
|
|
||||||
@ -783,6 +780,9 @@ DEFINE_bool(mmap_write, rocksdb::EnvOptions().use_mmap_writes,
|
|||||||
DEFINE_bool(use_direct_reads, rocksdb::EnvOptions().use_direct_reads,
|
DEFINE_bool(use_direct_reads, rocksdb::EnvOptions().use_direct_reads,
|
||||||
"Use O_DIRECT for reading data");
|
"Use O_DIRECT for reading data");
|
||||||
|
|
||||||
|
DEFINE_bool(use_direct_writes, rocksdb::EnvOptions().use_direct_writes,
|
||||||
|
"Use O_DIRECT for writing data");
|
||||||
|
|
||||||
DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
|
DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
|
||||||
"Advise random access on table file open");
|
"Advise random access on table file open");
|
||||||
|
|
||||||
@ -2750,6 +2750,7 @@ class Benchmark {
|
|||||||
options.allow_mmap_reads = FLAGS_mmap_read;
|
options.allow_mmap_reads = FLAGS_mmap_read;
|
||||||
options.allow_mmap_writes = FLAGS_mmap_write;
|
options.allow_mmap_writes = FLAGS_mmap_write;
|
||||||
options.use_direct_reads = FLAGS_use_direct_reads;
|
options.use_direct_reads = FLAGS_use_direct_reads;
|
||||||
|
options.use_direct_writes = FLAGS_use_direct_writes;
|
||||||
if (FLAGS_prefix_size != 0) {
|
if (FLAGS_prefix_size != 0) {
|
||||||
options.prefix_extractor.reset(
|
options.prefix_extractor.reset(
|
||||||
NewFixedPrefixTransform(FLAGS_prefix_size));
|
NewFixedPrefixTransform(FLAGS_prefix_size));
|
||||||
@ -2951,9 +2952,6 @@ class Benchmark {
|
|||||||
options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
|
options.optimize_filters_for_hits = FLAGS_optimize_filters_for_hits;
|
||||||
|
|
||||||
// fill storage options
|
// fill storage options
|
||||||
options.allow_os_buffer = FLAGS_bufferedio;
|
|
||||||
options.allow_mmap_reads = FLAGS_mmap_read;
|
|
||||||
options.allow_mmap_writes = FLAGS_mmap_write;
|
|
||||||
options.advise_random_on_open = FLAGS_advise_random_on_open;
|
options.advise_random_on_open = FLAGS_advise_random_on_open;
|
||||||
options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
|
options.access_hint_on_compaction_start = FLAGS_compaction_fadvice_e;
|
||||||
options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
|
options.use_adaptive_mutex = FLAGS_use_adaptive_mutex;
|
||||||
|
@ -208,13 +208,14 @@ const std::string options_file_content = R"OPTIONS_FILE(
|
|||||||
max_background_flushes=1
|
max_background_flushes=1
|
||||||
create_if_missing=true
|
create_if_missing=true
|
||||||
error_if_exists=false
|
error_if_exists=false
|
||||||
allow_os_buffer=true
|
|
||||||
delayed_write_rate=1048576
|
delayed_write_rate=1048576
|
||||||
manifest_preallocation_size=4194304
|
manifest_preallocation_size=4194304
|
||||||
|
allow_mmap_reads=false
|
||||||
allow_mmap_writes=false
|
allow_mmap_writes=false
|
||||||
|
use_direct_reads=false
|
||||||
|
use_direct_writes=false
|
||||||
stats_dump_period_sec=600
|
stats_dump_period_sec=600
|
||||||
allow_fallocate=true
|
allow_fallocate=true
|
||||||
allow_mmap_reads=false
|
|
||||||
max_log_file_size=83886080
|
max_log_file_size=83886080
|
||||||
random_access_max_buffer_size=1048576
|
random_access_max_buffer_size=1048576
|
||||||
advise_random_on_open=true
|
advise_random_on_open=true
|
||||||
|
@ -24,7 +24,7 @@ inline size_t Roundup(size_t x, size_t y) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// This class is to manage an aligned user
|
// This class is to manage an aligned user
|
||||||
// allocated buffer for unbuffered I/O purposes
|
// allocated buffer for direct I/O purposes
|
||||||
// though can be used for any purpose.
|
// though can be used for any purpose.
|
||||||
class AlignedBuffer {
|
class AlignedBuffer {
|
||||||
size_t alignment_;
|
size_t alignment_;
|
||||||
|
@ -50,10 +50,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
|||||||
wal_ttl_seconds(options.WAL_ttl_seconds),
|
wal_ttl_seconds(options.WAL_ttl_seconds),
|
||||||
wal_size_limit_mb(options.WAL_size_limit_MB),
|
wal_size_limit_mb(options.WAL_size_limit_MB),
|
||||||
manifest_preallocation_size(options.manifest_preallocation_size),
|
manifest_preallocation_size(options.manifest_preallocation_size),
|
||||||
allow_os_buffer(options.allow_os_buffer),
|
|
||||||
allow_mmap_reads(options.allow_mmap_reads),
|
allow_mmap_reads(options.allow_mmap_reads),
|
||||||
allow_mmap_writes(options.allow_mmap_writes),
|
allow_mmap_writes(options.allow_mmap_writes),
|
||||||
use_direct_reads(options.use_direct_reads),
|
use_direct_reads(options.use_direct_reads),
|
||||||
|
use_direct_writes(options.use_direct_writes),
|
||||||
allow_fallocate(options.allow_fallocate),
|
allow_fallocate(options.allow_fallocate),
|
||||||
is_fd_close_on_exec(options.is_fd_close_on_exec),
|
is_fd_close_on_exec(options.is_fd_close_on_exec),
|
||||||
stats_dump_period_sec(options.stats_dump_period_sec),
|
stats_dump_period_sec(options.stats_dump_period_sec),
|
||||||
@ -119,16 +119,16 @@ void ImmutableDBOptions::Dump(Logger* log) const {
|
|||||||
Header(log,
|
Header(log,
|
||||||
" Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
|
" Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
|
||||||
recycle_log_file_num);
|
recycle_log_file_num);
|
||||||
Header(log, " Options.allow_os_buffer: %d",
|
|
||||||
allow_os_buffer);
|
|
||||||
Header(log, " Options.allow_mmap_reads: %d",
|
|
||||||
allow_mmap_reads);
|
|
||||||
Header(log, " Options.allow_fallocate: %d",
|
Header(log, " Options.allow_fallocate: %d",
|
||||||
allow_fallocate);
|
allow_fallocate);
|
||||||
|
Header(log, " Options.allow_mmap_reads: %d",
|
||||||
|
allow_mmap_reads);
|
||||||
Header(log, " Options.allow_mmap_writes: %d",
|
Header(log, " Options.allow_mmap_writes: %d",
|
||||||
allow_mmap_writes);
|
allow_mmap_writes);
|
||||||
Header(log, " Options.use_direct_reads: %d",
|
Header(log, " Options.use_direct_reads: %d",
|
||||||
use_direct_reads);
|
use_direct_reads);
|
||||||
|
Header(log, " Options.use_direct_writes: %d",
|
||||||
|
use_direct_writes);
|
||||||
Header(log, " Options.create_missing_column_families: %d",
|
Header(log, " Options.create_missing_column_families: %d",
|
||||||
create_missing_column_families);
|
create_missing_column_families);
|
||||||
Header(log, " Options.db_log_dir: %s",
|
Header(log, " Options.db_log_dir: %s",
|
||||||
@ -148,12 +148,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
|
|||||||
Header(log,
|
Header(log,
|
||||||
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
|
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
|
||||||
manifest_preallocation_size);
|
manifest_preallocation_size);
|
||||||
Header(log, " Options.allow_os_buffer: %d",
|
|
||||||
allow_os_buffer);
|
|
||||||
Header(log, " Options.allow_mmap_reads: %d",
|
|
||||||
allow_mmap_reads);
|
|
||||||
Header(log, " Options.allow_mmap_writes: %d",
|
|
||||||
allow_mmap_writes);
|
|
||||||
Header(log, " Options.is_fd_close_on_exec: %d",
|
Header(log, " Options.is_fd_close_on_exec: %d",
|
||||||
is_fd_close_on_exec);
|
is_fd_close_on_exec);
|
||||||
Header(log, " Options.stats_dump_period_sec: %u",
|
Header(log, " Options.stats_dump_period_sec: %u",
|
||||||
|
@ -46,10 +46,10 @@ struct ImmutableDBOptions {
|
|||||||
uint64_t wal_ttl_seconds;
|
uint64_t wal_ttl_seconds;
|
||||||
uint64_t wal_size_limit_mb;
|
uint64_t wal_size_limit_mb;
|
||||||
size_t manifest_preallocation_size;
|
size_t manifest_preallocation_size;
|
||||||
bool allow_os_buffer;
|
|
||||||
bool allow_mmap_reads;
|
bool allow_mmap_reads;
|
||||||
bool allow_mmap_writes;
|
bool allow_mmap_writes;
|
||||||
bool use_direct_reads;
|
bool use_direct_reads;
|
||||||
|
bool use_direct_writes;
|
||||||
bool allow_fallocate;
|
bool allow_fallocate;
|
||||||
bool is_fd_close_on_exec;
|
bool is_fd_close_on_exec;
|
||||||
unsigned int stats_dump_period_sec;
|
unsigned int stats_dump_period_sec;
|
||||||
|
@ -313,10 +313,10 @@ EnvWrapper::~EnvWrapper() {
|
|||||||
namespace { // anonymous namespace
|
namespace { // anonymous namespace
|
||||||
|
|
||||||
void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
|
void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
|
||||||
env_options->use_os_buffer = options.allow_os_buffer;
|
|
||||||
env_options->use_mmap_reads = options.allow_mmap_reads;
|
env_options->use_mmap_reads = options.allow_mmap_reads;
|
||||||
env_options->use_mmap_writes = options.allow_mmap_writes;
|
env_options->use_mmap_writes = options.allow_mmap_writes;
|
||||||
env_options->use_direct_reads = options.use_direct_reads;
|
env_options->use_direct_reads = options.use_direct_reads;
|
||||||
|
env_options->use_direct_writes = options.use_direct_writes;
|
||||||
env_options->set_fd_cloexec = options.is_fd_close_on_exec;
|
env_options->set_fd_cloexec = options.is_fd_close_on_exec;
|
||||||
env_options->bytes_per_sync = options.bytes_per_sync;
|
env_options->bytes_per_sync = options.bytes_per_sync;
|
||||||
env_options->compaction_readahead_size = options.compaction_readahead_size;
|
env_options->compaction_readahead_size = options.compaction_readahead_size;
|
||||||
|
@ -38,6 +38,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#include <deque>
|
#include <deque>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <vector>
|
||||||
#include "port/port.h"
|
#include "port/port.h"
|
||||||
#include "rocksdb/slice.h"
|
#include "rocksdb/slice.h"
|
||||||
#include "util/coding.h"
|
#include "util/coding.h"
|
||||||
@ -250,31 +251,9 @@ class PosixEnv : public Env {
|
|||||||
result->reset();
|
result->reset();
|
||||||
Status s;
|
Status s;
|
||||||
int fd = -1;
|
int fd = -1;
|
||||||
do {
|
int flags = O_CREAT | O_TRUNC;
|
||||||
IOSTATS_TIMER_GUARD(open_nanos);
|
// Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
|
||||||
fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
|
if (options.use_direct_writes && !options.use_mmap_writes) {
|
||||||
} while (fd < 0 && errno == EINTR);
|
|
||||||
if (fd < 0) {
|
|
||||||
s = IOError(fname, errno);
|
|
||||||
} else {
|
|
||||||
SetFD_CLOEXEC(fd, &options);
|
|
||||||
if (options.use_mmap_writes) {
|
|
||||||
if (!checkedDiskForMmap_) {
|
|
||||||
// this will be executed once in the program's lifetime.
|
|
||||||
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
|
||||||
if (!SupportsFastAllocate(fname)) {
|
|
||||||
forceMmapOff = true;
|
|
||||||
}
|
|
||||||
checkedDiskForMmap_ = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (options.use_mmap_writes && !forceMmapOff) {
|
|
||||||
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
|
|
||||||
} else if (options.use_direct_writes) {
|
|
||||||
close(fd);
|
|
||||||
#ifdef OS_MACOSX
|
|
||||||
int flags = O_WRONLY | O_APPEND | O_TRUNC | O_CREAT;
|
|
||||||
#else
|
|
||||||
// Note: we should avoid O_APPEND here due to ta the following bug:
|
// Note: we should avoid O_APPEND here due to ta the following bug:
|
||||||
// POSIX requires that opening a file with the O_APPEND flag should
|
// POSIX requires that opening a file with the O_APPEND flag should
|
||||||
// have no affect on the location at which pwrite() writes data.
|
// have no affect on the location at which pwrite() writes data.
|
||||||
@ -282,32 +261,56 @@ class PosixEnv : public Env {
|
|||||||
// appends data to the end of the file, regardless of the value of
|
// appends data to the end of the file, regardless of the value of
|
||||||
// offset.
|
// offset.
|
||||||
// More info here: https://linux.die.net/man/2/pwrite
|
// More info here: https://linux.die.net/man/2/pwrite
|
||||||
int flags = O_WRONLY | O_TRUNC | O_CREAT | O_DIRECT;
|
flags |= O_WRONLY;
|
||||||
|
#ifndef OS_MACOSX
|
||||||
|
flags |= O_DIRECT;
|
||||||
#endif
|
#endif
|
||||||
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
|
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
|
||||||
|
} else if (options.use_mmap_writes) {
|
||||||
|
// non-direct I/O
|
||||||
|
flags |= O_RDWR;
|
||||||
|
} else {
|
||||||
|
flags |= O_WRONLY;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
IOSTATS_TIMER_GUARD(open_nanos);
|
||||||
fd = open(fname.c_str(), flags, 0644);
|
fd = open(fname.c_str(), flags, 0644);
|
||||||
|
} while (fd < 0 && errno == EINTR);
|
||||||
|
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
s = IOError(fname, errno);
|
s = IOError(fname, errno);
|
||||||
} else {
|
return s;
|
||||||
std::unique_ptr<PosixDirectIOWritableFile> file(
|
}
|
||||||
new PosixDirectIOWritableFile(fname, fd));
|
SetFD_CLOEXEC(fd, &options);
|
||||||
*result = std::move(file);
|
|
||||||
s = Status::OK();
|
if (options.use_mmap_writes) {
|
||||||
|
if (!checkedDiskForMmap_) {
|
||||||
|
// this will be executed once in the program's lifetime.
|
||||||
|
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
||||||
|
if (!SupportsFastAllocate(fname)) {
|
||||||
|
forceMmapOff_ = true;
|
||||||
|
}
|
||||||
|
checkedDiskForMmap_ = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (options.use_mmap_writes && !forceMmapOff_) {
|
||||||
|
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
|
||||||
|
} else if (options.use_direct_writes && !options.use_mmap_writes) {
|
||||||
#ifdef OS_MACOSX
|
#ifdef OS_MACOSX
|
||||||
if (fcntl(fd, F_NOCACHE, 1) == -1) {
|
if (fcntl(fd, F_NOCACHE, 1) == -1) {
|
||||||
close(fd);
|
close(fd);
|
||||||
s = IOError(fname, errno);
|
s = IOError(fname, errno);
|
||||||
|
return s;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
result->reset(new PosixWritableFile(fname, fd, options));
|
||||||
} else {
|
} else {
|
||||||
// disable mmap writes
|
// disable mmap writes
|
||||||
EnvOptions no_mmap_writes_options = options;
|
EnvOptions no_mmap_writes_options = options;
|
||||||
no_mmap_writes_options.use_mmap_writes = false;
|
no_mmap_writes_options.use_mmap_writes = false;
|
||||||
|
|
||||||
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
|
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -318,40 +321,68 @@ class PosixEnv : public Env {
|
|||||||
result->reset();
|
result->reset();
|
||||||
Status s;
|
Status s;
|
||||||
int fd = -1;
|
int fd = -1;
|
||||||
|
|
||||||
|
int flags = 0;
|
||||||
|
// Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
|
||||||
|
if (options.use_direct_writes && !options.use_mmap_writes) {
|
||||||
|
flags |= O_WRONLY;
|
||||||
|
#ifndef OS_MACOSX
|
||||||
|
flags |= O_DIRECT;
|
||||||
|
#endif
|
||||||
|
TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
|
||||||
|
} else if (options.use_mmap_writes) {
|
||||||
|
// mmap needs O_RDWR mode
|
||||||
|
flags |= O_RDWR;
|
||||||
|
} else {
|
||||||
|
flags |= O_WRONLY;
|
||||||
|
}
|
||||||
|
|
||||||
do {
|
do {
|
||||||
IOSTATS_TIMER_GUARD(open_nanos);
|
IOSTATS_TIMER_GUARD(open_nanos);
|
||||||
fd = open(old_fname.c_str(), O_RDWR, 0644);
|
fd = open(old_fname.c_str(), flags, 0644);
|
||||||
} while (fd < 0 && errno == EINTR);
|
} while (fd < 0 && errno == EINTR);
|
||||||
if (fd < 0) {
|
if (fd < 0) {
|
||||||
s = IOError(fname, errno);
|
s = IOError(fname, errno);
|
||||||
} else {
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
SetFD_CLOEXEC(fd, &options);
|
SetFD_CLOEXEC(fd, &options);
|
||||||
// rename into place
|
// rename into place
|
||||||
if (rename(old_fname.c_str(), fname.c_str()) != 0) {
|
if (rename(old_fname.c_str(), fname.c_str()) != 0) {
|
||||||
Status r = IOError(old_fname, errno);
|
s = IOError(old_fname, errno);
|
||||||
close(fd);
|
close(fd);
|
||||||
return r;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (options.use_mmap_writes) {
|
if (options.use_mmap_writes) {
|
||||||
if (!checkedDiskForMmap_) {
|
if (!checkedDiskForMmap_) {
|
||||||
// this will be executed once in the program's lifetime.
|
// this will be executed once in the program's lifetime.
|
||||||
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
|
||||||
if (!SupportsFastAllocate(fname)) {
|
if (!SupportsFastAllocate(fname)) {
|
||||||
forceMmapOff = true;
|
forceMmapOff_ = true;
|
||||||
}
|
}
|
||||||
checkedDiskForMmap_ = true;
|
checkedDiskForMmap_ = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (options.use_mmap_writes && !forceMmapOff) {
|
if (options.use_mmap_writes && !forceMmapOff_) {
|
||||||
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
|
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
|
||||||
|
} else if (options.use_direct_writes && !options.use_mmap_writes) {
|
||||||
|
#ifdef OS_MACOSX
|
||||||
|
if (fcntl(fd, F_NOCACHE, 1) == -1) {
|
||||||
|
close(fd);
|
||||||
|
s = IOError(fname, errno);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
result->reset(new PosixWritableFile(fname, fd, options));
|
||||||
} else {
|
} else {
|
||||||
// disable mmap writes
|
// disable mmap writes
|
||||||
EnvOptions no_mmap_writes_options = options;
|
EnvOptions no_mmap_writes_options = options;
|
||||||
no_mmap_writes_options.use_mmap_writes = false;
|
no_mmap_writes_options.use_mmap_writes = false;
|
||||||
|
|
||||||
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
|
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
|
||||||
}
|
}
|
||||||
}
|
return s;
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -724,6 +755,7 @@ class PosixEnv : public Env {
|
|||||||
const DBOptions& db_options) const override {
|
const DBOptions& db_options) const override {
|
||||||
EnvOptions optimized = env_options;
|
EnvOptions optimized = env_options;
|
||||||
optimized.use_mmap_writes = false;
|
optimized.use_mmap_writes = false;
|
||||||
|
optimized.use_direct_writes = false;
|
||||||
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
|
optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
|
||||||
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
|
// TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
|
||||||
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
|
// breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
|
||||||
@ -736,14 +768,14 @@ class PosixEnv : public Env {
|
|||||||
const EnvOptions& env_options) const override {
|
const EnvOptions& env_options) const override {
|
||||||
EnvOptions optimized = env_options;
|
EnvOptions optimized = env_options;
|
||||||
optimized.use_mmap_writes = false;
|
optimized.use_mmap_writes = false;
|
||||||
|
optimized.use_direct_writes = false;
|
||||||
optimized.fallocate_with_keep_size = true;
|
optimized.fallocate_with_keep_size = true;
|
||||||
return optimized;
|
return optimized;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool checkedDiskForMmap_;
|
bool checkedDiskForMmap_;
|
||||||
bool forceMmapOff; // do we override Env options?
|
bool forceMmapOff_; // do we override Env options?
|
||||||
|
|
||||||
|
|
||||||
// Returns true iff the named directory exists and is a directory.
|
// Returns true iff the named directory exists and is a directory.
|
||||||
virtual bool DirExists(const std::string& dname) {
|
virtual bool DirExists(const std::string& dname) {
|
||||||
@ -784,7 +816,7 @@ class PosixEnv : public Env {
|
|||||||
|
|
||||||
PosixEnv::PosixEnv()
|
PosixEnv::PosixEnv()
|
||||||
: checkedDiskForMmap_(false),
|
: checkedDiskForMmap_(false),
|
||||||
forceMmapOff(false),
|
forceMmapOff_(false),
|
||||||
page_size_(getpagesize()),
|
page_size_(getpagesize()),
|
||||||
thread_pools_(Priority::TOTAL) {
|
thread_pools_(Priority::TOTAL) {
|
||||||
ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
|
ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
|
||||||
|
@ -61,9 +61,8 @@ Status WritableFileWriter::Append(const Slice& data) {
|
|||||||
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
|
writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flush only when I/O is buffered
|
// Flush only when buffered I/O
|
||||||
if (use_os_buffer_ &&
|
if (!direct_io_ && (buf_.Capacity() - buf_.CurrentSize()) < left) {
|
||||||
(buf_.Capacity() - buf_.CurrentSize()) < left) {
|
|
||||||
if (buf_.CurrentSize() > 0) {
|
if (buf_.CurrentSize() > 0) {
|
||||||
s = Flush();
|
s = Flush();
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
@ -79,10 +78,10 @@ Status WritableFileWriter::Append(const Slice& data) {
|
|||||||
assert(buf_.CurrentSize() == 0);
|
assert(buf_.CurrentSize() == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We never write directly to disk with unbuffered I/O on.
|
// We never write directly to disk with direct I/O on.
|
||||||
// or we simply use it for its original purpose to accumulate many small
|
// or we simply use it for its original purpose to accumulate many small
|
||||||
// chunks
|
// chunks
|
||||||
if (!use_os_buffer_ || (buf_.Capacity() >= left)) {
|
if (direct_io_ || (buf_.Capacity() >= left)) {
|
||||||
while (left > 0) {
|
while (left > 0) {
|
||||||
size_t appended = buf_.Append(src, left);
|
size_t appended = buf_.Append(src, left);
|
||||||
left -= appended;
|
left -= appended;
|
||||||
@ -96,7 +95,7 @@ Status WritableFileWriter::Append(const Slice& data) {
|
|||||||
|
|
||||||
// We double the buffer here because
|
// We double the buffer here because
|
||||||
// Flush calls do not keep up with the incoming bytes
|
// Flush calls do not keep up with the incoming bytes
|
||||||
// This is the only place when buffer is changed with unbuffered I/O
|
// This is the only place when buffer is changed with direct I/O
|
||||||
if (buf_.Capacity() < max_buffer_size_) {
|
if (buf_.Capacity() < max_buffer_size_) {
|
||||||
size_t desiredCapacity = buf_.Capacity() * 2;
|
size_t desiredCapacity = buf_.Capacity() * 2;
|
||||||
desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
|
desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
|
||||||
@ -132,7 +131,7 @@ Status WritableFileWriter::Close() {
|
|||||||
|
|
||||||
s = Flush(); // flush cache to OS
|
s = Flush(); // flush cache to OS
|
||||||
|
|
||||||
// In unbuffered mode we write whole pages so
|
// In direct I/O mode we write whole pages so
|
||||||
// we need to let the file know where data ends.
|
// we need to let the file know where data ends.
|
||||||
Status interim = writable_file_->Truncate(filesize_);
|
Status interim = writable_file_->Truncate(filesize_);
|
||||||
if (!interim.ok() && s.ok()) {
|
if (!interim.ok() && s.ok()) {
|
||||||
@ -151,17 +150,18 @@ Status WritableFileWriter::Close() {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
// write out the cached data to the OS cache
|
// write out the cached data to the OS cache or storage if direct I/O
|
||||||
|
// enabled
|
||||||
Status WritableFileWriter::Flush() {
|
Status WritableFileWriter::Flush() {
|
||||||
Status s;
|
Status s;
|
||||||
TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
|
TEST_KILL_RANDOM("WritableFileWriter::Flush:0",
|
||||||
rocksdb_kill_odds * REDUCE_ODDS2);
|
rocksdb_kill_odds * REDUCE_ODDS2);
|
||||||
|
|
||||||
if (buf_.CurrentSize() > 0) {
|
if (buf_.CurrentSize() > 0) {
|
||||||
if (use_os_buffer_) {
|
if (direct_io_) {
|
||||||
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
|
s = WriteDirect();
|
||||||
} else {
|
} else {
|
||||||
s = WriteUnbuffered();
|
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
|
||||||
}
|
}
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
@ -259,8 +259,8 @@ size_t WritableFileWriter::RequestToken(size_t bytes, bool align) {
|
|||||||
|
|
||||||
if (align) {
|
if (align) {
|
||||||
// Here we may actually require more than burst and block
|
// Here we may actually require more than burst and block
|
||||||
// but we can not write less than one page at a time on unbuffered
|
// but we can not write less than one page at a time on direct I/O
|
||||||
// thus we may want not to use ratelimiter s
|
// thus we may want not to use ratelimiter
|
||||||
size_t alignment = buf_.Alignment();
|
size_t alignment = buf_.Alignment();
|
||||||
bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
|
bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
|
||||||
}
|
}
|
||||||
@ -273,7 +273,7 @@ size_t WritableFileWriter::RequestToken(size_t bytes, bool align) {
|
|||||||
// limiter if available
|
// limiter if available
|
||||||
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
||||||
Status s;
|
Status s;
|
||||||
assert(use_os_buffer_);
|
assert(!direct_io_);
|
||||||
const char* src = data;
|
const char* src = data;
|
||||||
size_t left = size;
|
size_t left = size;
|
||||||
|
|
||||||
@ -308,10 +308,10 @@ Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
|
|||||||
// whole number of pages to be written again on the next flush because we can
|
// whole number of pages to be written again on the next flush because we can
|
||||||
// only write on aligned
|
// only write on aligned
|
||||||
// offsets.
|
// offsets.
|
||||||
Status WritableFileWriter::WriteUnbuffered() {
|
Status WritableFileWriter::WriteDirect() {
|
||||||
Status s;
|
Status s;
|
||||||
|
|
||||||
assert(!use_os_buffer_);
|
assert(direct_io_);
|
||||||
const size_t alignment = buf_.Alignment();
|
const size_t alignment = buf_.Alignment();
|
||||||
assert((next_write_offset_ % alignment) == 0);
|
assert((next_write_offset_ % alignment) == 0);
|
||||||
|
|
||||||
@ -339,7 +339,7 @@ Status WritableFileWriter::WriteUnbuffered() {
|
|||||||
{
|
{
|
||||||
IOSTATS_TIMER_GUARD(write_nanos);
|
IOSTATS_TIMER_GUARD(write_nanos);
|
||||||
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
|
||||||
// Unbuffered writes must be positional
|
// direct writes must be positional
|
||||||
s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
|
s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
buf_.Size(file_advance + leftover_tail);
|
buf_.Size(file_advance + leftover_tail);
|
||||||
|
@ -103,7 +103,6 @@ class WritableFileWriter {
|
|||||||
uint64_t next_write_offset_;
|
uint64_t next_write_offset_;
|
||||||
bool pending_sync_;
|
bool pending_sync_;
|
||||||
const bool direct_io_;
|
const bool direct_io_;
|
||||||
const bool use_os_buffer_;
|
|
||||||
uint64_t last_sync_size_;
|
uint64_t last_sync_size_;
|
||||||
uint64_t bytes_per_sync_;
|
uint64_t bytes_per_sync_;
|
||||||
RateLimiter* rate_limiter_;
|
RateLimiter* rate_limiter_;
|
||||||
@ -118,7 +117,6 @@ class WritableFileWriter {
|
|||||||
next_write_offset_(0),
|
next_write_offset_(0),
|
||||||
pending_sync_(false),
|
pending_sync_(false),
|
||||||
direct_io_(writable_file_->UseDirectIO()),
|
direct_io_(writable_file_->UseDirectIO()),
|
||||||
use_os_buffer_(writable_file_->UseOSBuffer()),
|
|
||||||
last_sync_size_(0),
|
last_sync_size_(0),
|
||||||
bytes_per_sync_(options.bytes_per_sync),
|
bytes_per_sync_(options.bytes_per_sync),
|
||||||
rate_limiter_(options.rate_limiter) {
|
rate_limiter_(options.rate_limiter) {
|
||||||
@ -156,8 +154,8 @@ class WritableFileWriter {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Used when os buffering is OFF and we are writing
|
// Used when os buffering is OFF and we are writing
|
||||||
// DMA such as in Windows unbuffered mode
|
// DMA such as in Direct I/O mode
|
||||||
Status WriteUnbuffered();
|
Status WriteDirect();
|
||||||
// Normal write
|
// Normal write
|
||||||
Status WriteBuffered(const char* data, size_t size);
|
Status WriteBuffered(const char* data, size_t size);
|
||||||
Status RangeSync(uint64_t offset, uint64_t nbytes);
|
Status RangeSync(uint64_t offset, uint64_t nbytes);
|
||||||
|
@ -88,9 +88,9 @@ TEST_F(WritableFileWriterTest, RangeSync) {
|
|||||||
TEST_F(WritableFileWriterTest, AppendStatusReturn) {
|
TEST_F(WritableFileWriterTest, AppendStatusReturn) {
|
||||||
class FakeWF : public WritableFile {
|
class FakeWF : public WritableFile {
|
||||||
public:
|
public:
|
||||||
explicit FakeWF() : use_os_buffer_(true), io_error_(false) {}
|
explicit FakeWF() : use_direct_io_(false), io_error_(false) {}
|
||||||
|
|
||||||
virtual bool UseOSBuffer() const override { return use_os_buffer_; }
|
virtual bool UseDirectIO() const override { return use_direct_io_; }
|
||||||
Status Append(const Slice& data) override {
|
Status Append(const Slice& data) override {
|
||||||
if (io_error_) {
|
if (io_error_) {
|
||||||
return Status::IOError("Fake IO error");
|
return Status::IOError("Fake IO error");
|
||||||
@ -106,15 +106,15 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) {
|
|||||||
Status Close() override { return Status::OK(); }
|
Status Close() override { return Status::OK(); }
|
||||||
Status Flush() override { return Status::OK(); }
|
Status Flush() override { return Status::OK(); }
|
||||||
Status Sync() override { return Status::OK(); }
|
Status Sync() override { return Status::OK(); }
|
||||||
void SetUseOSBuffer(bool val) { use_os_buffer_ = val; }
|
void SetUseDirectIO(bool val) { use_direct_io_ = val; }
|
||||||
void SetIOError(bool val) { io_error_ = val; }
|
void SetIOError(bool val) { io_error_ = val; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool use_os_buffer_;
|
bool use_direct_io_;
|
||||||
bool io_error_;
|
bool io_error_;
|
||||||
};
|
};
|
||||||
unique_ptr<FakeWF> wf(new FakeWF());
|
unique_ptr<FakeWF> wf(new FakeWF());
|
||||||
wf->SetUseOSBuffer(false);
|
wf->SetUseDirectIO(true);
|
||||||
unique_ptr<WritableFileWriter> writer(
|
unique_ptr<WritableFileWriter> writer(
|
||||||
new WritableFileWriter(std::move(wf), EnvOptions()));
|
new WritableFileWriter(std::move(wf), EnvOptions()));
|
||||||
|
|
||||||
|
@ -165,7 +165,7 @@ PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* f,
|
|||||||
: filename_(fname),
|
: filename_(fname),
|
||||||
file_(f),
|
file_(f),
|
||||||
fd_(fileno(f)),
|
fd_(fileno(f)),
|
||||||
use_os_buffer_(options.use_os_buffer) {}
|
use_direct_io_(options.use_direct_reads) {}
|
||||||
|
|
||||||
PosixSequentialFile::~PosixSequentialFile() { fclose(file_); }
|
PosixSequentialFile::~PosixSequentialFile() { fclose(file_); }
|
||||||
|
|
||||||
@ -187,7 +187,7 @@ Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) {
|
|||||||
s = IOError(filename_, errno);
|
s = IOError(filename_, errno);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!use_os_buffer_) {
|
if (use_direct_io_) {
|
||||||
// we need to fadvise away the entire range of pages because
|
// we need to fadvise away the entire range of pages because
|
||||||
// we do not want readahead pages to be cached.
|
// we do not want readahead pages to be cached.
|
||||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||||
@ -294,7 +294,7 @@ size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
|
|||||||
*/
|
*/
|
||||||
PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
|
PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
|
||||||
const EnvOptions& options)
|
const EnvOptions& options)
|
||||||
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
|
: filename_(fname), fd_(fd), use_direct_io_(options.use_direct_reads) {
|
||||||
assert(!options.use_mmap_reads || sizeof(void*) < 8);
|
assert(!options.use_mmap_reads || sizeof(void*) < 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,7 +325,8 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
|
|||||||
// An error: return a non-ok status
|
// An error: return a non-ok status
|
||||||
s = IOError(filename_, errno);
|
s = IOError(filename_, errno);
|
||||||
}
|
}
|
||||||
if (!use_os_buffer_) {
|
|
||||||
|
if (use_direct_io_) {
|
||||||
// we need to fadvise away the entire range of pages because
|
// we need to fadvise away the entire range of pages because
|
||||||
// we do not want readahead pages to be cached.
|
// we do not want readahead pages to be cached.
|
||||||
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
|
||||||
@ -397,7 +398,7 @@ PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
|
|||||||
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
|
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
|
||||||
fd_ = fd_ + 0; // suppress the warning for used variables
|
fd_ = fd_ + 0; // suppress the warning for used variables
|
||||||
assert(options.use_mmap_reads);
|
assert(options.use_mmap_reads);
|
||||||
assert(options.use_os_buffer);
|
assert(!options.use_direct_reads);
|
||||||
}
|
}
|
||||||
|
|
||||||
PosixMmapReadableFile::~PosixMmapReadableFile() {
|
PosixMmapReadableFile::~PosixMmapReadableFile() {
|
||||||
@ -533,6 +534,7 @@ PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
|
|||||||
#endif
|
#endif
|
||||||
assert((page_size & (page_size - 1)) == 0);
|
assert((page_size & (page_size - 1)) == 0);
|
||||||
assert(options.use_mmap_writes);
|
assert(options.use_mmap_writes);
|
||||||
|
assert(!options.use_direct_writes);
|
||||||
}
|
}
|
||||||
|
|
||||||
PosixMmapFile::~PosixMmapFile() {
|
PosixMmapFile::~PosixMmapFile() {
|
||||||
@ -665,7 +667,10 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
|
|||||||
*/
|
*/
|
||||||
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
|
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
|
||||||
const EnvOptions& options)
|
const EnvOptions& options)
|
||||||
: filename_(fname), fd_(fd), filesize_(0) {
|
: filename_(fname),
|
||||||
|
direct_io_(options.use_direct_writes),
|
||||||
|
fd_(fd),
|
||||||
|
filesize_(0) {
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
allow_fallocate_ = options.allow_fallocate;
|
allow_fallocate_ = options.allow_fallocate;
|
||||||
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
|
||||||
@ -680,6 +685,7 @@ PosixWritableFile::~PosixWritableFile() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status PosixWritableFile::Append(const Slice& data) {
|
Status PosixWritableFile::Append(const Slice& data) {
|
||||||
|
assert(!direct_io_|| (IsSectorAligned(data.size()) && IsPageAligned(data.data())));
|
||||||
const char* src = data.data();
|
const char* src = data.data();
|
||||||
size_t left = data.size();
|
size_t left = data.size();
|
||||||
while (left != 0) {
|
while (left != 0) {
|
||||||
@ -698,6 +704,8 @@ Status PosixWritableFile::Append(const Slice& data) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
|
Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
|
||||||
|
assert(direct_io_ && IsSectorAligned(offset) &&
|
||||||
|
IsSectorAligned(data.size()) && IsPageAligned(data.data()));
|
||||||
assert(offset <= std::numeric_limits<off_t>::max());
|
assert(offset <= std::numeric_limits<off_t>::max());
|
||||||
const char* src = data.data();
|
const char* src = data.data();
|
||||||
size_t left = data.size();
|
size_t left = data.size();
|
||||||
@ -713,7 +721,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
|
|||||||
offset += done;
|
offset += done;
|
||||||
src += done;
|
src += done;
|
||||||
}
|
}
|
||||||
filesize_ = offset + data.size();
|
filesize_ = offset;
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -778,6 +786,9 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
|
|||||||
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
|
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
|
||||||
|
|
||||||
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
|
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
|
||||||
|
if (direct_io_) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
#ifndef OS_LINUX
|
#ifndef OS_LINUX
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
#else
|
#else
|
||||||
@ -825,29 +836,6 @@ size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
* PosixDirectIOWritableFile
|
|
||||||
*/
|
|
||||||
Status PosixDirectIOWritableFile::Append(const Slice& data) {
|
|
||||||
assert(IsSectorAligned(data.size()) && IsPageAligned(data.data()));
|
|
||||||
if (!IsSectorAligned(data.size()) || !IsPageAligned(data.data())) {
|
|
||||||
return Status::IOError("Unaligned buffer for direct IO");
|
|
||||||
}
|
|
||||||
return PosixWritableFile::Append(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
Status PosixDirectIOWritableFile::PositionedAppend(const Slice& data,
|
|
||||||
uint64_t offset) {
|
|
||||||
assert(IsSectorAligned(offset));
|
|
||||||
assert(IsSectorAligned(data.size()));
|
|
||||||
assert(IsPageAligned(data.data()));
|
|
||||||
if (!IsSectorAligned(offset) || !IsSectorAligned(data.size()) ||
|
|
||||||
!IsPageAligned(data.data())) {
|
|
||||||
return Status::IOError("offset or size is not aligned");
|
|
||||||
}
|
|
||||||
return PosixWritableFile::PositionedAppend(data, offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* PosixRandomRWFile
|
* PosixRandomRWFile
|
||||||
*/
|
*/
|
||||||
|
@ -41,7 +41,7 @@ class PosixSequentialFile : public SequentialFile {
|
|||||||
std::string filename_;
|
std::string filename_;
|
||||||
FILE* file_;
|
FILE* file_;
|
||||||
int fd_;
|
int fd_;
|
||||||
bool use_os_buffer_;
|
bool use_direct_io_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PosixSequentialFile(const std::string& fname, FILE* f,
|
PosixSequentialFile(const std::string& fname, FILE* f,
|
||||||
@ -74,7 +74,7 @@ class PosixRandomAccessFile : public RandomAccessFile {
|
|||||||
protected:
|
protected:
|
||||||
std::string filename_;
|
std::string filename_;
|
||||||
int fd_;
|
int fd_;
|
||||||
bool use_os_buffer_;
|
bool use_direct_io_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
PosixRandomAccessFile(const std::string& fname, int fd,
|
PosixRandomAccessFile(const std::string& fname, int fd,
|
||||||
@ -108,6 +108,7 @@ class PosixDirectIORandomAccessFile : public PosixRandomAccessFile {
|
|||||||
class PosixWritableFile : public WritableFile {
|
class PosixWritableFile : public WritableFile {
|
||||||
protected:
|
protected:
|
||||||
const std::string filename_;
|
const std::string filename_;
|
||||||
|
const bool direct_io_;
|
||||||
int fd_;
|
int fd_;
|
||||||
uint64_t filesize_;
|
uint64_t filesize_;
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
@ -130,7 +131,13 @@ class PosixWritableFile : public WritableFile {
|
|||||||
virtual Status Sync() override;
|
virtual Status Sync() override;
|
||||||
virtual Status Fsync() override;
|
virtual Status Fsync() override;
|
||||||
virtual bool IsSyncThreadSafe() const override;
|
virtual bool IsSyncThreadSafe() const override;
|
||||||
|
virtual bool UseDirectIO() const override { return direct_io_; }
|
||||||
virtual uint64_t GetFileSize() override;
|
virtual uint64_t GetFileSize() override;
|
||||||
|
virtual size_t GetRequiredBufferAlignment() const override {
|
||||||
|
// TODO(gzh): It should be the logical sector size/filesystem block size
|
||||||
|
// hardcoded as 4k for most cases
|
||||||
|
return 4 * 1024;
|
||||||
|
}
|
||||||
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
virtual Status InvalidateCache(size_t offset, size_t length) override;
|
||||||
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
#ifdef ROCKSDB_FALLOCATE_PRESENT
|
||||||
virtual Status Allocate(uint64_t offset, uint64_t len) override;
|
virtual Status Allocate(uint64_t offset, uint64_t len) override;
|
||||||
@ -139,22 +146,7 @@ class PosixWritableFile : public WritableFile {
|
|||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
class PosixDirectIOWritableFile : public PosixWritableFile {
|
// mmap() based random-access
|
||||||
public:
|
|
||||||
explicit PosixDirectIOWritableFile(const std::string& filename, int fd)
|
|
||||||
: PosixWritableFile(filename, fd, EnvOptions()) {}
|
|
||||||
virtual ~PosixDirectIOWritableFile() {}
|
|
||||||
|
|
||||||
bool UseOSBuffer() const override { return false; }
|
|
||||||
size_t GetRequiredBufferAlignment() const override { return 4 * 1024; }
|
|
||||||
Status Append(const Slice& data) override;
|
|
||||||
Status PositionedAppend(const Slice& data, uint64_t offset) override;
|
|
||||||
bool UseDirectIO() const override { return true; }
|
|
||||||
Status InvalidateCache(size_t offset, size_t length) override {
|
|
||||||
return Status::OK();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class PosixMmapReadableFile : public RandomAccessFile {
|
class PosixMmapReadableFile : public RandomAccessFile {
|
||||||
private:
|
private:
|
||||||
int fd_;
|
int fd_;
|
||||||
|
@ -34,8 +34,7 @@ namespace rocksdb {
|
|||||||
void RunBenchmark() {
|
void RunBenchmark() {
|
||||||
std::string file_name = test::TmpDir() + "/log_write_benchmark.log";
|
std::string file_name = test::TmpDir() + "/log_write_benchmark.log";
|
||||||
Env* env = Env::Default();
|
Env* env = Env::Default();
|
||||||
EnvOptions env_options;
|
EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions());
|
||||||
env_options.use_mmap_writes = false;
|
|
||||||
env_options.bytes_per_sync = FLAGS_bytes_per_sync;
|
env_options.bytes_per_sync = FLAGS_bytes_per_sync;
|
||||||
unique_ptr<WritableFile> file;
|
unique_ptr<WritableFile> file;
|
||||||
env->NewWritableFile(file_name, &file, env_options);
|
env->NewWritableFile(file_name, &file, env_options);
|
||||||
|
@ -221,9 +221,7 @@ class RandomAccessFileImpl : public RandomAccessFile {
|
|||||||
|
|
||||||
class WritableFileImpl : public WritableFile {
|
class WritableFileImpl : public WritableFile {
|
||||||
public:
|
public:
|
||||||
WritableFileImpl(FileState* file) : file_(file) {
|
explicit WritableFileImpl(FileState* file) : file_(file) { file_->Ref(); }
|
||||||
file_->Ref();
|
|
||||||
}
|
|
||||||
|
|
||||||
~WritableFileImpl() {
|
~WritableFileImpl() {
|
||||||
file_->Unref();
|
file_->Unref();
|
||||||
|
@ -198,10 +198,10 @@ DBOptions::DBOptions()
|
|||||||
WAL_ttl_seconds(0),
|
WAL_ttl_seconds(0),
|
||||||
WAL_size_limit_MB(0),
|
WAL_size_limit_MB(0),
|
||||||
manifest_preallocation_size(4 * 1024 * 1024),
|
manifest_preallocation_size(4 * 1024 * 1024),
|
||||||
allow_os_buffer(true),
|
|
||||||
allow_mmap_reads(false),
|
allow_mmap_reads(false),
|
||||||
allow_mmap_writes(false),
|
allow_mmap_writes(false),
|
||||||
use_direct_reads(false),
|
use_direct_reads(false),
|
||||||
|
use_direct_writes(false),
|
||||||
allow_fallocate(true),
|
allow_fallocate(true),
|
||||||
is_fd_close_on_exec(true),
|
is_fd_close_on_exec(true),
|
||||||
skip_log_error_on_recovery(false),
|
skip_log_error_on_recovery(false),
|
||||||
@ -269,10 +269,10 @@ DBOptions::DBOptions(const Options& options)
|
|||||||
WAL_ttl_seconds(options.WAL_ttl_seconds),
|
WAL_ttl_seconds(options.WAL_ttl_seconds),
|
||||||
WAL_size_limit_MB(options.WAL_size_limit_MB),
|
WAL_size_limit_MB(options.WAL_size_limit_MB),
|
||||||
manifest_preallocation_size(options.manifest_preallocation_size),
|
manifest_preallocation_size(options.manifest_preallocation_size),
|
||||||
allow_os_buffer(options.allow_os_buffer),
|
|
||||||
allow_mmap_reads(options.allow_mmap_reads),
|
allow_mmap_reads(options.allow_mmap_reads),
|
||||||
allow_mmap_writes(options.allow_mmap_writes),
|
allow_mmap_writes(options.allow_mmap_writes),
|
||||||
use_direct_reads(options.use_direct_reads),
|
use_direct_reads(options.use_direct_reads),
|
||||||
|
use_direct_writes(options.use_direct_writes),
|
||||||
allow_fallocate(options.allow_fallocate),
|
allow_fallocate(options.allow_fallocate),
|
||||||
is_fd_close_on_exec(options.is_fd_close_on_exec),
|
is_fd_close_on_exec(options.is_fd_close_on_exec),
|
||||||
skip_log_error_on_recovery(options.skip_log_error_on_recovery),
|
skip_log_error_on_recovery(options.skip_log_error_on_recovery),
|
||||||
@ -336,11 +336,11 @@ void DBOptions::Dump(Logger* log) const {
|
|||||||
keep_log_file_num);
|
keep_log_file_num);
|
||||||
Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
|
Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
|
||||||
recycle_log_file_num);
|
recycle_log_file_num);
|
||||||
Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
|
|
||||||
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
|
|
||||||
Header(log, " Options.allow_fallocate: %d", allow_fallocate);
|
Header(log, " Options.allow_fallocate: %d", allow_fallocate);
|
||||||
|
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
|
||||||
Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
|
Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
|
||||||
Header(log, " Options.use_direct_reads: %d", use_direct_reads);
|
Header(log, " Options.use_direct_reads: %d", use_direct_reads);
|
||||||
|
Header(log, " Options.use_direct_writes: %d", use_direct_writes);
|
||||||
Header(log, " Options.create_missing_column_families: %d",
|
Header(log, " Options.create_missing_column_families: %d",
|
||||||
create_missing_column_families);
|
create_missing_column_families);
|
||||||
Header(log, " Options.db_log_dir: %s",
|
Header(log, " Options.db_log_dir: %s",
|
||||||
@ -366,12 +366,6 @@ void DBOptions::Dump(Logger* log) const {
|
|||||||
Header(log,
|
Header(log,
|
||||||
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
|
" Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
|
||||||
manifest_preallocation_size);
|
manifest_preallocation_size);
|
||||||
Header(log, " Options.allow_os_buffer: %d",
|
|
||||||
allow_os_buffer);
|
|
||||||
Header(log, " Options.allow_mmap_reads: %d",
|
|
||||||
allow_mmap_reads);
|
|
||||||
Header(log, " Options.allow_mmap_writes: %d",
|
|
||||||
allow_mmap_writes);
|
|
||||||
Header(log, " Options.is_fd_close_on_exec: %d",
|
Header(log, " Options.is_fd_close_on_exec: %d",
|
||||||
is_fd_close_on_exec);
|
is_fd_close_on_exec);
|
||||||
Header(log, " Options.stats_dump_period_sec: %u",
|
Header(log, " Options.stats_dump_period_sec: %u",
|
||||||
|
@ -71,10 +71,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
|
|||||||
options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb;
|
options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb;
|
||||||
options.manifest_preallocation_size =
|
options.manifest_preallocation_size =
|
||||||
immutable_db_options.manifest_preallocation_size;
|
immutable_db_options.manifest_preallocation_size;
|
||||||
options.allow_os_buffer = immutable_db_options.allow_os_buffer;
|
|
||||||
options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
|
options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
|
||||||
options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
|
options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
|
||||||
options.use_direct_reads = immutable_db_options.use_direct_reads;
|
options.use_direct_reads = immutable_db_options.use_direct_reads;
|
||||||
|
options.use_direct_writes = immutable_db_options.use_direct_writes;
|
||||||
options.allow_fallocate = immutable_db_options.allow_fallocate;
|
options.allow_fallocate = immutable_db_options.allow_fallocate;
|
||||||
options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec;
|
options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec;
|
||||||
options.stats_dump_period_sec = immutable_db_options.stats_dump_period_sec;
|
options.stats_dump_period_sec = immutable_db_options.stats_dump_period_sec;
|
||||||
|
@ -186,12 +186,14 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
|
|||||||
{"use_direct_reads",
|
{"use_direct_reads",
|
||||||
{offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean,
|
{offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean,
|
||||||
OptionVerificationType::kNormal, false, 0}},
|
OptionVerificationType::kNormal, false, 0}},
|
||||||
|
{"use_direct_writes",
|
||||||
|
{offsetof(struct DBOptions, use_direct_writes), OptionType::kBoolean,
|
||||||
|
OptionVerificationType::kNormal, false, 0}},
|
||||||
{"allow_2pc",
|
{"allow_2pc",
|
||||||
{offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean,
|
{offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean,
|
||||||
OptionVerificationType::kNormal, false, 0}},
|
OptionVerificationType::kNormal, false, 0}},
|
||||||
{"allow_os_buffer",
|
{"allow_os_buffer",
|
||||||
{offsetof(struct DBOptions, allow_os_buffer), OptionType::kBoolean,
|
{0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}},
|
||||||
OptionVerificationType::kNormal, false, 0}},
|
|
||||||
{"create_if_missing",
|
{"create_if_missing",
|
||||||
{offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean,
|
{offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean,
|
||||||
OptionVerificationType::kNormal, false, 0}},
|
OptionVerificationType::kNormal, false, 0}},
|
||||||
|
@ -267,7 +267,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
|||||||
"max_background_flushes=35;"
|
"max_background_flushes=35;"
|
||||||
"create_if_missing=false;"
|
"create_if_missing=false;"
|
||||||
"error_if_exists=true;"
|
"error_if_exists=true;"
|
||||||
"allow_os_buffer=false;"
|
|
||||||
"delayed_write_rate=4294976214;"
|
"delayed_write_rate=4294976214;"
|
||||||
"manifest_preallocation_size=1222;"
|
"manifest_preallocation_size=1222;"
|
||||||
"allow_mmap_writes=false;"
|
"allow_mmap_writes=false;"
|
||||||
@ -275,6 +274,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
|||||||
"allow_fallocate=true;"
|
"allow_fallocate=true;"
|
||||||
"allow_mmap_reads=false;"
|
"allow_mmap_reads=false;"
|
||||||
"use_direct_reads=false;"
|
"use_direct_reads=false;"
|
||||||
|
"use_direct_writes=false;"
|
||||||
"max_log_file_size=4607;"
|
"max_log_file_size=4607;"
|
||||||
"random_access_max_buffer_size=1048576;"
|
"random_access_max_buffer_size=1048576;"
|
||||||
"advise_random_on_open=true;"
|
"advise_random_on_open=true;"
|
||||||
|
@ -116,9 +116,10 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
|
|||||||
{"WAL_ttl_seconds", "43"},
|
{"WAL_ttl_seconds", "43"},
|
||||||
{"WAL_size_limit_MB", "44"},
|
{"WAL_size_limit_MB", "44"},
|
||||||
{"manifest_preallocation_size", "45"},
|
{"manifest_preallocation_size", "45"},
|
||||||
{"allow_os_buffer", "false"},
|
|
||||||
{"allow_mmap_reads", "true"},
|
{"allow_mmap_reads", "true"},
|
||||||
{"allow_mmap_writes", "false"},
|
{"allow_mmap_writes", "false"},
|
||||||
|
{"use_direct_reads", "false"},
|
||||||
|
{"use_direct_writes", "false"},
|
||||||
{"is_fd_close_on_exec", "true"},
|
{"is_fd_close_on_exec", "true"},
|
||||||
{"skip_log_error_on_recovery", "false"},
|
{"skip_log_error_on_recovery", "false"},
|
||||||
{"stats_dump_period_sec", "46"},
|
{"stats_dump_period_sec", "46"},
|
||||||
@ -231,9 +232,10 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
|
|||||||
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
|
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
|
||||||
ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
|
ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
|
||||||
ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
|
ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U);
|
||||||
ASSERT_EQ(new_db_opt.allow_os_buffer, false);
|
|
||||||
ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
|
ASSERT_EQ(new_db_opt.allow_mmap_reads, true);
|
||||||
ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
|
ASSERT_EQ(new_db_opt.allow_mmap_writes, false);
|
||||||
|
ASSERT_EQ(new_db_opt.use_direct_reads, false);
|
||||||
|
ASSERT_EQ(new_db_opt.use_direct_writes, false);
|
||||||
ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
|
ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true);
|
||||||
ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false);
|
ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false);
|
||||||
ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
|
ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
|
||||||
|
@ -240,7 +240,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
|||||||
db_opt->advise_random_on_open = rnd->Uniform(2);
|
db_opt->advise_random_on_open = rnd->Uniform(2);
|
||||||
db_opt->allow_mmap_reads = rnd->Uniform(2);
|
db_opt->allow_mmap_reads = rnd->Uniform(2);
|
||||||
db_opt->allow_mmap_writes = rnd->Uniform(2);
|
db_opt->allow_mmap_writes = rnd->Uniform(2);
|
||||||
db_opt->allow_os_buffer = rnd->Uniform(2);
|
db_opt->use_direct_reads = rnd->Uniform(2);
|
||||||
|
db_opt->use_direct_writes = rnd->Uniform(2);
|
||||||
db_opt->create_if_missing = rnd->Uniform(2);
|
db_opt->create_if_missing = rnd->Uniform(2);
|
||||||
db_opt->create_missing_column_families = rnd->Uniform(2);
|
db_opt->create_missing_column_families = rnd->Uniform(2);
|
||||||
db_opt->disableDataSync = rnd->Uniform(2);
|
db_opt->disableDataSync = rnd->Uniform(2);
|
||||||
|
@ -1162,7 +1162,7 @@ Status BackupEngineImpl::CopyOrCreateFile(
|
|||||||
unique_ptr<SequentialFile> src_file;
|
unique_ptr<SequentialFile> src_file;
|
||||||
EnvOptions env_options;
|
EnvOptions env_options;
|
||||||
env_options.use_mmap_writes = false;
|
env_options.use_mmap_writes = false;
|
||||||
env_options.use_os_buffer = false;
|
// TODO:(gzh) maybe use direct writes here if possible
|
||||||
if (size != nullptr) {
|
if (size != nullptr) {
|
||||||
*size = 0;
|
*size = 0;
|
||||||
}
|
}
|
||||||
@ -1365,7 +1365,7 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env,
|
|||||||
|
|
||||||
EnvOptions env_options;
|
EnvOptions env_options;
|
||||||
env_options.use_mmap_writes = false;
|
env_options.use_mmap_writes = false;
|
||||||
env_options.use_os_buffer = false;
|
env_options.use_direct_reads = false;
|
||||||
|
|
||||||
std::unique_ptr<SequentialFile> src_file;
|
std::unique_ptr<SequentialFile> src_file;
|
||||||
Status s = src_env->NewSequentialFile(src, &src_file, env_options);
|
Status s = src_env->NewSequentialFile(src, &src_file, env_options);
|
||||||
@ -1671,6 +1671,7 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
|
|||||||
unique_ptr<WritableFile> backup_meta_file;
|
unique_ptr<WritableFile> backup_meta_file;
|
||||||
EnvOptions env_options;
|
EnvOptions env_options;
|
||||||
env_options.use_mmap_writes = false;
|
env_options.use_mmap_writes = false;
|
||||||
|
env_options.use_direct_writes = false;
|
||||||
s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
|
s = env_->NewWritableFile(meta_filename_ + ".tmp", &backup_meta_file,
|
||||||
env_options);
|
env_options);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
|
@ -18,7 +18,7 @@ class SequentialFileMirror : public SequentialFile {
|
|||||||
public:
|
public:
|
||||||
unique_ptr<SequentialFile> a_, b_;
|
unique_ptr<SequentialFile> a_, b_;
|
||||||
std::string fname;
|
std::string fname;
|
||||||
SequentialFileMirror(std::string f) : fname(f) {}
|
explicit SequentialFileMirror(std::string f) : fname(f) {}
|
||||||
|
|
||||||
Status Read(size_t n, Slice* result, char* scratch) {
|
Status Read(size_t n, Slice* result, char* scratch) {
|
||||||
Slice aslice;
|
Slice aslice;
|
||||||
@ -62,7 +62,7 @@ class RandomAccessFileMirror : public RandomAccessFile {
|
|||||||
public:
|
public:
|
||||||
unique_ptr<RandomAccessFile> a_, b_;
|
unique_ptr<RandomAccessFile> a_, b_;
|
||||||
std::string fname;
|
std::string fname;
|
||||||
RandomAccessFileMirror(std::string f) : fname(f) {}
|
explicit RandomAccessFileMirror(std::string f) : fname(f) {}
|
||||||
|
|
||||||
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
|
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
|
||||||
Status as = a_->Read(offset, n, result, scratch);
|
Status as = a_->Read(offset, n, result, scratch);
|
||||||
@ -101,7 +101,7 @@ class WritableFileMirror : public WritableFile {
|
|||||||
public:
|
public:
|
||||||
unique_ptr<WritableFile> a_, b_;
|
unique_ptr<WritableFile> a_, b_;
|
||||||
std::string fname;
|
std::string fname;
|
||||||
WritableFileMirror(std::string f) : fname(f) {}
|
explicit WritableFileMirror(std::string f) : fname(f) {}
|
||||||
|
|
||||||
Status Append(const Slice& data) override {
|
Status Append(const Slice& data) override {
|
||||||
Status as = a_->Append(data);
|
Status as = a_->Append(data);
|
||||||
|
Loading…
Reference in New Issue
Block a user