Break large file writes into 1GB chunks (#5213)
Summary: This is a workaround for the issue described in #5169. It has been tested on a database with very large values, but not dedicated test has been added to the code base. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5213 Differential Revision: D15243116 Pulled By: siying fbshipit-source-id: e0c226a6cd71a60924dcd7ce7af74abcb4054484
This commit is contained in:
parent
f0e8216197
commit
468ca61105
135
env/io_posix.cc
vendored
135
env/io_posix.cc
vendored
@ -37,7 +37,7 @@
|
||||
|
||||
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
|
||||
#define F_LINUX_SPECIFIC_BASE 1024
|
||||
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
|
||||
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
|
||||
#endif
|
||||
|
||||
namespace rocksdb {
|
||||
@ -58,6 +58,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
|
||||
|
||||
namespace {
|
||||
|
||||
// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
|
||||
// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
|
||||
// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
|
||||
// the writes aligned.
|
||||
|
||||
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
|
||||
const size_t kLimit1Gb = 1UL << 30;
|
||||
|
||||
const char* src = buf;
|
||||
size_t left = nbyte;
|
||||
|
||||
while (left != 0) {
|
||||
size_t bytes_to_write = std::min(left, kLimit1Gb);
|
||||
|
||||
ssize_t done = write(fd, src, bytes_to_write);
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
left -= done;
|
||||
src += done;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
|
||||
const size_t kLimit1Gb = 1UL << 30;
|
||||
|
||||
const char* src = buf;
|
||||
size_t left = nbyte;
|
||||
|
||||
while (left != 0) {
|
||||
size_t bytes_to_write = std::min(left, kLimit1Gb);
|
||||
|
||||
ssize_t done = pwrite(fd, src, bytes_to_write, offset);
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
left -= done;
|
||||
offset += done;
|
||||
src += done;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
|
||||
#ifdef OS_LINUX
|
||||
struct stat buf;
|
||||
@ -180,7 +231,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) {
|
||||
return uintptr_t(ptr) % sector_size == 0;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -752,9 +803,9 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
|
||||
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status = fallocate(
|
||||
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
static_cast<off_t>(offset), static_cast<off_t>(len));
|
||||
alloc_status =
|
||||
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
static_cast<off_t>(offset), static_cast<off_t>(len));
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
@ -801,19 +852,13 @@ Status PosixWritableFile::Append(const Slice& data) {
|
||||
assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
|
||||
}
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left != 0) {
|
||||
ssize_t done = write(fd_, src, left);
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return IOError("While appending to file", filename_, errno);
|
||||
}
|
||||
left -= done;
|
||||
src += done;
|
||||
size_t nbytes = data.size();
|
||||
|
||||
if (!PosixWrite(fd_, src, nbytes)) {
|
||||
return IOError("While appending to file", filename_, errno);
|
||||
}
|
||||
filesize_ += data.size();
|
||||
|
||||
filesize_ += nbytes;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -825,21 +870,12 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
|
||||
}
|
||||
assert(offset <= std::numeric_limits<off_t>::max());
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left != 0) {
|
||||
ssize_t done = pwrite(fd_, src, left, static_cast<off_t>(offset));
|
||||
if (done < 0) {
|
||||
if (errno == EINTR) {
|
||||
continue;
|
||||
}
|
||||
return IOError("While pwrite to file at offset " + ToString(offset),
|
||||
filename_, errno);
|
||||
}
|
||||
left -= done;
|
||||
offset += done;
|
||||
src += done;
|
||||
size_t nbytes = data.size();
|
||||
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
|
||||
return IOError("While pwrite to file at offset " + ToString(offset),
|
||||
filename_, errno);
|
||||
}
|
||||
filesize_ = offset;
|
||||
filesize_ = offset + nbytes;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -891,8 +927,8 @@ Status PosixWritableFile::Close() {
|
||||
// If not, we should hack it with FALLOC_FL_PUNCH_HOLE
|
||||
if (result == 0 &&
|
||||
(file_stats.st_size + file_stats.st_blksize - 1) /
|
||||
file_stats.st_blksize !=
|
||||
file_stats.st_blocks / (file_stats.st_blksize / 512)) {
|
||||
file_stats.st_blksize !=
|
||||
file_stats.st_blocks / (file_stats.st_blksize / 512)) {
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
if (allow_fallocate_) {
|
||||
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
|
||||
@ -942,10 +978,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
|
||||
}
|
||||
#else
|
||||
(void)hint;
|
||||
#endif // ROCKSDB_VALGRIND_RUN
|
||||
#endif // ROCKSDB_VALGRIND_RUN
|
||||
#else
|
||||
(void)hint;
|
||||
#endif // OS_LINUX
|
||||
#endif // OS_LINUX
|
||||
}
|
||||
|
||||
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
|
||||
@ -974,9 +1010,9 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
|
||||
IOSTATS_TIMER_GUARD(allocate_nanos);
|
||||
int alloc_status = 0;
|
||||
if (allow_fallocate_) {
|
||||
alloc_status = fallocate(
|
||||
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
static_cast<off_t>(offset), static_cast<off_t>(len));
|
||||
alloc_status =
|
||||
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
|
||||
static_cast<off_t>(offset), static_cast<off_t>(len));
|
||||
}
|
||||
if (alloc_status == 0) {
|
||||
return Status::OK();
|
||||
@ -1037,24 +1073,11 @@ PosixRandomRWFile::~PosixRandomRWFile() {
|
||||
|
||||
Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) {
|
||||
const char* src = data.data();
|
||||
size_t left = data.size();
|
||||
while (left != 0) {
|
||||
ssize_t done = pwrite(fd_, src, left, offset);
|
||||
if (done < 0) {
|
||||
// error while writing to file
|
||||
if (errno == EINTR) {
|
||||
// write was interrupted, try again.
|
||||
continue;
|
||||
}
|
||||
return IOError(
|
||||
"While write random read/write file at offset " + ToString(offset),
|
||||
filename_, errno);
|
||||
}
|
||||
|
||||
// Wrote `done` bytes
|
||||
left -= done;
|
||||
offset += done;
|
||||
src += done;
|
||||
size_t nbytes = data.size();
|
||||
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
|
||||
return IOError(
|
||||
"While write random read/write file at offset " + ToString(offset),
|
||||
filename_, errno);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
Loading…
Reference in New Issue
Block a user