Break large file writes into 1GB chunks (#5213)

Summary:
This is a workaround for the issue described in #5169.
It has been tested on a database with very large values, but no dedicated test has been added to the code base.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5213

Differential Revision: D15243116

Pulled By: siying

fbshipit-source-id: e0c226a6cd71a60924dcd7ce7af74abcb4054484
This commit is contained in:
Raphael Bost 2019-05-15 14:16:36 -07:00 committed by Facebook Github Bot
parent f0e8216197
commit 468ca61105

135
env/io_posix.cc vendored
View File

@ -37,7 +37,7 @@
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif
namespace rocksdb {
@ -58,6 +58,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
namespace {
// On MacOS (and probably *BSD), the posix write and pwrite calls do not support
// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
// the writes aligned.
// Writes exactly `nbyte` bytes from `buf` to `fd`, issuing write() calls of at
// most 1GB each. Short writes are resumed from where they stopped and calls
// that fail with EINTR are retried. Returns true once everything has been
// written; returns false on any other write() failure, leaving errno as set by
// the failing call.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kMaxChunk = 1UL << 30;

  const char* cursor = buf;
  for (size_t remaining = nbyte; remaining != 0;) {
    const size_t chunk = std::min(remaining, kMaxChunk);
    const ssize_t written = write(fd, cursor, chunk);
    if (written >= 0) {
      // Advance past whatever the kernel accepted (possibly a short write).
      remaining -= written;
      cursor += written;
    } else if (errno != EINTR) {
      // Hard failure; the caller inspects errno.
      return false;
    }
    // written < 0 with EINTR: loop again with the same chunk.
  }
  return true;
}
// Writes exactly `nbyte` bytes from `buf` to `fd` starting at file position
// `offset`, issuing pwrite() calls of at most 1GB each. After every successful
// call the source pointer and the target offset move forward by the number of
// bytes written, so short writes are resumed. Calls that fail with EINTR are
// retried. Returns true once everything has been written; returns false on any
// other pwrite() failure, leaving errno as set by the failing call.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kMaxChunk = 1UL << 30;

  size_t remaining = nbyte;
  const char* cursor = buf;
  while (remaining > 0) {
    const size_t chunk = std::min(remaining, kMaxChunk);
    const ssize_t written = pwrite(fd, cursor, chunk, offset);
    if (written < 0) {
      if (errno != EINTR) {
        // Hard failure; the caller inspects errno.
        return false;
      }
      // Interrupted before writing anything: retry the same chunk.
      continue;
    }
    cursor += written;
    offset += written;
    remaining -= written;
  }
  return true;
}
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
struct stat buf;
@ -180,7 +231,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) {
return uintptr_t(ptr) % sector_size == 0;
}
}
} // namespace
#endif
/*
@ -752,9 +803,9 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
static_cast<off_t>(offset), static_cast<off_t>(len));
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
static_cast<off_t>(offset), static_cast<off_t>(len));
}
if (alloc_status == 0) {
return Status::OK();
@ -801,19 +852,13 @@ Status PosixWritableFile::Append(const Slice& data) {
assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
}
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
ssize_t done = write(fd_, src, left);
if (done < 0) {
if (errno == EINTR) {
continue;
}
return IOError("While appending to file", filename_, errno);
}
left -= done;
src += done;
size_t nbytes = data.size();
if (!PosixWrite(fd_, src, nbytes)) {
return IOError("While appending to file", filename_, errno);
}
filesize_ += data.size();
filesize_ += nbytes;
return Status::OK();
}
@ -825,21 +870,12 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
}
assert(offset <= std::numeric_limits<off_t>::max());
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
ssize_t done = pwrite(fd_, src, left, static_cast<off_t>(offset));
if (done < 0) {
if (errno == EINTR) {
continue;
}
return IOError("While pwrite to file at offset " + ToString(offset),
filename_, errno);
}
left -= done;
offset += done;
src += done;
size_t nbytes = data.size();
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
return IOError("While pwrite to file at offset " + ToString(offset),
filename_, errno);
}
filesize_ = offset;
filesize_ = offset + nbytes;
return Status::OK();
}
@ -891,8 +927,8 @@ Status PosixWritableFile::Close() {
// If not, we should hack it with FALLOC_FL_PUNCH_HOLE
if (result == 0 &&
(file_stats.st_size + file_stats.st_blksize - 1) /
file_stats.st_blksize !=
file_stats.st_blocks / (file_stats.st_blksize / 512)) {
file_stats.st_blksize !=
file_stats.st_blocks / (file_stats.st_blksize / 512)) {
IOSTATS_TIMER_GUARD(allocate_nanos);
if (allow_fallocate_) {
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
@ -942,10 +978,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
}
#else
(void)hint;
#endif // ROCKSDB_VALGRIND_RUN
#endif // ROCKSDB_VALGRIND_RUN
#else
(void)hint;
#endif // OS_LINUX
#endif // OS_LINUX
}
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
@ -974,9 +1010,9 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
static_cast<off_t>(offset), static_cast<off_t>(len));
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
static_cast<off_t>(offset), static_cast<off_t>(len));
}
if (alloc_status == 0) {
return Status::OK();
@ -1037,24 +1073,11 @@ PosixRandomRWFile::~PosixRandomRWFile() {
Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) {
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
ssize_t done = pwrite(fd_, src, left, offset);
if (done < 0) {
// error while writing to file
if (errno == EINTR) {
// write was interrupted, try again.
continue;
}
return IOError(
"While write random read/write file at offset " + ToString(offset),
filename_, errno);
}
// Wrote `done` bytes
left -= done;
offset += done;
src += done;
size_t nbytes = data.size();
if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
return IOError(
"While write random read/write file at offset " + ToString(offset),
filename_, errno);
}
return Status::OK();