Use fcntl(F_FULLFSYNC) on OS X (#9356)

Summary:
Closing https://github.com/facebook/rocksdb/issues/5954

fsync/fdatasync on Linux:
```
(fsync/fdatasync) includes writing through or flushing a disk cache if present.
```

However, on OS X and iOS:
```
(fsync) will flush all data from the host to the drive (i.e. the "permanent storage device"),
the drive itself may not physically write the data to the platters for quite some time and it
may be written in an out-of-order sequence.
```

Solution is to use `fcntl(F_FULLFSYNC)` on OS X so that we get the same
persistence guarantee.

According to OSX man page,
```
The F_FULLFSYNC fcntl asks the drive to flush **all** buffered data to permanent storage.
```
This suggests that it will be no faster than `fsync` on Linux, since Linux, according to its man page,
```
writing through or flushing a disk cache if present
```
It means Linux may not flush **all** data from disk cache.

This is similar to bug reports/fixes in:
- golang: https://github.com/golang/go/issues/26650
- leveldb: 296de8d5b8.

Not sure if we should fallback to fsync since we break persistence contract.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9356

Reviewed By: jay-zhuang

Differential Revision: D33417416

Pulled By: riversand963

fbshipit-source-id: 475548ff9c5eaccde325e0f6842694271cbc8cb7
This commit is contained in:
Yanqin Jin 2022-01-18 20:21:37 -08:00 committed by Facebook GitHub Bot
parent 5576ded762
commit 1a8e9f0e07
4 changed files with 65 additions and 2 deletions

View File

@ -616,6 +616,11 @@ if(HAVE_AUXV_GETAUXVAL)
add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT)
endif() endif()
check_cxx_symbol_exists(F_FULLFSYNC "fcntl.h" HAVE_FULLFSYNC)
if(HAVE_FULLFSYNC)
add_definitions(-DHAVE_FULLFSYNC)
endif()
include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(${PROJECT_SOURCE_DIR}/include)
if(WITH_FOLLY_DISTRIBUTED_MUTEX) if(WITH_FOLLY_DISTRIBUTED_MUTEX)

View File

@ -15,6 +15,7 @@
### Bug Fixes ### Bug Fixes
* Fix a bug that FlushMemTable may return ok even flush not succeed. * Fix a bug that FlushMemTable may return ok even flush not succeed.
* Fixed a bug of Sync() and Fsync() not using `fcntl(F_FULLFSYNC)` on OS X and iOS.
## 6.28.0 (2021-12-17) ## 6.28.0 (2021-12-17)
### New Features ### New Features

View File

@ -842,7 +842,19 @@ EOF
fi fi
fi fi
rm -f test.o # check for F_FULLFSYNC
$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
fcntl(0, F_FULLFSYNC);
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC"
fi
rm -f test.o test_dl.o
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"

47
env/io_posix.cc vendored
View File

@ -1108,9 +1108,15 @@ IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fdatasync(fd_) < 0) { if (fdatasync(fd_) < 0) {
return IOError("While fdatasync mmapped file", filename_, errno); return IOError("While fdatasync mmapped file", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return Msync(); return Msync();
} }
@ -1120,9 +1126,15 @@ IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
*/ */
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fsync(fd_) < 0) { if (fsync(fd_) < 0) {
return IOError("While fsync mmaped file", filename_, errno); return IOError("While fsync mmaped file", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return Msync(); return Msync();
} }
@ -1320,17 +1332,29 @@ IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fdatasync(fd_) < 0) { if (fdatasync(fd_) < 0) {
return IOError("While fdatasync", filename_, errno); return IOError("While fdatasync", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return IOStatus::OK(); return IOStatus::OK();
} }
IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("while fcntl(F_FULLFSYNC)", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fsync(fd_) < 0) { if (fsync(fd_) < 0) {
return IOError("While fsync", filename_, errno); return IOError("While fsync", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return IOStatus::OK(); return IOStatus::OK();
} }
@ -1503,17 +1527,29 @@ IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fdatasync(fd_) < 0) { if (fdatasync(fd_) < 0) {
return IOError("While fdatasync random read/write file", filename_, errno); return IOError("While fdatasync random read/write file", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return IOStatus::OK(); return IOStatus::OK();
} }
IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
IODebugContext* /*dbg*/) { IODebugContext* /*dbg*/) {
#ifdef HAVE_FULLFSYNC
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno);
}
#else // HAVE_FULLFSYNC
if (fsync(fd_) < 0) { if (fsync(fd_) < 0) {
return IOError("While fsync random read/write file", filename_, errno); return IOError("While fsync random read/write file", filename_, errno);
} }
#endif // HAVE_FULLFSYNC
return IOStatus::OK(); return IOStatus::OK();
} }
@ -1585,10 +1621,19 @@ IOStatus PosixDirectory::FsyncWithDirOptions(
} }
// fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted
} }
#ifdef HAVE_FULLFSYNC
// btrfs is a Linux file system, while currently F_FULLFSYNC is available on
// Mac OS.
assert(!is_btrfs_);
if (::fcntl(fd_, F_FULLFSYNC) < 0) {
return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno);
}
#else // HAVE_FULLFSYNC
if (fsync(fd_) == -1) { if (fsync(fd_) == -1) {
s = IOError("While fsync", "a directory", errno); s = IOError("While fsync", "a directory", errno);
} }
#endif #endif // HAVE_FULLFSYNC
#endif // OS_AIX
return s; return s;
} }
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE