From 1a8e9f0e0716c6fe1ec4fb7d8b74d0114693debc Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 18 Jan 2022 20:21:37 -0800 Subject: [PATCH] Use fcntl(F_FULLFSYNC) on OS X (#9356) Summary: Closing https://github.com/facebook/rocksdb/issues/5954 fsync/fdatasync on Linux: ``` (fsync/fdatasync) includes writing through or flushing a disk cache if present. ``` However, on OS X and iOS: ``` (fsync) will flush all data from the host to the drive (i.e. the "permanent storage device"), the drive itself may not physically write the data to the platters for quite some time and it may be written in an out-of-order sequence. ``` Solution is to use `fcntl(F_FULLFSYNC)` on OS X so that we get the same persistence guarantee. According to OSX man page, ``` The F_FULLFSYNC fcntl asks the drive to flush **all** buffered data to permanent storage. ``` This suggests that it will be no faster than `fsync` on Linux, since Linux, according to its man page, ``` writing through or flushing a disk cache if present ``` It means Linux may not flush **all** data from disk cache. This is similar to bug reports/fixes in: - golang: https://github.com/golang/go/issues/26650 - leveldb: https://github.com/google/leveldb/commit/296de8d5b8e4e57bd1e46c981114dfbe58a8c4fa. Not sure if we should fallback to fsync since we break persistence contract. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9356 Reviewed By: jay-zhuang Differential Revision: D33417416 Pulled By: riversand963 fbshipit-source-id: 475548ff9c5eaccde325e0f6842694271cbc8cb7 --- CMakeLists.txt | 5 ++++ HISTORY.md | 1 + build_tools/build_detect_platform | 14 ++++++++- env/io_posix.cc | 47 ++++++++++++++++++++++++++++++- 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b53058e0..3a872889c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -616,6 +616,11 @@ if(HAVE_AUXV_GETAUXVAL) add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) endif() +check_cxx_symbol_exists(F_FULLFSYNC "fcntl.h" HAVE_FULLFSYNC) +if(HAVE_FULLFSYNC) + add_definitions(-DHAVE_FULLFSYNC) +endif() + include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) if(WITH_FOLLY_DISTRIBUTED_MUTEX) diff --git a/HISTORY.md b/HISTORY.md index 7ae9bbbaf..cfc40d0bc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -15,6 +15,7 @@ ### Bug Fixes * Fix a bug that FlushMemTable may return ok even flush not succeed. +* Fixed a bug of Sync() and Fsync() not using `fcntl(F_FULLFSYNC)` on OS X and iOS. ## 6.28.0 (2021-12-17) ### New Features diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 17ea8bbdd..60639eb59 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -842,7 +842,19 @@ EOF fi fi -rm -f test.o +# check for F_FULLFSYNC +$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null < + int main() { + fcntl(0, F_FULLFSYNC); + return 0; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC" +fi + +rm -f test.o test_dl.o PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" diff --git a/env/io_posix.cc b/env/io_posix.cc index dbc6bec7e..3e519dce4 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -1108,9 +1108,15 @@ IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/, IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLSYNC) mmapped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync mmapped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -1120,9 +1126,15 @@ IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, */ IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) on mmaped file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync mmaped file", filename_, errno); } +#endif // HAVE_FULLFSYNC return Msync(); } @@ -1320,17 +1332,29 @@ IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/, IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1503,17 +1527,29 @@ IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/, IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fdatasync(fd_) < 0) { return IOError("While fdatasync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) { +#ifdef HAVE_FULLFSYNC + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("While fcntl(F_FULLSYNC) random rw file", filename_, errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) < 0) { return IOError("While fsync random read/write file", filename_, errno); } +#endif // HAVE_FULLFSYNC return IOStatus::OK(); } @@ -1585,10 +1621,19 @@ IOStatus PosixDirectory::FsyncWithDirOptions( } // fallback to dir-fsync for kDefault, kDirRenamed and kFileDeleted } +#ifdef HAVE_FULLFSYNC + // btrfs is a Linux file system, while currently F_FULLFSYNC is available on + // Mac OS. + assert(!is_btrfs_); + if (::fcntl(fd_, F_FULLFSYNC) < 0) { + return IOError("while fcntl(F_FULLFSYNC)", "a directory", errno); + } +#else // HAVE_FULLFSYNC if (fsync(fd_) == -1) { s = IOError("While fsync", "a directory", errno); } -#endif +#endif // HAVE_FULLFSYNC +#endif // OS_AIX return s; } } // namespace ROCKSDB_NAMESPACE