use atomic O_CLOEXEC when available (#4328)

Summary:
In our application we spawn helper child processes concurrently with
opening rocksdb.  In one situation I observed that the child process had inherited
the rocksdb lock file as well as directory handles to the rocksdb storage location.

The code in env_posix takes care to set CLOEXEC but doesn't use `O_CLOEXEC` at the
time that the files are opened which means that there is a window of opportunity
to leak the descriptors across a fork/exec boundary.

This diff introduces a helper that can conditionally set the `O_CLOEXEC` bit for
the open call using the same logic as that in the existing helper for setting
that flag post-open.

I've preserved the post-open logic for systems that don't have `O_CLOEXEC`.

I've introduced setting `O_CLOEXEC` for what appears to be a number of temporary
or transient files and directory handles; I suspect that none of the files
opened by Rocks are intended to be inherited by a forked child process.

In one case, `fopen` is used to open a file.  I've added the use of the glibc-specific `e`
mode to turn on `O_CLOEXEC` for this case.  While this doesn't cover all posix systems,
it is an improvement for our common deployment system.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4328

Reviewed By: ajkr

Differential Revision: D9553046

Pulled By: wez

fbshipit-source-id: acdb89f7a85ca649b22fe3c3bd76f82142bec2bf
This commit is contained in:
Wez Furlong 2018-08-29 20:24:17 -07:00 committed by Facebook Github Bot
parent 927f274939
commit d00e5de7fc

45
env/env_posix.cc vendored
View File

@ -102,6 +102,18 @@ class PosixFileLock : public FileLock {
std::string filename;
};
int cloexec_flags(int flags, const EnvOptions* options) {
// If the system supports opening the file with cloexec enabled,
// do so, as this avoids a race condition if a db is opened around
// the same time that a child process is forked
#ifdef O_CLOEXEC
if (options == nullptr || options->set_fd_cloexec) {
flags |= O_CLOEXEC;
}
#endif
return flags;
}
class PosixEnv : public Env {
public:
PosixEnv();
@ -133,7 +145,7 @@ class PosixEnv : public Env {
const EnvOptions& options) override {
result->reset();
int fd = -1;
int flags = O_RDONLY;
int flags = cloexec_flags(O_RDONLY, &options);
FILE* file = nullptr;
if (options.use_direct_reads && !options.use_mmap_reads) {
@ -184,7 +196,8 @@ class PosixEnv : public Env {
result->reset();
Status s;
int fd;
int flags = O_RDONLY;
int flags = cloexec_flags(O_RDONLY, &options);
if (options.use_direct_reads && !options.use_mmap_reads) {
#ifdef ROCKSDB_LITE
return Status::IOError(fname, "Direct I/O not supported in RocksDB lite");
@ -266,6 +279,8 @@ class PosixEnv : public Env {
flags |= O_WRONLY;
}
flags = cloexec_flags(flags, &options);
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
@ -354,6 +369,8 @@ class PosixEnv : public Env {
flags |= O_WRONLY;
}
flags = cloexec_flags(flags, &options);
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(old_fname.c_str(), flags,
@ -415,9 +432,12 @@ class PosixEnv : public Env {
unique_ptr<RandomRWFile>* result,
const EnvOptions& options) override {
int fd = -1;
int flags = cloexec_flags(O_RDWR, &options);
while (fd < 0) {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDWR, GetDBFileMode(allow_non_owner_access_));
fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
if (fd < 0) {
// Error while opening the file
if (errno == EINTR) {
@ -437,9 +457,11 @@ class PosixEnv : public Env {
unique_ptr<MemoryMappedFileBuffer>* result) override {
int fd = -1;
Status status;
int flags = cloexec_flags(O_RDWR, nullptr);
while (fd < 0) {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDWR, 0644);
fd = open(fname.c_str(), flags, 0644);
if (fd < 0) {
// Error while opening the file
if (errno == EINTR) {
@ -477,9 +499,10 @@ class PosixEnv : public Env {
unique_ptr<Directory>* result) override {
result->reset();
int fd;
int flags = cloexec_flags(0, nullptr);
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(name.c_str(), 0);
fd = open(name.c_str(), flags);
}
if (fd < 0) {
return IOError("While open directory", name, errno);
@ -663,9 +686,11 @@ class PosixEnv : public Env {
}
int fd;
int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
{
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
fd = open(fname.c_str(), flags, 0644);
}
if (fd < 0) {
result = IOError("while open a file for lock", fname, errno);
@ -756,7 +781,13 @@ class PosixEnv : public Env {
FILE* f;
{
IOSTATS_TIMER_GUARD(open_nanos);
f = fopen(fname.c_str(), "w");
f = fopen(fname.c_str(), "w"
#ifdef __GLIBC_PREREQ
#if __GLIBC_PREREQ(2, 7)
"e" // glibc extension to enable O_CLOEXEC
#endif
#endif
);
}
if (f == nullptr) {
result->reset();