From fc20273e73b2496a107502c69fe55a33e070075c Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Mon, 27 Aug 2012 12:10:26 -0700 Subject: [PATCH] Introduce a new method Env->Fsync() that issues fsync (instead of fdatasync). Summary: Introduce a new method Env->Fsync() that issues fsync (instead of fdatasync). This is needed for data durability when running on ext3 filesystems. Added options to the benchmark db_bench to generate performance numbers with either fsync or fdatasync enabled. Cleaned up Makefile to build leveldb_shell only when building the thrift leveldb server. Test Plan: build and run benchmark Reviewers: heyongqiang Reviewed By: heyongqiang Differential Revision: https://reviews.facebook.net/D4911 --- Makefile | 6 +----- db/builder.cc | 6 +++++- db/db_bench.cc | 7 +++++++ db/db_impl.cc | 12 ++++++++++-- db/version_set.cc | 6 +++++- include/leveldb/env.h | 12 +++++++++++- include/leveldb/options.h | 7 +++++++ util/env_posix.cc | 16 ++++++++++++++++ util/options.cc | 29 ++++++++++++++++++++++++++++- 9 files changed, 90 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 18b607b58..9de1f1b54 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,6 @@ TESTS = \ TOOLS = \ manifest_dump \ - leveldb_shell \ sst_dump PROGRAMS = db_bench $(TESTS) $(TOOLS) @@ -171,7 +170,7 @@ $(MEMENVLIBRARY) : $(MEMENVOBJECTS) memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) -leveldb_server: thrift/server.o $(LIBRARY) +leveldb_server: thrift/server.o leveldb_shell $(LIBRARY) $(CXX) thrift/server.o $(LIBRARY) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) leveldb_server_test: thrift/test/simpletest.o $(LIBRARY) @@ -196,9 +195,6 @@ sst_dump: tools/sst_dump.o $(LIBOBJECTS) $(VERSIONFILE): build_detect_version $(shell ./build_detect_platform build_config.mk) -filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS) - # recreate the version file with the latest git revision $(VERSIONFILE): build_detect_version $(shell ./build_detect_platform build_config.mk) diff --git a/db/builder.cc b/db/builder.cc index 7b03d3c14..4a07aeeb2 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -54,7 +54,11 @@ Status BuildTable(const std::string& dbname, // Finish and check for file errors if (s.ok() && !options.disableDataSync) { - s = file->Sync(); + if (options.use_fsync) { + s = file->Fsync(); + } else { + s = file->Sync(); + } } if (s.ok()) { s = file->Close(); diff --git a/db/db_bench.cc b/db/db_bench.cc index 2ee030c90..7e768a444 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -128,6 +128,9 @@ static bool FLAGS_sync = false; // If true, do not wait until data is synced to disk. static bool FLAGS_disable_data_sync = false; +// If true, issue fsync instead of fdatasync +static bool FLAGS_use_fsync = false; + // If true, do not write WAL for write. static bool FLAGS_disable_wal = false; @@ -774,6 +777,7 @@ class Benchmark { options.statistics = dbstats; options.env = FLAGS_env; options.disableDataSync = FLAGS_disable_data_sync; + options.use_fsync = FLAGS_use_fsync; options.target_file_size_base = FLAGS_target_file_size_base; options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; @@ -1070,6 +1074,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--disable_data_sync=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_disable_data_sync = n; + } else if (sscanf(argv[i], "--use_fsync=%d%c", &n, &junk) == 1 && + (n == 0 || n == 1)) { + FLAGS_use_fsync = n; } else if (sscanf(argv[i], "--disable_wal=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_disable_wal = n; diff --git a/db/db_impl.cc b/db/db_impl.cc index f3b9c5279..17c52454e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -858,7 +858,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, // Finish and check for file errors if (s.ok() && !options_.disableDataSync) { - s = compact->outfile->Sync(); + if (options_.use_fsync) { + s = compact->outfile->Fsync(); + } else { + s = compact->outfile->Sync(); + } } if (s.ok()) { s = compact->outfile->Close(); @@ -1235,7 +1239,11 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!options.disableWAL) { status = log_->AddRecord(WriteBatchInternal::Contents(updates)); if (status.ok() && options.sync) { - status = logfile_->Sync(); + if (options_.use_fsync) { + status = logfile_->Fsync(); + } else { + status = logfile_->Sync(); + } } } if (status.ok()) { diff --git a/db/version_set.cc b/db/version_set.cc index 7e16c1fd6..56a78e8f5 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -793,7 +793,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { edit->EncodeTo(&record); s = descriptor_log_->AddRecord(record); if (s.ok()) { - s = descriptor_file_->Sync(); + if (options_->use_fsync) { + s = descriptor_file_->Fsync(); + } else { + s = descriptor_file_->Sync(); + } } } diff --git a/include/leveldb/env.h b/include/leveldb/env.h index 378212ab0..2d045cc1d 100644 --- a/include/leveldb/env.h +++ b/include/leveldb/env.h @@ -217,7 +217,17 @@ class WritableFile { virtual Status Append(const Slice& data) = 0; virtual Status Close() = 0; virtual Status Flush() = 0; - virtual Status Sync() = 0; + virtual Status Sync() = 0; // sync data + + /* + * Sync data and/or metadata as well. + * By default, sync only metadata. + * Override this method for environments where we need to sync + * metadata as well. + */ + virtual Status Fsync() { + return Sync(); + } private: // No copying allowed diff --git a/include/leveldb/options.h b/include/leveldb/options.h index a271905fc..6fa9f7068 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -198,6 +198,13 @@ struct Options { // Default: false bool disableDataSync; + // If true, then every store to stable storage will issue a fsync. + // If false, then every store to stable storage will issue a fdatasync. + // This parameter should be set to true while storing data to + // filesystem like ext3 which can lose files after a reboot. + // Default: false + bool use_fsync; + // This number controls how often a new scribe log about // db deploy stats is written out. // -1 indicates no logging at all. diff --git a/util/env_posix.cc b/util/env_posix.cc index b343e4c70..ee294e472 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -294,6 +294,22 @@ class PosixMmapFile : public WritableFile { return s; } + + /** + * Flush data as well as metadata to stable storage. + */ + virtual Status Fsync() { + if (pending_sync_) { + // Some unmapped data was not synced + pending_sync_ = false; + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + } + // This invocation to Sync will not issue the call to + // fdatasync because pending_sync_ has already been cleared. + return Sync(); + } }; static int LockOrUnlock(const std::string& fname, int fd, bool lock) { diff --git a/util/options.cc b/util/options.cc index c9306387d..d05c5e0e3 100644 --- a/util/options.cc +++ b/util/options.cc @@ -37,6 +37,7 @@ Options::Options() filter_policy(NULL), statistics(NULL), disableDataSync(false), + use_fsync(false), db_stats_log_interval(1800) { } @@ -56,7 +57,33 @@ Options::Dump( Log(log," Options.block_size: %zd", block_size); Log(log,"Options.block_restart_interval: %d", block_restart_interval); Log(log," Options.compression: %d", compression); - Log(log," Options.filter_policy: %s", filter_policy == NULL ? "NULL" : filter_policy->Name()); + Log(log," Options.filter_policy: %s", + filter_policy == NULL ? "NULL" : filter_policy->Name()); + Log(log," Options.num_levels: %d", num_levels); + Log(log," Options.disableDataSync: %d", disableDataSync); + Log(log," Options.use_fsync: %d", use_fsync); + Log(log," Options.db_stats_log_interval: %d", + db_stats_log_interval); + Log(log," Options.level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log," Options.level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log," Options.level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log," Options.max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log," Options.target_file_size_base: %d", + target_file_size_base); + Log(log," Options.target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log," Options.max_bytes_for_level_base: %d", + max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + Log(log," Options.expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log," Options.max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); } // Options::Dump