Introduce a new method Env->Fsync() that issues fsync (instead of fdatasync).

Summary:
Introduce a new method Env->Fsync() that issues fsync (instead of fdatasync).
This is needed for data durability when running on ext3 filesystems.
Added options to the benchmark db_bench to generate performance numbers
with either fsync or fdatasync enabled.

Cleaned up Makefile to build leveldb_shell only when building the thrift
leveldb server.

Test Plan: build and run benchmark

Reviewers: heyongqiang

Reviewed By: heyongqiang

Differential Revision: https://reviews.facebook.net/D4911
This commit is contained in:
Dhruba Borthakur 2012-08-27 12:10:26 -07:00
parent e675351ad7
commit fc20273e73
9 changed files with 90 additions and 11 deletions

View File

@ -58,7 +58,6 @@ TESTS = \
TOOLS = \ TOOLS = \
manifest_dump \ manifest_dump \
leveldb_shell \
sst_dump sst_dump
PROGRAMS = db_bench $(TESTS) $(TOOLS) PROGRAMS = db_bench $(TESTS) $(TOOLS)
@ -171,7 +170,7 @@ $(MEMENVLIBRARY) : $(MEMENVOBJECTS)
memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS) $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) -o $@ $(LDFLAGS)
leveldb_server: thrift/server.o $(LIBRARY) leveldb_server: thrift/server.o leveldb_shell $(LIBRARY)
$(CXX) thrift/server.o $(LIBRARY) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(CXX) thrift/server.o $(LIBRARY) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
leveldb_server_test: thrift/test/simpletest.o $(LIBRARY) leveldb_server_test: thrift/test/simpletest.o $(LIBRARY)
@ -196,9 +195,6 @@ sst_dump: tools/sst_dump.o $(LIBOBJECTS)
$(VERSIONFILE): build_detect_version $(VERSIONFILE): build_detect_version
$(shell ./build_detect_platform build_config.mk) $(shell ./build_detect_platform build_config.mk)
filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
# recreate the version file with the latest git revision # recreate the version file with the latest git revision
$(VERSIONFILE): build_detect_version $(VERSIONFILE): build_detect_version
$(shell ./build_detect_platform build_config.mk) $(shell ./build_detect_platform build_config.mk)

View File

@ -54,7 +54,11 @@ Status BuildTable(const std::string& dbname,
// Finish and check for file errors // Finish and check for file errors
if (s.ok() && !options.disableDataSync) { if (s.ok() && !options.disableDataSync) {
s = file->Sync(); if (options.use_fsync) {
s = file->Fsync();
} else {
s = file->Sync();
}
} }
if (s.ok()) { if (s.ok()) {
s = file->Close(); s = file->Close();

View File

@ -128,6 +128,9 @@ static bool FLAGS_sync = false;
// If true, do not wait until data is synced to disk. // If true, do not wait until data is synced to disk.
static bool FLAGS_disable_data_sync = false; static bool FLAGS_disable_data_sync = false;
// If true, issue fsync instead of fdatasync
static bool FLAGS_use_fsync = false;
// If true, do not write WAL for write. // If true, do not write WAL for write.
static bool FLAGS_disable_wal = false; static bool FLAGS_disable_wal = false;
@ -774,6 +777,7 @@ class Benchmark {
options.statistics = dbstats; options.statistics = dbstats;
options.env = FLAGS_env; options.env = FLAGS_env;
options.disableDataSync = FLAGS_disable_data_sync; options.disableDataSync = FLAGS_disable_data_sync;
options.use_fsync = FLAGS_use_fsync;
options.target_file_size_base = FLAGS_target_file_size_base; options.target_file_size_base = FLAGS_target_file_size_base;
options.target_file_size_multiplier = FLAGS_target_file_size_multiplier; options.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
@ -1070,6 +1074,9 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--disable_data_sync=%d%c", &n, &junk) == 1 && } else if (sscanf(argv[i], "--disable_data_sync=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) { (n == 0 || n == 1)) {
FLAGS_disable_data_sync = n; FLAGS_disable_data_sync = n;
} else if (sscanf(argv[i], "--use_fsync=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_fsync = n;
} else if (sscanf(argv[i], "--disable_wal=%d%c", &n, &junk) == 1 && } else if (sscanf(argv[i], "--disable_wal=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) { (n == 0 || n == 1)) {
FLAGS_disable_wal = n; FLAGS_disable_wal = n;

View File

@ -858,7 +858,11 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
// Finish and check for file errors // Finish and check for file errors
if (s.ok() && !options_.disableDataSync) { if (s.ok() && !options_.disableDataSync) {
s = compact->outfile->Sync(); if (options_.use_fsync) {
s = compact->outfile->Fsync();
} else {
s = compact->outfile->Sync();
}
} }
if (s.ok()) { if (s.ok()) {
s = compact->outfile->Close(); s = compact->outfile->Close();
@ -1235,7 +1239,11 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if (!options.disableWAL) { if (!options.disableWAL) {
status = log_->AddRecord(WriteBatchInternal::Contents(updates)); status = log_->AddRecord(WriteBatchInternal::Contents(updates));
if (status.ok() && options.sync) { if (status.ok() && options.sync) {
status = logfile_->Sync(); if (options_.use_fsync) {
status = logfile_->Fsync();
} else {
status = logfile_->Sync();
}
} }
} }
if (status.ok()) { if (status.ok()) {

View File

@ -793,7 +793,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
edit->EncodeTo(&record); edit->EncodeTo(&record);
s = descriptor_log_->AddRecord(record); s = descriptor_log_->AddRecord(record);
if (s.ok()) { if (s.ok()) {
s = descriptor_file_->Sync(); if (options_->use_fsync) {
s = descriptor_file_->Fsync();
} else {
s = descriptor_file_->Sync();
}
} }
} }

View File

@ -217,7 +217,17 @@ class WritableFile {
virtual Status Append(const Slice& data) = 0; virtual Status Append(const Slice& data) = 0;
virtual Status Close() = 0; virtual Status Close() = 0;
virtual Status Flush() = 0; virtual Status Flush() = 0;
virtual Status Sync() = 0; virtual Status Sync() = 0; // sync data
/*
* Sync data and/or metadata as well.
* By default, sync only metadata.
* Override this method for environments where we need to sync
* metadata as well.
*/
virtual Status Fsync() {
return Sync();
}
private: private:
// No copying allowed // No copying allowed

View File

@ -198,6 +198,13 @@ struct Options {
// Default: false // Default: false
bool disableDataSync; bool disableDataSync;
// If true, then every store to stable storage will issue a fsync.
// If false, then every store to stable storage will issue a fdatasync.
// This parameter should be set to true while storing data to
// filesystem like ext3 which can lose files after a reboot.
// Default: false
bool use_fsync;
// This number controls how often a new scribe log about // This number controls how often a new scribe log about
// db deploy stats is written out. // db deploy stats is written out.
// -1 indicates no logging at all. // -1 indicates no logging at all.

View File

@ -294,6 +294,22 @@ class PosixMmapFile : public WritableFile {
return s; return s;
} }
/**
* Flush data as well as metadata to stable storage.
*/
virtual Status Fsync() {
if (pending_sync_) {
// Some unmapped data was not synced
pending_sync_ = false;
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
}
// This invocation to Sync will not issue the call to
// fdatasync because pending_sync_ has already been cleared.
return Sync();
}
}; };
static int LockOrUnlock(const std::string& fname, int fd, bool lock) { static int LockOrUnlock(const std::string& fname, int fd, bool lock) {

View File

@ -37,6 +37,7 @@ Options::Options()
filter_policy(NULL), filter_policy(NULL),
statistics(NULL), statistics(NULL),
disableDataSync(false), disableDataSync(false),
use_fsync(false),
db_stats_log_interval(1800) { db_stats_log_interval(1800) {
} }
@ -56,7 +57,33 @@ Options::Dump(
Log(log," Options.block_size: %zd", block_size); Log(log," Options.block_size: %zd", block_size);
Log(log,"Options.block_restart_interval: %d", block_restart_interval); Log(log,"Options.block_restart_interval: %d", block_restart_interval);
Log(log," Options.compression: %d", compression); Log(log," Options.compression: %d", compression);
Log(log," Options.filter_policy: %s", filter_policy == NULL ? "NULL" : filter_policy->Name()); Log(log," Options.filter_policy: %s",
filter_policy == NULL ? "NULL" : filter_policy->Name());
Log(log," Options.num_levels: %d", num_levels);
Log(log," Options.disableDataSync: %d", disableDataSync);
Log(log," Options.use_fsync: %d", use_fsync);
Log(log," Options.db_stats_log_interval: %d",
db_stats_log_interval);
Log(log," Options.level0_file_num_compaction_trigger: %d",
level0_file_num_compaction_trigger);
Log(log," Options.level0_slowdown_writes_trigger: %d",
level0_slowdown_writes_trigger);
Log(log," Options.level0_stop_writes_trigger: %d",
level0_stop_writes_trigger);
Log(log," Options.max_mem_compaction_level: %d",
max_mem_compaction_level);
Log(log," Options.target_file_size_base: %d",
target_file_size_base);
Log(log," Options.target_file_size_multiplier: %d",
target_file_size_multiplier);
Log(log," Options.max_bytes_for_level_base: %d",
max_bytes_for_level_base);
Log(log," Options.max_bytes_for_level_multiplier: %d",
max_bytes_for_level_multiplier);
Log(log," Options.expanded_compaction_factor: %d",
expanded_compaction_factor);
Log(log," Options.max_grandparent_overlap_factor: %d",
max_grandparent_overlap_factor);
} // Options::Dump } // Options::Dump