From bd998a5213b8e05c4e130acd2ec5c01f67abcc6e Mon Sep 17 00:00:00 2001 From: Bart Trojanowski Date: Tue, 3 Dec 2013 16:27:12 -0500 Subject: [PATCH 01/22] fix missing gflags library On Debian/testing and RHEL6 builds would fail due to undefined references to google::FlagRegisterer::FlagRegisterer. It would seem that -lgflags was missing from the build script. --- build_tools/build_detect_platform | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index dfe89963a..5d2434539 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -184,6 +184,7 @@ EOF EOF if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" fi # Test whether zlib library is installed From 43c386b72ee834c88a1a22500ce1fc36a8208277 Mon Sep 17 00:00:00 2001 From: James Golick Date: Tue, 10 Dec 2013 22:34:19 -0800 Subject: [PATCH 02/22] only try to use fallocate if it's actually present on the system --- build_tools/build_detect_platform | 12 ++++++++++++ util/env_posix.cc | 8 ++++---- util/posix_logger.h | 2 +- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 59e2e4619..96a1fb331 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -189,6 +189,18 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" fi + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF + #include <fcntl.h> + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, 0, 0, 1024); + } +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi + # Test whether Snappy library is installed # http://code.google.com/p/snappy/ $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < Date: Wed, 11 Dec 2013 11:18:00 -0800 Subject: [PATCH 03/22] oops - missed a spot --- util/env_posix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/env_posix.cc b/util/env_posix.cc index e81c59dcc..2be524e95 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1297,7 +1297,7 @@ class PosixEnv : public Env { } bool SupportsFastAllocate(const std::string& path) { -#ifdef OS_LINUX +#ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; if (statfs(path.c_str(), &s)){ return false; From ca92068b12c7d2c4ba9cfc6022dc7dfaf6ba0708 Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Wed, 18 Dec 2013 16:50:48 -0800 Subject: [PATCH 04/22] Add 'readtocache' test Summary: For some tests I want to cache the database prior to running other tests on the same invocation of db_bench. The readtocache test ignores --threads and --reads so those can be used by other tests and it will still do a full read of --num rows with one thread. 
It might be invoked like: db_bench --benchmarks=readtocache,readrandom --reads 100 --num 10000 --threads 8 Task ID: # Blame Rev: Test Plan: run db_bench Revert Plan: Database Impact: Memcache Impact: Other Notes: EImportant: - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14739 --- db/db_bench.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/db_bench.cc b/db/db_bench.cc index 158a5faa2..eb5d7cb42 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -48,6 +48,7 @@ DEFINE_string(benchmarks, "compact," "readrandom," "readseq," + "readtocache," "readreverse," "readwhilewriting," "readrandomwriterandom," @@ -75,6 +76,7 @@ DEFINE_string(benchmarks, "\tdeleteseq -- delete N keys in sequential order\n" "\tdeleterandom -- delete N keys in random order\n" "\treadseq -- read N times sequentially\n" + "\treadtocache -- 1 thread reading database sequentially\n" "\treadreverse -- read N times in reverse order\n" "\treadrandom -- read N times in random order\n" "\treadmissing -- read N missing keys in random order\n" @@ -1057,6 +1059,10 @@ class Benchmark { method = &Benchmark::WriteRandom; } else if (name == Slice("readseq")) { method = &Benchmark::ReadSequential; + } else if (name == Slice("readtocache")) { + method = &Benchmark::ReadSequential; + num_threads = 1; + reads_ = num_; } else if (name == Slice("readreverse")) { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { From 1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 20 Dec 2013 09:57:58 -0800 Subject: [PATCH 05/22] [RocksDB] Optimize locking for Get Summary: Instead of locking and saving a DB state, we can cache a DB state and update it only when it changes. This change reduces lock contention and speeds up read operations on the DB. Performance improvements are substantial, although there is some cost in no-read workloads. 
I ran the regression tests on my devserver and here are the numbers: overwrite 56345 -> 63001 fillseq 193730 -> 185296 readrandom 771301 -> 1219803 (58% improvement!) readrandom_smallblockcache 677609 -> 862850 readrandom_memtable_sst 710440 -> 1109223 readrandom_fillunique_random 221589 -> 247869 memtablefillrandom 105286 -> 92643 memtablereadrandom 763033 -> 1288862 Test Plan: make asan_check I am also running db_stress Reviewers: dhruba, haobo, sdong, kailiu Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14679 --- db/db_impl.cc | 208 +++++++++++++++++++++++++++++++++++++---------- db/db_impl.h | 74 +++++++++++++++-- db/version_set.h | 8 +- 3 files changed, 237 insertions(+), 53 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 6c57a986d..ece08db8b 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -241,6 +241,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) mem_(new MemTable(internal_comparator_, mem_rep_factory_, NumberLevels(), options_)), logfile_number_(0), + super_version_(nullptr), tmp_batch_(), bg_compaction_scheduled_(0), bg_flush_scheduled_(0), @@ -316,6 +317,13 @@ DBImpl::~DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } mutex_.Unlock(); if (db_lock_ != nullptr) { @@ -345,6 +353,13 @@ void DBImpl::TEST_Destroy_DBImpl() { bg_logstats_scheduled_) { bg_cv_.Wait(); } + if (super_version_ != nullptr) { + bool is_last_reference __attribute__((unused)); + is_last_reference = super_version_->Unref(); + assert(is_last_reference); + super_version_->Cleanup(); + delete super_version_; + } // Prevent new compactions from occuring. 
bg_work_gate_closed_ = true; @@ -443,6 +458,49 @@ void DBImpl::MaybeDumpStats() { } } +// DBImpl::SuperVersion methods +DBImpl::SuperVersion::SuperVersion(const int num_memtables) { + to_delete.resize(num_memtables); +} + +DBImpl::SuperVersion::~SuperVersion() { + for (auto td : to_delete) { + delete td; + } +} + +DBImpl::SuperVersion* DBImpl::SuperVersion::Ref() { + refs.fetch_add(1, std::memory_order_relaxed); + return this; +} + +bool DBImpl::SuperVersion::Unref() { + assert(refs > 0); + // fetch_sub returns the previous value of ref + return refs.fetch_sub(1, std::memory_order_relaxed) == 1; +} + +void DBImpl::SuperVersion::Cleanup() { + assert(refs.load(std::memory_order_relaxed) == 0); + imm.UnrefAll(&to_delete); + MemTable* m = mem->Unref(); + if (m != nullptr) { + to_delete.push_back(m); + } + current->Unref(); +} + +void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current) { + mem = new_mem; + imm = new_imm; + current = new_current; + mem->Ref(); + imm.RefAll(); + current->Ref(); + refs.store(1, std::memory_order_relaxed); +} + // Returns the list of live files in 'sst_live' and the list // of all files in the filesystem in 'all_files'. // no_full_scan = true -- never do the full scan using GetChildren() @@ -518,11 +576,6 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // It is not necessary to hold the mutex when invoking this method. 
void DBImpl::PurgeObsoleteFiles(DeletionState& state) { - // free pending memtables - for (auto m : state.memtables_to_free) { - delete m; - } - // check if there is anything to do if (!state.all_files.size() && !state.sst_delete_files.size() && @@ -1188,6 +1241,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, file_number, pending_outputs_, &deletion_state.memtables_to_free); if (s.ok()) { + InstallSuperVersion(deletion_state); if (madeProgress) { *madeProgress = 1; } @@ -1247,11 +1301,17 @@ int DBImpl::FindMinimumEmptyLevelFitting(int level) { void DBImpl::ReFitLevel(int level, int target_level) { assert(level < NumberLevels()); - MutexLock l(&mutex_); + SuperVersion* superversion_to_free = nullptr; + SuperVersion* new_superversion = + new SuperVersion(options_.max_write_buffer_number); + + mutex_.Lock(); // only allow one thread refitting if (refitting_level_) { + mutex_.Unlock(); Log(options_.info_log, "ReFitLevel: another thread is refitting"); + delete new_superversion; return; } refitting_level_ = true; @@ -1287,6 +1347,8 @@ void DBImpl::ReFitLevel(int level, int target_level) { edit.DebugString().data()); auto status = versions_->LogAndApply(&edit, &mutex_); + superversion_to_free = InstallSuperVersion(new_superversion); + new_superversion = nullptr; Log(options_.info_log, "LogAndApply: %s\n", status.ToString().data()); @@ -1298,6 +1360,10 @@ void DBImpl::ReFitLevel(int level, int target_level) { refitting_level_ = false; bg_work_gate_closed_ = false; + + mutex_.Unlock(); + delete superversion_to_free; + delete new_superversion; } int DBImpl::NumberLevels() { @@ -1671,7 +1737,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); assert(bg_flush_scheduled_); MutexLock l(&mutex_); @@ -1717,7 +1783,7 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { 
void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(options_.max_write_buffer_number); + DeletionState deletion_state(options_.max_write_buffer_number, true); MaybeDumpStats(); @@ -1770,7 +1836,7 @@ void DBImpl::BackgroundCallCompaction() { } Status DBImpl::BackgroundCompaction(bool* madeProgress, - DeletionState& deletion_state) { + DeletionState& deletion_state) { *madeProgress = false; mutex_.AssertHeld(); @@ -1823,6 +1889,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); + InstallSuperVersion(deletion_state); VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", static_cast(f->number), @@ -2484,6 +2551,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, if (status.ok()) { status = InstallCompactionResults(compact); + InstallSuperVersion(deletion_state); } VersionSet::LevelSummaryStorage tmp; Log(options_.info_log, @@ -2588,6 +2656,44 @@ Status DBImpl::Get(const ReadOptions& options, return GetImpl(options, key, value); } +// DeletionState gets created and destructed outside of the lock -- we +// use this convinently to: +// * malloc one SuperVersion() outside of the lock -- new_superversion +// * delete one SuperVersion() outside of the lock -- superversion_to_free +// +// However, if InstallSuperVersion() gets called twice with the same, +// deletion_state, we can't reuse the SuperVersion() that got malloced because +// first call already used it. In that rare case, we take a hit and create a +// new SuperVersion() inside of the mutex. We do similar thing +// for superversion_to_free +void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { + // if new_superversion == nullptr, it means somebody already used it + SuperVersion* new_superversion = + (deletion_state.new_superversion != nullptr) ? 
+ deletion_state.new_superversion : new SuperVersion(); + SuperVersion* old_superversion = InstallSuperVersion(new_superversion); + deletion_state.new_superversion = nullptr; + if (deletion_state.superversion_to_free != nullptr) { + // somebody already put it there + delete old_superversion; + } else { + deletion_state.superversion_to_free = old_superversion; + } +} + +DBImpl::SuperVersion* DBImpl::InstallSuperVersion( + SuperVersion* new_superversion) { + mutex_.AssertHeld(); + new_superversion->Init(mem_, imm_, versions_->current()); + SuperVersion* old_superversion = super_version_; + super_version_ = new_superversion; + if (old_superversion != nullptr && old_superversion->Unref()) { + old_superversion->Cleanup(); + return old_superversion; // will let caller delete outside of mutex + } + return nullptr; +} + Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, @@ -2596,27 +2702,20 @@ Status DBImpl::GetImpl(const ReadOptions& options, StopWatch sw(env_, options_.statistics.get(), DB_GET); SequenceNumber snapshot; - std::vector to_delete; - mutex_.Lock(); if (options.snapshot != nullptr) { snapshot = reinterpret_cast(options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } - MemTable* mem = mem_; - MemTableList imm = imm_; - Version* current = versions_->current(); - mem->Ref(); - imm.RefAll(); - current->Ref(); - - // Unlock while reading from files and memtables + // This can be replaced by using atomics and spinlock instead of big mutex + mutex_.Lock(); + SuperVersion* get_version = super_version_->Ref(); mutex_.Unlock(); + bool have_stat_update = false; Version::GetStats stats; - // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; @@ -2624,32 +2723,41 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, merge_context, options_)) { + if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (imm.Get(lkey, value, &s, merge_context, options_)) { + } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { - current->Get(options, lkey, value, &s, &merge_context, &stats, - options_, value_found); + get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, + options_, value_found); have_stat_update = true; RecordTick(options_.statistics.get(), MEMTABLE_MISS); } - mutex_.Lock(); - if (!options_.disable_seek_compaction && - have_stat_update && current->UpdateStats(stats)) { - MaybeScheduleFlushOrCompaction(); + bool delete_get_version = false; + if (!options_.disable_seek_compaction && have_stat_update) { + mutex_.Lock(); + if (get_version->current->UpdateStats(stats)) { + MaybeScheduleFlushOrCompaction(); + } + if (get_version->Unref()) { + get_version->Cleanup(); + delete_get_version = true; + } + mutex_.Unlock(); + } else { + if (get_version->Unref()) { + mutex_.Lock(); + get_version->Cleanup(); + mutex_.Unlock(); + delete_get_version = true; + } + } + if (delete_get_version) { + delete get_version; } - MemTable* m = mem->Unref(); - imm.UnrefAll(&to_delete); - current->Unref(); - mutex_.Unlock(); - - // free up all obsolete memtables outside the mutex - delete m; - for (MemTable* v: to_delete) delete v; // Note, tickers are atomic now - no lock protection needed any more. 
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); @@ -2813,7 +2921,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { w.done = false; StopWatch sw(env_, options_.statistics.get(), DB_WRITE); - MutexLock l(&mutex_); + mutex_.Lock(); writers_.push_back(&w); while (!w.done && &w != writers_.front()) { w.cv.Wait(); @@ -2824,6 +2932,7 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } if (w.done) { + mutex_.Unlock(); RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); return w.status; } else { @@ -2831,7 +2940,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } // May temporarily unlock and wait. - Status status = MakeRoomForWrite(my_batch == nullptr); + SuperVersion* superversion_to_free = nullptr; + Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free); uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions @@ -2919,6 +3029,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!writers_.empty()) { writers_.front()->cv.Signal(); } + mutex_.Unlock(); + delete superversion_to_free; return status; } @@ -3011,7 +3123,8 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(bool force) { +Status DBImpl::MakeRoomForWrite(bool force, + SuperVersion** superversion_to_free) { mutex_.AssertHeld(); assert(!writers_.empty()); bool allow_delay = !force; @@ -3020,6 +3133,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { uint64_t rate_limit_delay_millis = 0; Status s; double score; + *superversion_to_free = nullptr; while (true) { if (!bg_error_.ok()) { @@ -3146,6 +3260,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { // Do this without holding the dbmutex lock. 
assert(versions_->PrevLogNumber() == 0); uint64_t new_log_number = versions_->NewFileNumber(); + SuperVersion* new_superversion = nullptr; mutex_.Unlock(); { EnvOptions soptions(storage_options_); @@ -3162,6 +3277,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); memtmp = new MemTable( internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + new_superversion = new SuperVersion(options_.max_write_buffer_number); } } mutex_.Lock(); @@ -3186,6 +3302,7 @@ Status DBImpl::MakeRoomForWrite(bool force) { mem_->SetLogNumber(logfile_number_); force = false; // Do not force another compaction if have room MaybeScheduleFlushOrCompaction(); + *superversion_to_free = InstallSuperVersion(new_superversion); } } return s; @@ -3541,7 +3658,7 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData metadata; int maxlevel = NumberLevels(); VersionEdit edit(maxlevel); - DeletionState deletion_state; + DeletionState deletion_state(0, true); { MutexLock l(&mutex_); status = versions_->GetMetadataForFile(number, &level, &metadata); @@ -3571,14 +3688,14 @@ Status DBImpl::DeleteFile(std::string name) { } edit.DeleteFile(level, number); status = versions_->LogAndApply(&edit, &mutex_); + if (status.ok()) { + InstallSuperVersion(deletion_state); + } FindObsoleteFiles(deletion_state, false); } // lock released here LogFlush(options_.info_log); - - if (status.ok()) { - // remove files outside the db-lock - PurgeObsoleteFiles(deletion_state); - } + // remove files outside the db-lock + PurgeObsoleteFiles(deletion_state); return status; } @@ -3678,6 +3795,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { s = impl->versions_->LogAndApply(&edit, &impl->mutex_); } if (s.ok()) { + delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); impl->mem_->SetLogNumber(impl->logfile_number_); impl->DeleteObsoleteFiles(); impl->MaybeScheduleFlushOrCompaction(); diff --git 
a/db/db_impl.h b/db/db_impl.h index 39e132979..2447b31fa 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -128,12 +128,38 @@ class DBImpl : public DB { default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; } - // needed for CleanupIteratorState + // holds references to memtable, all immutable memtables and version + struct SuperVersion { + MemTable* mem; + MemTableList imm; + Version* current; + std::atomic refs; + // We need to_delete because during Cleanup(), imm.UnrefAll() returns + // all memtables that we need to free through this vector. We then + // delete all those memtables outside of mutex, during destruction + std::vector to_delete; + // should be called outside the mutex + explicit SuperVersion(const int num_memtables = 0); + ~SuperVersion(); + SuperVersion* Ref(); + // Returns true if this was the last reference and caller should + // call Clenaup() and delete the object + bool Unref(); + + // call these two methods with db mutex held + // Cleanup unrefs mem, imm and current. Also, it stores all memtables + // that needs to be deleted in to_delete vector. Unrefing those + // objects needs to be done in the mutex + void Cleanup(); + void Init(MemTable* new_mem, const MemTableList& new_imm, + Version* new_current); + }; + + // needed for CleanupIteratorState struct DeletionState { inline bool HaveSomethingToDelete() const { - return memtables_to_free.size() || - all_files.size() || + return all_files.size() || sst_delete_files.size() || log_delete_files.size(); } @@ -155,15 +181,35 @@ class DBImpl : public DB { // a list of memtables to be free std::vector memtables_to_free; + SuperVersion* superversion_to_free; // if nullptr nothing to free + + SuperVersion* new_superversion; // if nullptr no new superversion + // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. 
uint64_t manifest_file_number, log_number, prev_log_number; - explicit DeletionState(const int num_memtables = 0) { + explicit DeletionState(const int num_memtables = 0, + bool create_superversion = false) { manifest_file_number = 0; log_number = 0; prev_log_number = 0; memtables_to_free.reserve(num_memtables); + superversion_to_free = nullptr; + new_superversion = + create_superversion ? new SuperVersion(num_memtables) : nullptr; + } + + ~DeletionState() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversion. if nullptr, this will be noop + delete superversion_to_free; + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; } }; @@ -240,7 +286,11 @@ class DBImpl : public DB { uint64_t* filenumber); uint64_t SlowdownAmount(int n, int top, int bottom); - Status MakeRoomForWrite(bool force /* compact even if there is room? */); + // MakeRoomForWrite will return superversion_to_free through an arugment, + // which the caller needs to delete. We do it because caller can delete + // the superversion outside of mutex + Status MakeRoomForWrite(bool force /* compact even if there is room? */, + SuperVersion** superversion_to_free); WriteBatch* BuildBatchGroup(Writer** last_writer); // Force current memtable contents to be flushed. @@ -324,6 +374,8 @@ class DBImpl : public DB { uint64_t logfile_number_; unique_ptr log_; + SuperVersion* super_version_; + std::string host_name_; // Queue of writers. 
@@ -491,6 +543,18 @@ class DBImpl : public DB { std::vector& snapshots, SequenceNumber* prev_snapshot); + // will return a pointer to SuperVersion* if previous SuperVersion + // if its reference count is zero and needs deletion or nullptr if not + // As argument takes a pointer to allocated SuperVersion + // Foreground threads call this function directly (they don't carry + // deletion state and have to handle their own creation and deletion + // of SuperVersion) + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function above. Background threads carry + // deletion_state which can have new_superversion already allocated. + void InstallSuperVersion(DeletionState& deletion_state); + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here Status GetImpl(const ReadOptions& options, diff --git a/db/version_set.h b/db/version_set.h index bf466a932..75b529942 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -272,12 +272,14 @@ class VersionSet { int64_t NumLevelBytes(int level) const; // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } + uint64_t LastSequence() const { + return last_sequence_.load(std::memory_order_acquire); + } // Set the last sequence number to s. void SetLastSequence(uint64_t s) { assert(s >= last_sequence_); - last_sequence_ = s; + last_sequence_.store(s, std::memory_order_release); } // Mark the specified file number as used. 
@@ -476,7 +478,7 @@ class VersionSet { const InternalKeyComparator icmp_; uint64_t next_file_number_; uint64_t manifest_file_number_; - uint64_t last_sequence_; + std::atomic last_sequence_; uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted From b26dc9562801d935ceb1f4410fbb709851840c99 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 20 Dec 2013 10:01:12 -0800 Subject: [PATCH 06/22] Initialize sequence number in BatchResult - issue #39 --- include/rocksdb/transaction_log.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index abf072574..41a3250d8 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -56,7 +56,7 @@ class LogFile { }; struct BatchResult { - SequenceNumber sequence = SequenceNumber(); + SequenceNumber sequence = 0; std::unique_ptr writeBatchPtr; }; From 71ddb117c840f285d19d43eccf252d7c614cefc9 Mon Sep 17 00:00:00 2001 From: Dhruba Borthakur Date: Mon, 23 Dec 2013 12:19:18 -0800 Subject: [PATCH 07/22] Add a pointer to the engineering design discussion forum. Summary: Add a pointer to the engineering design discussion forum. Test Plan: Reviewers: CC: Task ID: # Blame Rev: --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index c55149d44..473e4145b 100644 --- a/README +++ b/README @@ -79,4 +79,4 @@ include/rocksdb/statistics.h include/rocksdb/transaction_log.h An API to retrieve transaction logs from a database. - +Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ From c01676e46d3be08c3c140361ef1f5884f47d3b3c Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 12 Dec 2013 15:32:56 -0800 Subject: [PATCH 08/22] Implement autovector Summary: A vector that leverages pre-allocated stack-based array to achieve better performance for array with small amount of items. 
Test Plan: Added tests for both correctness and performance Here is the performance benchmark between vector and autovector Please note that in the test "Creation and Insertion Test", the test case were designed with the motivation described below: * no element inserted: internal array of std::vector may not really get initialize. * one element inserted: internal array of std::vector must have initialized. * kSize elements inserted. This shows the most time we'll spend if we keep everything in stack. * 2 * kSize elements inserted. The internal vector of autovector must have been initialized. Note: kSize is the capacity of autovector ===================================================== Creation and Insertion Test ===================================================== created 100000 vectors: each was inserted with 0 elements total time elapsed: 128000 (ns) created 100000 autovectors: each was inserted with 0 elements total time elapsed: 3641000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 0 elements total time elapsed: 9896000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 1 elements total time elapsed: 11089000 (ns) created 100000 autovectors: each was inserted with 1 elements total time elapsed: 5008000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 1 elements total time elapsed: 24271000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 4 elements total time elapsed: 39369000 (ns) created 100000 autovectors: each was inserted with 4 elements total time elapsed: 10121000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 4 elements total time elapsed: 28473000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 8 elements total time elapsed: 75013000 (ns) created 100000 autovectors: each was inserted with 8 elements total time elapsed: 18237000 (ns) created 100000 VectorWithReserveSizes: each 
was inserted with 8 elements total time elapsed: 42464000 (ns) ----------------------------------- created 100000 vectors: each was inserted with 16 elements total time elapsed: 102319000 (ns) created 100000 autovectors: each was inserted with 16 elements total time elapsed: 76724000 (ns) created 100000 VectorWithReserveSizes: each was inserted with 16 elements total time elapsed: 68285000 (ns) ----------------------------------- ===================================================== Sequence Access Test ===================================================== performed 100000 sequence access against vector size: 4 total time elapsed: 198000 (ns) performed 100000 sequence access against autovector size: 4 total time elapsed: 306000 (ns) ----------------------------------- performed 100000 sequence access against vector size: 8 total time elapsed: 565000 (ns) performed 100000 sequence access against autovector size: 8 total time elapsed: 512000 (ns) ----------------------------------- performed 100000 sequence access against vector size: 16 total time elapsed: 1076000 (ns) performed 100000 sequence access against autovector size: 16 total time elapsed: 1070000 (ns) ----------------------------------- Reviewers: dhruba, haobo, sdong, chip Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14655 --- Makefile | 4 + util/autovector.h | 329 ++++++++++++++++++++++++++++++++++++++++ util/autovector_test.cc | 286 ++++++++++++++++++++++++++++++++++ 3 files changed, 619 insertions(+) create mode 100644 util/autovector.h create mode 100644 util/autovector_test.cc diff --git a/Makefile b/Makefile index 0b113c1b5..68bc489c3 100644 --- a/Makefile +++ b/Makefile @@ -49,6 +49,7 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ + autovector_test \ db_test \ table_properties_collector_test \ arena_test \ @@ -226,6 +227,9 @@ signal_test: util/signal_test.o $(LIBOBJECTS) 
arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/util/autovector.h b/util/autovector.h new file mode 100644 index 000000000..2b9cb40e9 --- /dev/null +++ b/util/autovector.h @@ -0,0 +1,329 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include +#include +#include +#include + +namespace rocksdb { + +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * reserve()/shrink_to_fit()/resize() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. 
In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. + typedef T value_type; + typedef typename std::vector::difference_type difference_type; + typedef typename std::vector::size_type size_type; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + typedef iterator_impl self_type; + typedef TValueType value_type; + typedef TValueType& reference; + typedef TValueType* pointer; + typedef typename TAutoVector::difference_type difference_type; + typedef std::random_access_iterator_tag iterator_category; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect) + , index_(index) { + }; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() { } + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // iterator++ + self_type& operator++() { + ++index_; + return *this; + } + + // ++iterator + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // iterator-- + self_type& operator--() { + --index_; + return *this; + } + + // --iterator + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + 
self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + pointer operator->() { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { + return !(*this == other); + } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + typedef iterator_impl iterator; + typedef iterator_impl const_iterator; + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; + + autovector() = default; + ~autovector() = default; + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. + return vect_.capacity() == 0; + } + + size_type size() const { + return num_stack_items_ + vect_.size(); + } + + bool empty() const { + return size() == 0; + } + + // will not check boundary + const_reference operator[](size_type n) const { + return n < kSize ? values_[n] : vect_[n - kSize]; + } + + reference operator[](size_type n) { + return n < kSize ?
values_[n] : vect_[n - kSize]; + } + + // will check boundary + const_reference at(size_type n) const { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference at(size_type n) { + if (n >= size()) { + throw std::out_of_range("autovector: index out of range"); + } + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + push_back(value_type(item)); + } + + template + void emplace_back(Args&&... args) { + push_back(value_type(args...)); + } + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + --num_stack_items_; + } + } + + void clear() { + num_stack_items_ = 0; + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { + assign(other); + } + + autovector& operator=(const autovector& other) { + return assign(other); + } + + // move operations are disallowed since it is very hard to make sure both + // autovectors are allocated from the same function stack.
+ autovector& operator=(autovector&& other) = delete; + autovector(autovector&& other) = delete; + + // -- Iterator Operations + iterator begin() { + return iterator(this, 0); + } + + const_iterator begin() const { + return const_iterator(this, 0); + } + + iterator end() { + return iterator(this, this->size()); + } + + const_iterator end() const { + return const_iterator(this, this->size()); + } + + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { + return reverse_iterator(begin()); + } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + value_type values_[kSize]; // the first `kSize` items + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign(const autovector& other) { + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +} // rocksdb diff --git a/util/autovector_test.cc b/util/autovector_test.cc new file mode 100644 index 000000000..31ce4ed19 --- /dev/null +++ b/util/autovector_test.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/env.h" +#include "util/autovector.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +using namespace std; + +class AutoVectorTest { }; + +const size_t kSize = 8; +TEST(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + ASSERT_TRUE(!vec.only_in_stack()); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST(AutoVectorTest, EmplaceBack) { + typedef std::pair ValueType; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + ASSERT_TRUE(vec.only_in_stack()); + } else { + ASSERT_TRUE(!vec.only_in_stack()); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + ASSERT_TRUE(!vec.only_in_stack()); +} + +void AssertEqual( + const autovector& a, const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} + +TEST(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : { kSize / 2, kSize * 1000 }) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(i + 2 == pos - vec.begin()); + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + } +} + +vector GetTestKeys(size_t size) { + vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + to_string(index++); + } + return keys; +} + +template +void BenchmarkVectorCreationAndInsertion( + string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env = Env::Default(); + + int index = 0; + auto 
start_time = env->NowNanos(); + auto ops_remaining = ops; + while(ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// of our use cases we used std::vector. +TEST(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. + size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized.
+ cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, string_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys + ); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + BenchmarkVectorCreationAndInsertion>( + "vector", kOps, insertions, int_keys + ); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys + ); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + BenchmarkSequenceAccess>( + "vector", kOps, elem_size + ); + BenchmarkSequenceAccess>( + "autovector", kOps, elem_size + ); + cout << "-----------------------------------" << endl; + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} From 079a21ba99cd83c7e94f631c0ba6e250e690bbf4 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 26 Dec 2013 15:12:30 -0800 Subject: 
[PATCH 09/22] Fix the unused variable warning message in mac os --- db/db_test.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 8cfdedd5e..9615c8969 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2549,12 +2549,13 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - explicit ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter(int argv) { + assert(argv == 100); + } virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, bool* value_changed) const override { - assert(argv_ == 100); assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; @@ -2564,9 +2565,6 @@ class ChangeFilter : public CompactionFilter { virtual const char* Name() const override { return "ChangeFilter"; } - - private: - const int argv_; }; class KeepFilterFactory : public CompactionFilterFactory { From 113a08c9291a8a723458f4426312e9c5add61139 Mon Sep 17 00:00:00 2001 From: kailiu Date: Thu, 26 Dec 2013 15:47:07 -0800 Subject: [PATCH 10/22] Fix [-Werror=sign-compare] in autovector_test --- util/autovector_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 31ce4ed19..67fb67b05 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -147,7 +147,8 @@ TEST(AutoVectorTest, Iterators) { } pos = vec.begin(); - for (size_t i = 0; i < vec.size(); i += 2) { + typedef autovector::difference_type diff_type; + for (diff_type i = 0; i < vec.size(); i += 2) { // Cannot use ASSERT_EQ since that macro depends on iostream serialization ASSERT_TRUE(pos + 2 - 2 == pos); pos += 2; From b40c052bfa4e5ec1777f56cf83d572eb53e6d6d1 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 26 Dec 2013 15:56:20 -0800 Subject: [PATCH 11/22] Fix all the comparison issue in fb dev servers --- util/autovector.h | 2 +- util/autovector_test.cc | 15 
+++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/util/autovector.h b/util/autovector.h index 2b9cb40e9..9998e2956 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 67fb67b05..6d709a374 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -48,7 +48,7 @@ TEST(AutoVectorTest, PushBackAndPopBack) { } TEST(AutoVectorTest, EmplaceBack) { - typedef std::pair ValueType; + typedef std::pair ValueType; autovector vec; for (size_t i = 0; i < 1000 * kSize; ++i) { @@ -143,18 +143,19 @@ TEST(AutoVectorTest, Iterators) { // HACK: make sure -> works ASSERT_TRUE(!old->empty()); ASSERT_EQ(old_val, *old); - ASSERT_TRUE(old_val != *pos); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); } pos = vec.begin(); - typedef autovector::difference_type diff_type; - for (diff_type i = 0; i < vec.size(); i += 2) { + for (size_t i = 0; i < vec.size(); i += 2) { // Cannot use ASSERT_EQ since that macro depends on iostream serialization ASSERT_TRUE(pos + 2 - 2 == pos); pos += 2; - ASSERT_TRUE(i + 2 == pos - vec.begin()); ASSERT_TRUE(pos >= vec.begin()); ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); } } @@ -191,7 +192,7 @@ void BenchmarkVectorCreationAndInsertion( } template -void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { +size_t BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { TVector v; for (const auto& item : GetTestKeys(elem_size)) { v.push_back(item); @@ -211,6 +212,8 @@ void BenchmarkSequenceAccess(string name, size_t ops, size_t elem_size) { cout << "performed " << ops << " sequence access against " << name << "\n\t" << "size: " << elem_size << "\n\t" << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; } // This test 
case only reports the performance between std::vector From 18df47b79aaee1bab0442a45caa9d73db8d6fa6f Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 26 Dec 2013 13:49:04 -0800 Subject: [PATCH 12/22] Avoid malloc in NotFound key status if no message is given. Summary: In some places we have NotFound status created with empty message, but it doesn't avoid a malloc. With this patch, the malloc is avoided for that case. The motivation of it is that I found in db_bench readrandom test when all keys are not existing, about 4% of the total running time is spent on malloc of Status, plus a similar amount of CPU spent on free of them, which is not necessary. Test Plan: make all check Reviewers: dhruba, haobo, igor Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D14691 --- db/memtable.cc | 2 +- db/version_set.cc | 4 +- include/rocksdb/status.h | 31 ++++++++------ util/status.cc | 87 +++++++++++++++++++--------------------- 4 files changed, 64 insertions(+), 60 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index d2a51a125..675a314ff 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -225,7 +225,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, *s = Status::Corruption("Error: Could not perform merge."); } } else { - *s = Status::NotFound(Slice()); + *s = Status::NotFound(); } return true; } diff --git a/db/version_set.cc b/db/version_set.cc index 933affd18..46cdfaa61 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -545,7 +545,7 @@ void Version::Get(const ReadOptions& options, case kFound: return; case kDeleted: - *status = Status::NotFound(Slice()); // Use empty error message for speed + *status = Status::NotFound(); // Use empty error message for speed return; case kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); @@ -570,7 +570,7 @@ void Version::Get(const ReadOptions& options, user_key); } } else { - *status = Status::NotFound(Slice()); // Use an empty error 
message for speed + *status = Status::NotFound(); // Use an empty error message for speed } } diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b118e3db4..e2304fdb6 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -25,7 +25,7 @@ namespace rocksdb { class Status { public: // Create a success status. - Status() : state_(nullptr) { } + Status() : code_(kOk), state_(nullptr) { } ~Status() { delete[] state_; } // Copy the specified status. @@ -39,6 +39,10 @@ class Status { static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); } + // Fast path for not found without malloc; + static Status NotFound() { + return Status(kNotFound); + } static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kCorruption, msg, msg2); } @@ -59,7 +63,7 @@ class Status { } // Returns true iff the status indicates success. - bool ok() const { return (state_ == nullptr); } + bool ok() const { return code() == kOk; } // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { return code() == kNotFound; } @@ -87,13 +91,6 @@ class Status { std::string ToString() const; private: - // OK status has a nullptr state_. Otherwise, state_ is a new[] array - // of the following form: - // state_[0..3] == length of message - // state_[4] == code - // state_[5..] == message - const char* state_; - enum Code { kOk = 0, kNotFound = 1, @@ -105,20 +102,30 @@ class Status { kIncomplete = 7 }; - Code code() const { - return (state_ == nullptr) ? kOk : static_cast(state_[4]); - } + // A nullptr state_ (which is always the case for OK) means the message + // is empty. + // of the following form: + // state_[0..3] == length of message + // state_[4..] 
== message + Code code_; + const char* state_; + Code code() const { + return code_; + } + explicit Status(Code code) : code_(code), state_(nullptr) { } Status(Code code, const Slice& msg, const Slice& msg2); static const char* CopyState(const char* s); }; inline Status::Status(const Status& s) { + code_ = s.code_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline void Status::operator=(const Status& s) { // The following condition catches both aliasing (when this == &s), // and the common case where both s and *this are ok. + code_ = s.code_; if (state_ != s.state_) { delete[] state_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); diff --git a/util/status.cc b/util/status.cc index f7c40e952..69060a7cc 100644 --- a/util/status.cc +++ b/util/status.cc @@ -16,68 +16,65 @@ namespace rocksdb { const char* Status::CopyState(const char* state) { uint32_t size; memcpy(&size, state, sizeof(size)); - char* result = new char[size + 5]; - memcpy(result, state, size + 5); + char* result = new char[size + 4]; + memcpy(result, state, size + 4); return result; } -Status::Status(Code code, const Slice& msg, const Slice& msg2) { +Status::Status(Code code, const Slice& msg, const Slice& msg2) : + code_(code) { assert(code != kOk); const uint32_t len1 = msg.size(); const uint32_t len2 = msg2.size(); const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); - char* result = new char[size + 5]; + char* result = new char[size + 4]; memcpy(result, &size, sizeof(size)); - result[4] = static_cast(code); - memcpy(result + 5, msg.data(), len1); + memcpy(result + 4, msg.data(), len1); if (len2) { - result[5 + len1] = ':'; - result[6 + len1] = ' '; - memcpy(result + 7 + len1, msg2.data(), len2); + result[4 + len1] = ':'; + result[5 + len1] = ' '; + memcpy(result + 6 + len1, msg2.data(), len2); } state_ = result; } std::string Status::ToString() const { - if (state_ == nullptr) { - return "OK"; - } else { - char tmp[30]; - const char* type; - switch (code()) { - case kOk: - type = "OK"; - break; - case kNotFound: - type = "NotFound: "; - break; - case kCorruption: - type = "Corruption: "; - break; - case kNotSupported: - type = "Not implemented: "; - break; - case kInvalidArgument: - type = "Invalid argument: "; - break; - case kIOError: - type = "IO error: "; - break; - case kMergeInProgress: - type = "Merge In Progress: "; - break; - default: - snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", - static_cast(code())); - type = tmp; - break; - } - std::string result(type); + char tmp[30]; + const char* type; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge In Progress: "; + break; + default: + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", + static_cast(code())); + type = tmp; + break; + } + std::string result(type); + if (state_ != nullptr) { uint32_t length; memcpy(&length, state_, sizeof(length)); - result.append(state_ + 5, length); - return result; + result.append(state_ + 4, length); } + return result; } } // namespace rocksdb From a094f3b3b5568f4af37c081c7cea4b77e747b2e3 Mon Sep 17 00:00:00 
2001 From: Siying Dong Date: Thu, 26 Dec 2013 16:25:45 -0800 Subject: [PATCH 13/22] TableCache.FindTable() to avoid the mem copy of file number Summary: I'm not sure what's the purpose of encoding file number to a new buffer for looking up the table cache. It seems to be unnecessary to me. With this patch, we point the lookup key to the address of the int64 of the file number. Test Plan: make all check Reviewers: dhruba, haobo, igor, kailiu Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14811 --- db/table_cache.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/db/table_cache.cc b/db/table_cache.cc index e18c20c99..20eb68e4b 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -29,6 +29,11 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } +static Slice GetSliceForFileNumber(uint64_t file_number) { + return Slice(reinterpret_cast(&file_number), + sizeof(file_number)); +} + TableCache::TableCache(const std::string& dbname, const Options* options, const EnvOptions& storage_options, @@ -50,9 +55,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, Cache::Handle** handle, bool* table_io, const bool no_io) { Status s; - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); + Slice key = GetSliceForFileNumber(file_number); *handle = cache_->Lookup(key); if (*handle == nullptr) { if (no_io) { // Dont do IO and return a not-found status @@ -165,9 +168,7 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options, } void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); + cache_->Erase(GetSliceForFileNumber(file_number)); } } // namespace rocksdb From 9d4dc0da2746ddf02acf1739a53243b6a4df384b Mon Sep 17 00:00:00 2001 From: dyu Date: Fri, 27 Dec 2013 15:19:31 +0800 Subject: [PATCH 14/22] fix build bug from recent 
commit:https://github.com/facebook/rocksdb/commit/43c386b72ee834c88a1a22500ce1fc36a8208277 --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 96a1fb331..497ca2f3d 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -198,7 +198,7 @@ EOF } EOF if [ "$?" = 0 ]; then - COMMON_FLAGS="$PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT" + COMMON_FLAGS="$COMMON_FLAGS $PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT" fi # Test whether Snappy library is installed From a6b476a2ac453398327b8d4d00ef85911520841c Mon Sep 17 00:00:00 2001 From: dyu Date: Mon, 30 Dec 2013 21:33:52 +0800 Subject: [PATCH 15/22] tweak build bug fix --- build_tools/build_detect_platform | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 497ca2f3d..87c4c871d 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -198,7 +198,7 @@ EOF } EOF if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $PLATFORM_LDFLAGS -DROCKSDB_FALLOCATE_PRESENT" + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" fi # Test whether Snappy library is installed From e842b99fc5f4536cadf80b287c6887a49d4c2d79 Mon Sep 17 00:00:00 2001 From: dyu Date: Mon, 30 Dec 2013 21:34:45 +0800 Subject: [PATCH 16/22] docs for shared library builds --- INSTALL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 07d975068..ab0460341 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -44,5 +44,7 @@ libraries. You are on your own. `make clean; make` will compile librocksdb.a (RocskDB static library) and all the unit tests. You can run all unit tests with `make check`. +For shared library builds, exec `make librocksdb.so` instead. 
+ If you followed the above steps and your compile or unit tests fail, please submit an issue: (https://github.com/facebook/rocksdb/issues) From 1795397bf0f28949d93ee820513eae1bf37c88e8 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Mon, 30 Dec 2013 14:53:56 -0800 Subject: [PATCH 17/22] Update README.fb Update the latest version number. --- README.fb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.fb b/README.fb index d17eed832..1c06ae980 100644 --- a/README.fb +++ b/README.fb @@ -1,3 +1,3 @@ * Detailed instructions on how to compile using fbcode and jemalloc -* Latest release is 2.5.fb +* Latest release is 2.6.fb From 5a20744a6a581ae3932eb8634bc1aef8b1bf51ae Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Mon, 30 Dec 2013 16:14:55 -0800 Subject: [PATCH 18/22] Simplify build_tools/build_detect_version --- build_tools/build_detect_version | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/build_detect_version b/build_tools/build_detect_version index eef905d8e..3ee6c92bd 100755 --- a/build_tools/build_detect_version +++ b/build_tools/build_detect_version @@ -8,7 +8,7 @@ # # create git version file -VFILE=$ROCKSDB_ROOT/util/build_version.cc.tmp +VFILE=$PWD/util/build_version.cc.tmp trap "rm $VFILE" EXIT # check to see if git is in the path @@ -36,7 +36,7 @@ echo "const char* rocksdb_build_git_datetime = \"rocksdb_build_git_datetime:$(da echo "const char* rocksdb_build_compile_date = __DATE__;" >> ${VFILE} echo "const char* rocksdb_build_compile_time = __TIME__;" >> ${VFILE} -OUTFILE=$ROCKSDB_ROOT/util/build_version.cc +OUTFILE=$PWD/util/build_version.cc if [ ! -e $OUTFILE ] || ! 
cmp -s $VFILE $OUTFILE; then cp $VFILE $OUTFILE fi From fe030bd1ca775baed7ae8e4f0bf2d94a419d9af1 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Mon, 30 Dec 2013 16:16:24 -0800 Subject: [PATCH 19/22] update the latest version in README.fb to 2.7 --- README.fb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.fb b/README.fb index 1c06ae980..d3cc4110b 100644 --- a/README.fb +++ b/README.fb @@ -1,3 +1,3 @@ * Detailed instructions on how to compile using fbcode and jemalloc -* Latest release is 2.6.fb +* Latest release is 2.7.fb From 52ea1be90aeac4560a4993cda2c48b4dc084f2ce Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 2 Jan 2014 02:00:04 -0800 Subject: [PATCH 20/22] Add -DROCKSDB_FALLOCATE_PRESENT to fbcode build --- build_tools/fbcode.gcc471.sh | 2 +- build_tools/fbcode.gcc481.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/fbcode.gcc471.sh b/build_tools/fbcode.gcc471.sh index e8a0cdeaa..929405715 100644 --- a/build_tools/fbcode.gcc471.sh +++ b/build_tools/fbcode.gcc471.sh @@ -54,7 +54,7 @@ RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index 7ca337cf2..ae2bb57da 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -61,7 +61,7 @@ RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold 
-m64 -mtune=generic" CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" From 345fb94d26abeb7f37b6398ded949fb213af3f8a Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 2 Jan 2014 03:30:29 -0800 Subject: [PATCH 21/22] moving autovector_test after db_test --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 68bc489c3..ff8347957 100644 --- a/Makefile +++ b/Makefile @@ -49,8 +49,8 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ - autovector_test \ db_test \ + autovector_test \ table_properties_collector_test \ arena_test \ auto_roll_logger_test \ From b60c14f6ee00dc179d400573d4b172d228a8c5a8 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Thu, 2 Jan 2014 03:33:42 -0800 Subject: [PATCH 22/22] Support multi-threaded DisableFileDeletions() and EnableFileDeletions() Summary: We don't want two threads to clash if they concurrently call DisableFileDeletions() and EnableFileDeletions(). I'm adding a counter that will enable file deletions only after all DisableFileDeletions() calls have been negated with EnableFileDeletions(). However, we also don't want to break the old behavior, so I added a parameter force to EnableFileDeletions(). If force is true, we will still enable file deletions after every call to EnableFileDeletions(), which is what is happening now. 
Test Plan: make check Reviewers: dhruba, haobo, sanketh Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D14781 --- db/db_filesnapshot.cc | 28 ++++++++++++++++------ db/db_impl.cc | 6 ++--- db/db_impl.h | 9 +++++-- db/db_impl_readonly.h | 2 +- db/db_test.cc | 2 +- include/rocksdb/db.h | 10 +++++++- include/utilities/stackable_db.h | 4 ++-- utilities/backupable/backupable_db_test.cc | 2 +- 8 files changed, 45 insertions(+), 18 deletions(-) diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 7b9c5ddeb..a7232246a 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -22,20 +22,34 @@ namespace rocksdb { Status DBImpl::DisableFileDeletions() { MutexLock l(&mutex_); - disable_delete_obsolete_files_ = true; - Log(options_.info_log, "File Deletions Disabled"); + ++disable_delete_obsolete_files_; + if (disable_delete_obsolete_files_ == 1) { + // if not, it has already been disabled, so don't log anything + Log(options_.info_log, "File Deletions Disabled"); + } return Status::OK(); } -Status DBImpl::EnableFileDeletions() { +Status DBImpl::EnableFileDeletions(bool force) { DeletionState deletion_state; + bool should_purge_files = false; { MutexLock l(&mutex_); - disable_delete_obsolete_files_ = false; - Log(options_.info_log, "File Deletions Enabled"); - FindObsoleteFiles(deletion_state, true); + if (force) { + // if force, we need to enable file deletions right away + disable_delete_obsolete_files_ = 0; + } else if (disable_delete_obsolete_files_ > 0) { + --disable_delete_obsolete_files_; + } + if (disable_delete_obsolete_files_ == 0) { + Log(options_.info_log, "File Deletions Enabled"); + should_purge_files = true; + FindObsoleteFiles(deletion_state, true); + } + } + if (should_purge_files) { + PurgeObsoleteFiles(deletion_state); } - PurgeObsoleteFiles(deletion_state); LogFlush(options_.info_log); return Status::OK(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index ece08db8b..48dc367da 100644 --- a/db/db_impl.cc 
+++ b/db/db_impl.cc @@ -248,7 +248,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) bg_logstats_scheduled_(false), manual_compaction_(nullptr), logger_(nullptr), - disable_delete_obsolete_files_(false), + disable_delete_obsolete_files_(0), delete_obsolete_files_last_run_(options.env->NowMicros()), purge_wal_files_last_run_(0), last_stats_dump_time_microsec_(0), @@ -513,7 +513,7 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, mutex_.AssertHeld(); // if deletion is disabled, do nothing - if (disable_delete_obsolete_files_) { + if (disable_delete_obsolete_files_ > 0) { return; } @@ -1248,7 +1248,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, MaybeScheduleLogDBDeployStats(); - if (!disable_delete_obsolete_files_) { + if (disable_delete_obsolete_files_ == 0) { // add to deletion state deletion_state.log_delete_files.insert( deletion_state.log_delete_files.end(), diff --git a/db/db_impl.h b/db/db_impl.h index 2447b31fa..52e8221f5 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -72,7 +72,7 @@ class DBImpl : public DB { virtual const Options& GetOptions() const; virtual Status Flush(const FlushOptions& options); virtual Status DisableFileDeletions(); - virtual Status EnableFileDeletions(); + virtual Status EnableFileDeletions(bool force); // All the returned filenames start with "/" virtual Status GetLiveFiles(std::vector&, uint64_t* manifest_file_size, @@ -416,7 +416,12 @@ class DBImpl : public DB { int64_t volatile last_log_ts; // shall we disable deletion of obsolete files - bool disable_delete_obsolete_files_; + // if 0 the deletion is enabled. 
+ // if non-zero, files will not be getting deleted + // This enables two different threads to call + // EnableFileDeletions() and DisableFileDeletions() + // without any synchronization + int disable_delete_obsolete_files_; // last time when DeleteObsoleteFiles was invoked uint64_t delete_obsolete_files_last_run_; diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index af9c79ed0..4beaedd01 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -55,7 +55,7 @@ public: virtual Status DisableFileDeletions() { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status EnableFileDeletions() { + virtual Status EnableFileDeletions(bool force) { return Status::NotSupported("Not supported operation in read only mode."); } virtual Status GetLiveFiles(std::vector&, diff --git a/db/db_test.cc b/db/db_test.cc index 9615c8969..a0b3d9aaa 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4462,7 +4462,7 @@ class ModelDB: public DB { virtual Status DisableFileDeletions() { return Status::OK(); } - virtual Status EnableFileDeletions() { + virtual Status EnableFileDeletions(bool force) { return Status::OK(); } virtual Status GetLiveFiles(std::vector&, uint64_t* size, diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index c4c5aa87f..dd17d9e9b 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -247,7 +247,15 @@ class DB { virtual Status DisableFileDeletions() = 0; // Allow compactions to delete obselete files. - virtual Status EnableFileDeletions() = 0; + // If force == true, the call to EnableFileDeletions() will guarantee that + // file deletions are enabled after the call, even if DisableFileDeletions() + // was called multiple times before. 
+ // If force == false, EnableFileDeletions will only enable file deletion + // after it's been called at least as many times as DisableFileDeletions(), + // enabling the two methods to be called by two threads concurrently without + // synchronization -- i.e., file deletions will be enabled only after both + // threads call EnableFileDeletions() + virtual Status EnableFileDeletions(bool force = true) = 0; // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup diff --git a/include/utilities/stackable_db.h b/include/utilities/stackable_db.h index 2d86a611b..908fe10b7 100644 --- a/include/utilities/stackable_db.h +++ b/include/utilities/stackable_db.h @@ -123,8 +123,8 @@ class StackableDB : public DB { return db_->DisableFileDeletions(); } - virtual Status EnableFileDeletions() override { - return db_->EnableFileDeletions(); + virtual Status EnableFileDeletions(bool force) override { + return db_->EnableFileDeletions(force); } virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index af4af0d02..c64f0170b 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -48,7 +48,7 @@ class DummyDB : public StackableDB { return options_; } - virtual Status EnableFileDeletions() override { + virtual Status EnableFileDeletions(bool force) override { ASSERT_TRUE(!deletions_enabled_); deletions_enabled_ = true; return Status::OK();