diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index ecc408fe2..884f2e836 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -82,7 +82,7 @@ case "$TARGET_OS" in if [ -z "$USE_CLANG" ]; then COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" fi - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) diff --git a/db/dbformat.cc b/db/dbformat.cc index 8456ba2f2..c1caa955f 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -6,7 +6,7 @@ #include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" -#include "rocksdb/perf_context.h" +#include "util/perf_context_imp.h" namespace leveldb { @@ -54,7 +54,7 @@ int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { // decreasing sequence number // decreasing type (though sequence# should be enough to disambiguate) int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - perf_context.user_key_comparison_count++; + BumpPerfCount(&perf_context.user_key_comparison_count); if (r == 0) { const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); diff --git a/db/perf_context.cc b/db/perf_context.cc deleted file mode 100644 index 08b5d8c6e..000000000 --- a/db/perf_context.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "rocksdb/perf_context.h" - - -namespace leveldb { - -void PerfContext::Reset() { - user_key_comparison_count = 0; -} - -__thread PerfContext perf_context; - -} diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index cccbb356b..95b188022 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -1,7 +1,10 @@ #include +#include #include "rocksdb/db.h" #include "rocksdb/perf_context.h" +#include "util/histogram.h" +#include "util/stop_watch.h" #include "util/testharness.h" @@ -10,11 +13,11 @@ namespace leveldb { // Path to the database on file system const std::string kDbName = test::TmpDir() + "/perf_context_test"; -std::shared_ptr OpenDb() { +std::shared_ptr OpenDb(size_t write_buffer_size) { DB* db; Options options; options.create_if_missing = true; - options.write_buffer_size = 1000000000; // give it a big memtable + options.write_buffer_size = write_buffer_size; Status s = DB::Open(options, kDbName, &db); ASSERT_OK(s); return std::shared_ptr(db); @@ -24,11 +27,48 @@ class PerfContextTest { }; int kTotalKeys = 100; -TEST(PerfContextTest, KeyComparisonCount) { +TEST(PerfContextTest, StopWatchNanoOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + StopWatchNano timer(Env::Default(), true); + for (auto& timing : timings) { + timing = timer.ElapsedNanos(true /* reset */); + } + + HistogramImpl histogram; + for (const auto timing : timings) { + histogram.Add(timing); + } + + std::cout << histogram.ToString(); +} + +TEST(PerfContextTest, StopWatchOverhead) { + // profile the timer cost by itself! + const int kTotalIterations = 1000000; + std::vector timings(kTotalIterations); + + StopWatch timer(Env::Default()); + for (auto& timing : timings) { + timing = timer.ElapsedMicros(); + } + + HistogramImpl histogram; + uint64_t prev_timing = 0; + for (const auto timing : timings) { + histogram.Add(timing - prev_timing); + prev_timing = timing; + } + + std::cout << histogram.ToString(); +} + +void ProfileKeyComparison() { DestroyDB(kDbName, Options()); // Start this test with a fresh DB - auto db = OpenDb(); + auto db = OpenDb(1000000000); WriteOptions write_options; ReadOptions read_options; @@ -63,12 +103,20 @@ TEST(PerfContextTest, KeyComparisonCount) { << max_user_key_comparison_get << "\n" << "avg user key comparison get:" << total_user_key_comparison_get/kTotalKeys << "\n"; - } +TEST(PerfContextTest, KeyComparisonCount) { + SetPerfLevel(kDisable); + ProfileKeyComparison(); + SetPerfLevel(kEnableCount); + ProfileKeyComparison(); + + SetPerfLevel(kEnableTime); + ProfileKeyComparison(); } +} int main(int argc, char** argv) { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index ebfe4021b..f4803e2e5 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -190,6 +190,13 @@ class Env { // useful for computing deltas of time. virtual uint64_t NowMicros() = 0; + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros + virtual uint64_t NowNanos() { + return NowMicros() * 1000; + } + // Sleep/delay the thread for the perscribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 953a160a6..45566ff19 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -5,6 +5,15 @@ namespace leveldb { +enum PerfLevel { + kDisable = 0, // disable perf stats + kEnableCount = 1, // enable only count stats + kEnableTime = 2 // enable time stats too +}; + +// set the perf stats level +void SetPerfLevel(PerfLevel level); + // A thread local context for gathering performance counter efficiently // and transparently. @@ -12,8 +21,13 @@ struct PerfContext { void Reset(); // reset all performance counters to zero - uint64_t user_key_comparison_count; // total number of user key comparisons + uint64_t block_cache_hit_count; + uint64_t block_read_count; + uint64_t block_read_byte; + uint64_t block_read_time; + uint64_t block_checksum_time; + uint64_t block_decompress_time; }; extern __thread PerfContext perf_context; diff --git a/table/format.cc b/table/format.cc index e2e6103bd..60467297f 100644 --- a/table/format.cc +++ b/table/format.cc @@ -4,11 +4,12 @@ #include "table/format.h" -#include "rocksdb/env.h" #include "port/port.h" +#include "rocksdb/env.h" #include "table/block.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/perf_context_imp.h" namespace leveldb { @@ -69,7 +70,8 @@ Status Footer::DecodeFrom(Slice* input) { Status ReadBlockContents(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - BlockContents* result) { + BlockContents* result, + Env* env) { result->data = Slice(); result->cachable = false; result->heap_allocated = false; @@ -79,7 +81,14 @@ Status ReadBlockContents(RandomAccessFile* file, size_t n = static_cast(handle.size()); char* buf = new char[n + kBlockTrailerSize]; Slice contents; + + StopWatchNano timer(env); + StartPerfTimer(&timer); Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf); + BumpPerfCount(&perf_context.block_read_count); + BumpPerfCount(&perf_context.block_read_byte, n + kBlockTrailerSize); + BumpPerfTime(&perf_context.block_read_time, &timer); + if (!s.ok()) { delete[] buf; return s; @@ -99,6 +108,7 @@ Status ReadBlockContents(RandomAccessFile* file, s = Status::Corruption("block checksum mismatch"); return s; } + BumpPerfTime(&perf_context.block_checksum_time, &timer); } char* ubuf = nullptr; @@ -172,6 +182,8 @@ Status ReadBlockContents(RandomAccessFile* file, return Status::Corruption("bad block type"); } + BumpPerfTime(&perf_context.block_decompress_time, &timer); + return Status::OK(); } diff --git a/table/format.h b/table/format.h index 88008dafd..31fa4f6b6 100644 --- a/table/format.h +++ b/table/format.h @@ -94,7 +94,8 @@ struct BlockContents { extern Status ReadBlockContents(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, - BlockContents* result); + BlockContents* result, + Env* env); // Implementation details follow. Clients should ignore, diff --git a/table/table.cc b/table/table.cc index d2d198fb8..6b59b3ce3 100644 --- a/table/table.cc +++ b/table/table.cc @@ -19,6 +19,7 @@ #include "table/two_level_iterator.h" #include "util/coding.h" +#include "util/perf_context_imp.h" #include "util/stop_watch.h" namespace leveldb { @@ -81,9 +82,10 @@ Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, Block** result, + Env* env, bool* didIO = nullptr) { BlockContents contents; - Status s = ReadBlockContents(file, options, handle, &contents); + Status s = ReadBlockContents(file, options, handle, &contents, env); if (s.ok()) { *result = new Block(contents); } @@ -118,14 +120,14 @@ Status Table::Open(const Options& options, return Status::InvalidArgument("file is too short to be an sstable"); } - Footer footer; s = footer.DecodeFrom(&footer_input); if (!s.ok()) return s; Block* index_block = nullptr; // TODO: we never really verify check sum for index block - s = ReadBlock(file.get(), ReadOptions(), footer.index_handle(), &index_block); + s = ReadBlock(file.get(), ReadOptions(), footer.index_handle(), &index_block, + options.env); if (s.ok()) { // We've successfully read the footer and the index block: we're @@ -176,7 +178,7 @@ void Table::ReadMeta(const Footer& footer) { // TODO: we never really verify check sum for meta index block Block* meta = nullptr; if (!ReadBlock(rep_->file.get(), ReadOptions(), footer.metaindex_handle(), - &meta).ok()) { + &meta, rep_->options.env).ok()) { // Do not propagate errors since meta info is not needed for operation return; } @@ -203,7 +205,8 @@ void Table::ReadFilter(const Slice& filter_handle_value) { // requiring checksum verification in Table::Open. ReadOptions opt; BlockContents block; - if (!ReadBlockContents(rep_->file.get(), opt, filter_handle, &block).ok()) { + if (!ReadBlockContents(rep_->file.get(), opt, filter_handle, &block, + rep_->options.env).ok()) { return; } if (block.heap_allocated) { @@ -266,6 +269,7 @@ Iterator* Table::BlockReader(void* arg, if (cache_handle != nullptr) { block = reinterpret_cast(block_cache->Value(cache_handle)); + BumpPerfCount(&perf_context.block_cache_hit_count); RecordTick(statistics, BLOCK_CACHE_HIT); } else if (no_io) { // Did not find in block_cache and can't do IO @@ -280,6 +284,7 @@ Iterator* Table::BlockReader(void* arg, options, handle, &block, + table->rep_->options.env, didIO ); } @@ -295,8 +300,9 @@ Iterator* Table::BlockReader(void* arg, } else if (no_io) { // Could not read from block_cache and can't do IO return NewErrorIterator(Status::Incomplete("no blocking io")); - }else { - s = ReadBlock(table->rep_->file.get(), options, handle, &block, didIO); + } else { + s = ReadBlock(table->rep_->file.get(), options, handle, &block, + table->rep_->options.env, didIO); } } diff --git a/util/env_posix.cc b/util/env_posix.cc index 59314fefb..3771c406d 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -960,6 +960,12 @@ class PosixEnv : public Env { return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } + virtual uint64_t NowNanos() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + } + virtual void SleepForMicroseconds(int micros) { usleep(micros); } diff --git a/util/perf_context.cc b/util/perf_context.cc new file mode 100644 index 000000000..48525d28a --- /dev/null +++ b/util/perf_context.cc @@ -0,0 +1,22 @@ +#include "util/perf_context_imp.h" + +namespace leveldb { + +// by default, enable counts only +PerfLevel perf_level = kEnableCount; + +void SetPerfLevel(PerfLevel level) { perf_level = level; } + +void PerfContext::Reset() { + user_key_comparison_count = 0; + block_cache_hit_count = 0; + block_read_count = 0; + block_read_byte = 0; + block_read_time = 0; + block_checksum_time = 0; + block_decompress_time = 0; +} + +__thread PerfContext perf_context; + +} diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h new file mode 100644 index 000000000..66df1c871 --- /dev/null +++ b/util/perf_context_imp.h @@ -0,0 +1,33 @@ +#ifndef PERF_CONTEXT_IMP_H +#define PERF_CONTEXT_IMP_H + +#include "rocksdb/perf_context.h" +#include "util/stop_watch.h" + +namespace leveldb { + +extern enum PerfLevel perf_level; + +inline void StartPerfTimer(StopWatchNano* timer) { + if (perf_level >= PerfLevel::kEnableTime) { + timer->Start(); + } +} + +inline void BumpPerfCount(uint64_t* count, uint64_t delta = 1) { + if (perf_level >= PerfLevel::kEnableCount) { + *count += delta; + } +} + +inline void BumpPerfTime(uint64_t* time, + StopWatchNano* timer, + bool reset = true) { + if (perf_level >= PerfLevel::kEnableTime) { + *time += timer->ElapsedNanos(reset); + } +} + +} + +#endif diff --git a/util/stop_watch.h b/util/stop_watch.h index ac0c02b9b..8ccf2a006 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -11,8 +11,8 @@ class StopWatch { public: StopWatch( Env * const env, - std::shared_ptr statistics, - const Histograms histogram_name) : + std::shared_ptr statistics = nullptr, + const Histograms histogram_name = DB_GET) : env_(env), start_time_(env->NowMicros()), statistics_(statistics), @@ -37,5 +37,32 @@ class StopWatch { const Histograms histogram_name_; }; + +// a nano second precision stopwatch +class StopWatchNano { + public: + StopWatchNano(Env* const env, bool auto_start = false) + : env_(env), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = env_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = env_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + private: + Env* const env_; + uint64_t start_; +}; + } // namespace leveldb #endif // STORAGE_LEVELDB_UTIL_STOP_WATCH_H_