From da1c64b6e75048068bf8599f198b8fe0e54eedfc Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 20 Dec 2018 12:00:40 -0800 Subject: [PATCH] Introduce a CPU time counter in perf_context (#4741) Summary: Introduce the first CPU timing counter, perf_context.get_cpu_nanos. This opens a door to more CPU counters in the future. Only Posix Env has it implemented using clock_gettime() with CLOCK_THREAD_CPUTIME_ID. How accurate the counter is depends on the platform. Make PerfStepTimer to take an Env as an argument, and sometimes pass it in. The direct reason is to make the unit tests to use SpecialEnv where we can ingest logic there. But in long term, this is a good change. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4741 Differential Revision: D13287798 Pulled By: siying fbshipit-source-id: 090361049d9d5095d1d1a369fe1338d2e2e1c73f --- db/db_impl.cc | 2 ++ db/db_test2.cc | 34 ++++++++++++++++++++++++++++++++++ db/db_test_util.h | 7 +++++++ db/table_cache.cc | 2 +- env/env_posix.cc | 9 +++++++++ include/rocksdb/env.h | 3 +++ include/rocksdb/perf_context.h | 3 +++ include/rocksdb/perf_level.h | 7 +++++-- monitoring/perf_context.cc | 2 ++ monitoring/perf_context_imp.h | 25 +++++++++++++++++++------ monitoring/perf_step_timer.h | 32 +++++++++++++++++++++----------- 11 files changed, 106 insertions(+), 20 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index d94ee76fa..8502e47b7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1216,6 +1216,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, PinnableSlice* pinnable_val, bool* value_found, ReadCallback* callback, bool* is_blob_index) { assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); @@ -1330,6 +1331,7 @@ std::vector DBImpl::MultiGet( const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); diff --git a/db/db_test2.cc b/db/db_test2.cc index 2d6fadc35..e7a7bfe82 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -1646,6 +1646,40 @@ class MockPersistentCache : public PersistentCache { const size_t max_size_ = 10 * 1024; // 10KiB }; +#ifdef OS_LINUX +// Make sure that in CPU time perf context counters, Env::NowCPUNanos() +// is used, rather than Env::CPUNanos(); +TEST_F(DBTest2, TestPerfContextCpuTime) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + env_->now_cpu_count_.store(0); + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + + uint64_t kDummyAddonTime = uint64_t{1000000000000}; + + // Add time to NowNanos() reading. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(env_->now_cpu_count_.load(), 2); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // OS_LINUX + #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, PersistentCache) { int num_iter = 80; diff --git a/db/db_test_util.h b/db/db_test_util.h index 7c2d79498..3638ca7fa 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -503,6 +503,11 @@ class SpecialEnv : public EnvWrapper { return s; } + virtual uint64_t NowCPUNanos() override { + now_cpu_count_.fetch_add(1); + return target()->NowCPUNanos(); + } + virtual uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + addon_time_.load() * 1000; @@ -572,6 +577,8 @@ class SpecialEnv : public EnvWrapper { std::atomic addon_time_; + std::atomic now_cpu_count_; + std::atomic delete_count_; std::atomic time_elapse_only_sleep_; diff --git a/db/table_cache.cc b/db/table_cache.cc index 5c0f95716..764c05bfa 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -147,7 +147,7 @@ Status TableCache::FindTable(const EnvOptions& env_options, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache) { - PERF_TIMER_GUARD(find_table_nanos); + PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); Status s; uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); diff --git a/env/env_posix.cc b/env/env_posix.cc index c2e456a66..b64a13bb4 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -843,6 +843,15 @@ class PosixEnv : public Env { #endif } + virtual uint64_t NowCPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + virtual void SleepForMicroseconds(int micros) override { usleep(micros); } virtual Status GetHostName(char* name, uint64_t len) override { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bc439ac1c..07bc3aba3 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -384,6 +384,9 @@ class Env { return NowMicros() * 1000; } + // 0 indicates not supported. + virtual uint64_t NowCPUNanos() { return 0; } + // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9fec56b8a..d72f8f371 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -205,6 +205,9 @@ struct PerfContext { uint64_t env_lock_file_nanos; uint64_t env_unlock_file_nanos; uint64_t env_new_logger_nanos; + + uint64_t get_cpu_nanos; + std::map* level_to_perf_context; bool per_level_perf_context_enabled; }; diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h index 218c6015f..de0a214d6 100644 --- a/include/rocksdb/perf_level.h +++ b/include/rocksdb/perf_level.h @@ -17,8 +17,11 @@ enum PerfLevel : unsigned char { kEnableCount = 2, // enable only count stats kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time // stats except for mutexes - kEnableTime = 4, // enable count and time stats - kOutOfBounds = 5 // N.B. Must always be the last value! + // Other than time, also measure CPU time counters. Still don't measure + // time (neither wall time nor CPU time) for mutexes. + kEnableTimeAndCPUTimeExceptForMutex = 4, + kEnableTime = 5, // enable count and time stats + kOutOfBounds = 6 // N.B. Must always be the last value! }; // set the perf stats level for current thread diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 6c164cfe9..b82fa4f27 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -114,6 +114,7 @@ void PerfContext::Reset() { env_lock_file_nanos = 0; env_unlock_file_nanos = 0; env_new_logger_nanos = 0; + get_cpu_nanos = 0; if (per_level_perf_context_enabled && level_to_perf_context) { for (auto& kv : *level_to_perf_context) { kv.second.Reset(); @@ -224,6 +225,7 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(env_lock_file_nanos); PERF_CONTEXT_OUTPUT(env_unlock_file_nanos); PERF_CONTEXT_OUTPUT(env_new_logger_nanos); + PERF_CONTEXT_OUTPUT(get_cpu_nanos); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index d67654914..e0ff8afc5 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -41,12 +41,25 @@ extern thread_local PerfContext perf_context; PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ perf_step_timer_##metric.Start(); -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ - ticker_type) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), true, stats, \ - ticker_type); \ - if (condition) { \ - perf_step_timer_##metric.Start(); \ +// Declare and set start time of the timer +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ + perf_step_timer_##metric.Start(); + +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + false, PerfLevel::kEnableTime, stats, \ + ticker_type); \ + if (condition) { \ + perf_step_timer_##metric.Start(); \ } // Update metric with time elapsed since last START. start time is reset diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index 246d6eb75..6501bd54a 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -12,14 +12,15 @@ namespace rocksdb { class PerfStepTimer { public: - explicit PerfStepTimer(uint64_t* metric, bool for_mutex = false, - Statistics* statistics = nullptr, - uint32_t ticker_type = 0) - : perf_counter_enabled_( - perf_level >= PerfLevel::kEnableTime || - (!for_mutex && perf_level >= kEnableTimeExceptForMutex)), - env_((perf_counter_enabled_ || statistics != nullptr) ? Env::Default() - : nullptr), + explicit PerfStepTimer( + uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, + Statistics* statistics = nullptr, uint32_t ticker_type = 0) + : perf_counter_enabled_(perf_level >= enable_level), + use_cpu_time_(use_cpu_time), + env_((perf_counter_enabled_ || statistics != nullptr) + ? ((env != nullptr) ? env : Env::Default()) + : nullptr), start_(0), metric_(metric), statistics_(statistics), @@ -31,13 +32,21 @@ class PerfStepTimer { void Start() { if (perf_counter_enabled_ || statistics_ != nullptr) { - start_ = env_->NowNanos(); + start_ = time_now(); + } + } + + uint64_t time_now() { + if (!use_cpu_time_) { + return env_->NowNanos(); + } else { + return env_->NowCPUNanos(); } } void Measure() { if (start_) { - uint64_t now = env_->NowNanos(); + uint64_t now = time_now(); *metric_ += now - start_; start_ = now; } @@ -45,7 +54,7 @@ class PerfStepTimer { void Stop() { if (start_) { - uint64_t duration = env_->NowNanos() - start_; + uint64_t duration = time_now() - start_; if (perf_counter_enabled_) { *metric_ += duration; } @@ -59,6 +68,7 @@ class PerfStepTimer { private: const bool perf_counter_enabled_; + const bool use_cpu_time_; Env* const env_; uint64_t start_; uint64_t* metric_;