diff --git a/db/db_impl.cc b/db/db_impl.cc index d94ee76fa..8502e47b7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1216,6 +1216,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, PinnableSlice* pinnable_val, bool* value_found, ReadCallback* callback, bool* is_blob_index) { assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); @@ -1330,6 +1331,7 @@ std::vector DBImpl::MultiGet( const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); diff --git a/db/db_test2.cc b/db/db_test2.cc index 2d6fadc35..e7a7bfe82 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -1646,6 +1646,40 @@ class MockPersistentCache : public PersistentCache { const size_t max_size_ = 10 * 1024; // 10KiB }; +#ifdef OS_LINUX +// Make sure that in CPU time perf context counters, Env::NowCPUNanos() +// is used, rather than Env::CPUNanos(); +TEST_F(DBTest2, TestPerfContextCpuTime) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + env_->now_cpu_count_.store(0); + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + + uint64_t kDummyAddonTime = uint64_t{1000000000000}; + + // Add time to NowNanos() reading. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(env_->now_cpu_count_.load(), 2); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // OS_LINUX + #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, PersistentCache) { int num_iter = 80; diff --git a/db/db_test_util.h b/db/db_test_util.h index 7c2d79498..3638ca7fa 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -503,6 +503,11 @@ class SpecialEnv : public EnvWrapper { return s; } + virtual uint64_t NowCPUNanos() override { + now_cpu_count_.fetch_add(1); + return target()->NowCPUNanos(); + } + virtual uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + addon_time_.load() * 1000; @@ -572,6 +577,8 @@ class SpecialEnv : public EnvWrapper { std::atomic addon_time_; + std::atomic now_cpu_count_; + std::atomic delete_count_; std::atomic time_elapse_only_sleep_; diff --git a/db/table_cache.cc b/db/table_cache.cc index 5c0f95716..764c05bfa 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -147,7 +147,7 @@ Status TableCache::FindTable(const EnvOptions& env_options, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache) { - PERF_TIMER_GUARD(find_table_nanos); + PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); Status s; uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); diff --git a/env/env_posix.cc b/env/env_posix.cc index c2e456a66..b64a13bb4 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -843,6 +843,15 @@ class PosixEnv : public Env { #endif } + virtual uint64_t NowCPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + virtual void SleepForMicroseconds(int micros) override { usleep(micros); } virtual Status GetHostName(char* name, uint64_t len) override { diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bc439ac1c..07bc3aba3 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -384,6 +384,9 @@ class Env { return NowMicros() * 1000; } + // 0 indicates not supported. + virtual uint64_t NowCPUNanos() { return 0; } + // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 9fec56b8a..d72f8f371 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -205,6 +205,9 @@ struct PerfContext { uint64_t env_lock_file_nanos; uint64_t env_unlock_file_nanos; uint64_t env_new_logger_nanos; + + uint64_t get_cpu_nanos; + std::map* level_to_perf_context; bool per_level_perf_context_enabled; }; diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h index 218c6015f..de0a214d6 100644 --- a/include/rocksdb/perf_level.h +++ b/include/rocksdb/perf_level.h @@ -17,8 +17,11 @@ enum PerfLevel : unsigned char { kEnableCount = 2, // enable only count stats kEnableTimeExceptForMutex = 3, // Other than count stats, also enable time // stats except for mutexes - kEnableTime = 4, // enable count and time stats - kOutOfBounds = 5 // N.B. Must always be the last value! + // Other than time, also measure CPU time counters. Still don't measure + // time (neither wall time nor CPU time) for mutexes. + kEnableTimeAndCPUTimeExceptForMutex = 4, + kEnableTime = 5, // enable count and time stats + kOutOfBounds = 6 // N.B. Must always be the last value! }; // set the perf stats level for current thread diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 6c164cfe9..b82fa4f27 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -114,6 +114,7 @@ void PerfContext::Reset() { env_lock_file_nanos = 0; env_unlock_file_nanos = 0; env_new_logger_nanos = 0; + get_cpu_nanos = 0; if (per_level_perf_context_enabled && level_to_perf_context) { for (auto& kv : *level_to_perf_context) { kv.second.Reset(); @@ -224,6 +225,7 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(env_lock_file_nanos); PERF_CONTEXT_OUTPUT(env_unlock_file_nanos); PERF_CONTEXT_OUTPUT(env_new_logger_nanos); + PERF_CONTEXT_OUTPUT(get_cpu_nanos); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index d67654914..e0ff8afc5 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -41,12 +41,25 @@ extern thread_local PerfContext perf_context; PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ perf_step_timer_##metric.Start(); -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ - ticker_type) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), true, stats, \ - ticker_type); \ - if (condition) { \ - perf_step_timer_##metric.Start(); \ +// Declare and set start time of the timer +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ + perf_step_timer_##metric.Start(); + +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + false, PerfLevel::kEnableTime, stats, \ + ticker_type); \ + if (condition) { \ + perf_step_timer_##metric.Start(); \ } // Update metric with time elapsed since last START. start time is reset diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index 246d6eb75..6501bd54a 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -12,14 +12,15 @@ namespace rocksdb { class PerfStepTimer { public: - explicit PerfStepTimer(uint64_t* metric, bool for_mutex = false, - Statistics* statistics = nullptr, - uint32_t ticker_type = 0) - : perf_counter_enabled_( - perf_level >= PerfLevel::kEnableTime || - (!for_mutex && perf_level >= kEnableTimeExceptForMutex)), - env_((perf_counter_enabled_ || statistics != nullptr) ? Env::Default() - : nullptr), + explicit PerfStepTimer( + uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, + Statistics* statistics = nullptr, uint32_t ticker_type = 0) + : perf_counter_enabled_(perf_level >= enable_level), + use_cpu_time_(use_cpu_time), + env_((perf_counter_enabled_ || statistics != nullptr) + ? ((env != nullptr) ? env : Env::Default()) + : nullptr), start_(0), metric_(metric), statistics_(statistics), @@ -31,13 +32,21 @@ class PerfStepTimer { void Start() { if (perf_counter_enabled_ || statistics_ != nullptr) { - start_ = env_->NowNanos(); + start_ = time_now(); + } + } + + uint64_t time_now() { + if (!use_cpu_time_) { + return env_->NowNanos(); + } else { + return env_->NowCPUNanos(); } } void Measure() { if (start_) { - uint64_t now = env_->NowNanos(); + uint64_t now = time_now(); *metric_ += now - start_; start_ = now; } @@ -45,7 +54,7 @@ class PerfStepTimer { void Stop() { if (start_) { - uint64_t duration = env_->NowNanos() - start_; + uint64_t duration = time_now() - start_; if (perf_counter_enabled_) { *metric_ += duration; } @@ -59,6 +68,7 @@ class PerfStepTimer { private: const bool perf_counter_enabled_; + const bool use_cpu_time_; Env* const env_; uint64_t start_; uint64_t* metric_;