From f6f1533c6f87a1d40547d882b654ea56ccf66b06 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 21 Jul 2014 12:57:29 -0700 Subject: [PATCH] make internal stats independent of statistics Summary: also make it aware of column family output from db_bench ``` ** Compaction Stats [default] ** Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15 L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06 L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00 L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00 L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00 Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22 Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00 Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown ** DB Stats ** Uptime(secs): 202.1 total, 13.5 interval Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written Test Plan: ran it Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19917 --- db/column_family.cc | 4 +- db/db_impl.cc | 77 +++++-- db/db_impl.h | 1 + db/internal_stats.cc | 471 ++++++++++++++++++++++--------------------- db/internal_stats.h | 187 +++++++++++------ db/version_set.h | 1 + 6 files changed, 431 insertions(+), 310 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index ec90872b8..c081431cd 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -222,8 +222,8 @@ ColumnFamilyData::ColumnFamilyData(const std::string& dbname, uint32_t id, // if dummy_versions is nullptr, then this is a dummy column family. if (dummy_versions != nullptr) { - internal_stats_.reset(new InternalStats( - options_.num_levels, db_options->env, db_options->statistics.get())); + internal_stats_.reset( + new InternalStats(options_.num_levels, db_options->env)); table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); if (options_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( diff --git a/db/db_impl.cc b/db/db_impl.cc index 5b506ba15..d008c7699 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -510,9 +510,23 @@ void DBImpl::MaybeDumpStats() { // atomically. We could see more than one dump during one dump // period in rare cases. last_stats_dump_time_microsec_ = now_micros; + + DBPropertyType cf_property_type = GetPropertyType("rocksdb.cfstats"); std::string stats; - GetProperty("rocksdb.stats", &stats); + { + MutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->internal_stats()->GetProperty(cf_property_type, "rocksdb.cfstats", + &stats, cfd); + } + DBPropertyType db_property_type = GetPropertyType("rocksdb.dbstats"); + default_cf_internal_stats_->GetProperty( + db_property_type, "rocksdb.dbstats", &stats, + default_cf_handle_->cfd()); + } + Log(options_.info_log, "------- DUMPING STATS -------"); Log(options_.info_log, "%s", stats.c_str()); + PrintStatistics(); } } @@ -1157,6 +1171,7 @@ Status DBImpl::Recover( SequenceNumber max_sequence(0); default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; @@ -1419,6 +1434,8 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, stats.bytes_written = meta.fd.GetFileSize(); stats.files_out_levelnp1 = 1; cfd->internal_stats()->AddCompactionStats(level, stats); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; @@ -1509,6 +1526,8 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.fd.GetFileSize(); cfd->internal_stats()->AddCompactionStats(level, stats); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; @@ -1979,9 +1998,8 @@ void DBImpl::BackgroundCallFlush() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. - uint64_t error_cnt = default_cf_handle_->cfd() - ->internal_stats() - ->BumpAndGetBackgroundErrorCount(); + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); Log(options_.info_log, @@ -2047,9 +2065,8 @@ void DBImpl::BackgroundCallCompaction() { // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of // the problem. - uint64_t error_cnt = default_cf_handle_->cfd() - ->internal_stats() - ->BumpAndGetBackgroundErrorCount(); + uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); @@ -3146,8 +3163,8 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, LogFlush(options_.info_log); mutex_.Lock(); - cfd->internal_stats()->AddCompactionStats(compact->compaction->output_level(), - stats); + cfd->internal_stats()->AddCompactionStats( + compact->compaction->output_level(), stats); // if there were any unused file number (mostly in case of // compaction error), free up the entry from pending_putputs @@ -3774,9 +3791,13 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { if (!options.disableWAL) { RecordTick(options_.statistics.get(), WRITE_WITH_WAL, 1); + default_cf_internal_stats_->AddDBStats( + InternalStats::WRITE_WITH_WAL, 1); } if (w.done) { + default_cf_internal_stats_->AddDBStats( + InternalStats::WRITE_DONE_BY_OTHER, 1); mutex_.Unlock(); RecordTick(options_.statistics.get(), WRITE_DONE_BY_OTHER, 1); return w.status; @@ -3807,6 +3828,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { return Status::TimedOut(); } else { RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1); + default_cf_internal_stats_->AddDBStats( + InternalStats::WRITE_DONE_BY_SELF, 1); } // Once reaches this point, the current writer "w" will try to do its write @@ -3892,17 +3915,19 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); last_sequence += my_batch_count; + const uint64_t batch_size = WriteBatchInternal::ByteSize(updates); // Record statistics RecordTick(options_.statistics.get(), NUMBER_KEYS_WRITTEN, my_batch_count); RecordTick(options_.statistics.get(), BYTES_WRITTEN, - WriteBatchInternal::ByteSize(updates)); + batch_size); if (options.disableWAL) { flush_on_destroy_ = true; } PERF_TIMER_STOP(write_pre_and_post_process_time); + uint64_t log_size = 0; if (!options.disableWAL) { PERF_TIMER_START(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); @@ -3910,8 +3935,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { total_log_size_ += log_entry.size(); alive_log_files_.back().AddSize(log_entry.size()); log_empty_ = false; + log_size = log_entry.size(); RecordTick(options_.statistics.get(), WAL_FILE_SYNCED, 1); - RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_entry.size()); + RecordTick(options_.statistics.get(), WAL_FILE_BYTES, log_size); if (status.ok() && options.sync) { if (options_.use_fsync) { StopWatch(env_, options_.statistics.get(), WAL_FILE_SYNC_MICROS); @@ -3942,8 +3968,19 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { last_sequence); } PERF_TIMER_START(write_pre_and_post_process_time); - if (updates == &tmp_batch_) tmp_batch_.Clear(); + if (updates == &tmp_batch_) { + tmp_batch_.Clear(); + } mutex_.Lock(); + // internal stats + default_cf_internal_stats_->AddDBStats( + InternalStats::BYTES_WRITTEN, batch_size); + if (!options.disableWAL) { + default_cf_internal_stats_->AddDBStats( + InternalStats::WAL_FILE_SYNCED, 1); + default_cf_internal_stats_->AddDBStats( + InternalStats::WAL_FILE_BYTES, log_size); + } if (status.ok()) { versions_->SetLastSequence(last_sequence); } @@ -4126,10 +4163,10 @@ Status DBImpl::MakeRoomForWrite( delayed = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); - cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_SLOWDOWN, - delayed); allow_delay = false; // Do not delay a single write more than once mutex_.Lock(); + cfd->internal_stats()->AddCFStats( + InternalStats::LEVEL0_SLOWDOWN, delayed); delayed_writes_++; } else if (!force && !cfd->mem()->ShouldFlush()) { // There is room in current memtable @@ -4160,7 +4197,7 @@ Status DBImpl::MakeRoomForWrite( } RecordTick(options_.statistics.get(), STALL_MEMTABLE_COMPACTION_MICROS, stall); - cfd->internal_stats()->RecordWriteStall( + cfd->internal_stats()->AddCFStats( InternalStats::MEMTABLE_COMPACTION, stall); } else if (cfd->NeedWaitForNumLevel0Files()) { DelayLoggingAndReset(); @@ -4179,11 +4216,11 @@ Status DBImpl::MakeRoomForWrite( } RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall); - cfd->internal_stats()->RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, - stall); + cfd->internal_stats()->AddCFStats( + InternalStats::LEVEL0_NUM_FILES, stall); } else if (allow_hard_rate_limit_delay && cfd->ExceedsHardRateLimit()) { // Delay a write when the compaction score for any level is too large. - int max_level = cfd->current()->MaxCompactionScoreLevel(); + const int max_level = cfd->current()->MaxCompactionScoreLevel(); score = cfd->current()->MaxCompactionScore(); mutex_.Unlock(); uint64_t delayed; @@ -4193,7 +4230,6 @@ Status DBImpl::MakeRoomForWrite( env_->SleepForMicroseconds(1000); delayed = sw.ElapsedMicros(); } - cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed); // Make sure the following value doesn't round to zero. uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); rate_limit_delay_millis += rate_limit; @@ -4205,7 +4241,9 @@ Status DBImpl::MakeRoomForWrite( allow_hard_rate_limit_delay = false; } mutex_.Lock(); + cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed, false); } else if (allow_soft_rate_limit_delay && cfd->ExceedsSoftRateLimit()) { + const int max_level = cfd->current()->MaxCompactionScoreLevel(); score = cfd->current()->MaxCompactionScore(); // Delay a write when the compaction score for any level is too large. // TODO: add statistics @@ -4221,6 +4259,7 @@ Status DBImpl::MakeRoomForWrite( } allow_soft_rate_limit_delay = false; mutex_.Lock(); + cfd->internal_stats()->RecordLevelNSlowdown(max_level, slowdown, true); } else { unique_ptr lfile; log::Writer* new_log = nullptr; diff --git a/db/db_impl.h b/db/db_impl.h index 08a4f6660..9e9d10817 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -475,6 +475,7 @@ class DBImpl : public DB { unique_ptr log_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; + InternalStats* default_cf_internal_stats_; unique_ptr column_family_memtables_; struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index eac3625ed..74b4f9e63 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -7,10 +7,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/internal_stats.h" - #define __STDC_FORMAT_MACROS #include #include +#include "db/column_family.h" #include "db/column_family.h" @@ -20,33 +20,30 @@ namespace { const double kMB = 1048576.0; const double kGB = kMB * 1024; -void PrintLevelStatsHeader(char* buf, size_t len) { +void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { snprintf( buf, len, - "\n** Compaction Stats **\n" - "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " + "\n** Compaction Stats [%s] **\n" + "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) " "Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) " "Stall(sec) Stall(cnt) Avg(ms)\n" "--------------------------------------------------------------------" "--------------------------------------------------------------------" - "--------------------------------------------------------------------\n"); + "--------------------------------------------------------------------\n", + cf_name.c_str()); } void PrintLevelStats(char* buf, size_t len, const std::string& name, - int num_files, double total_file_size, double score, - double stall_us, uint64_t stalls, + int num_files, int being_compacted, double total_file_size, double score, + double rw_amp, double w_amp, double stall_us, uint64_t stalls, const InternalStats::CompactionStats& stats) { - int64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1; - int64_t bytes_new = stats.bytes_written - stats.bytes_readnp1; - double rw_amp = (stats.bytes_readn== 0) ? 0.0 - : (stats.bytes_written + bytes_read) / (double)stats.bytes_readn; - double w_amp = (stats.bytes_readn == 0) ? 0.0 - : stats.bytes_written / (double)stats.bytes_readn; + uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1; + uint64_t bytes_new = stats.bytes_written - stats.bytes_readnp1; double elapsed = (stats.micros + 1) / 1000000.0; snprintf(buf, len, - "%4s %5d %8.0f %5.1f " /* Level, Files, Size(MB), Score */ + "%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */ "%8.1f " /* Read(GB) */ "%7.1f " /* Rn(GB) */ "%8.1f " /* Rnp1(GB) */ @@ -63,10 +60,10 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%10.0f " /* Comp(sec) */ "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ - "%10.2f " /* Sta(sec) */ - "%10" PRIu64 " " /* Sta(cnt) */ + "%10.2f " /* Stall(sec) */ + "%10" PRIu64 " " /* Stall(cnt) */ "%7.2f\n" /* Avg(ms) */, - name.c_str(), num_files, total_file_size / kMB, score, + name.c_str(), num_files, being_compacted, total_file_size / kMB, score, bytes_read / kGB, stats.bytes_readn / kGB, stats.bytes_readnp1 / kGB, @@ -88,6 +85,7 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, stalls == 0 ? 0 : stall_us / 1000.0 / stalls); } + } DBPropertyType GetPropertyType(const Slice& property) { @@ -102,6 +100,10 @@ DBPropertyType GetPropertyType(const Slice& property) { return kLevelStats; } else if (in == "stats") { return kStats; + } else if (in == "cfstats") { + return kCFStats; + } else if (in == "dbstats") { + return kDBStats; } else if (in == "sstables") { return kSsTables; } else if (in == "num-immutable-mem-table") { @@ -159,223 +161,20 @@ bool InternalStats::GetProperty(DBPropertyType property_type, return true; } case kStats: { - char buf[1000]; - - uint64_t wal_bytes = 0; - uint64_t wal_synced = 0; - uint64_t user_bytes_written = 0; - uint64_t write_other = 0; - uint64_t write_self = 0; - uint64_t write_with_wal = 0; - uint64_t micros_up = env_->NowMicros() - started_at_; - // Add "+1" to make sure seconds_up is > 0 and avoid NaN later - double seconds_up = (micros_up + 1) / 1000000.0; - uint64_t total_slowdown = 0; - uint64_t total_slowdown_count = 0; - uint64_t interval_bytes_written = 0; - uint64_t interval_bytes_read = 0; - uint64_t interval_bytes_new = 0; - double interval_seconds_up = 0; - - if (statistics_) { - wal_bytes = statistics_->getTickerCount(WAL_FILE_BYTES); - wal_synced = statistics_->getTickerCount(WAL_FILE_SYNCED); - user_bytes_written = statistics_->getTickerCount(BYTES_WRITTEN); - write_other = statistics_->getTickerCount(WRITE_DONE_BY_OTHER); - write_self = statistics_->getTickerCount(WRITE_DONE_BY_SELF); - write_with_wal = statistics_->getTickerCount(WRITE_WITH_WAL); + if (!GetProperty(kCFStats, "rocksdb.cfstats", value, cfd)) { + return false; } - - PrintLevelStatsHeader(buf, sizeof(buf)); - value->append(buf); - - CompactionStats stats_total(0); - int total_files = 0; - double total_file_size = 0; - uint64_t total_stalls = 0; - double total_stall_us = 0; - int level_printed = 0; - for (int level = 0; level < number_levels_; level++) { - int files = current->NumLevelFiles(level); - total_files += files; - if (stats_[level].micros > 0 || files > 0) { - ++level_printed; - uint64_t stalls = level == 0 ? (stall_counts_[LEVEL0_SLOWDOWN] + - stall_counts_[LEVEL0_NUM_FILES] + - stall_counts_[MEMTABLE_COMPACTION]) - : stall_leveln_slowdown_count_[level]; - - double stall_us = level == 0 ? (stall_micros_[LEVEL0_SLOWDOWN] + - stall_micros_[LEVEL0_NUM_FILES] + - stall_micros_[MEMTABLE_COMPACTION]) - : stall_leveln_slowdown_[level]; - - stats_total.Add(stats_[level]); - total_file_size += current->NumLevelBytes(level); - total_stall_us += stall_us; - total_stalls += stalls; - total_slowdown += stall_leveln_slowdown_[level]; - total_slowdown_count += stall_leveln_slowdown_count_[level]; - double score = current->NumLevelBytes(level) / - cfd->compaction_picker()->MaxBytesForLevel(level); - PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), files, - current->NumLevelBytes(level), score, stall_us, stalls, - stats_[level]); - value->append(buf); - } + if (!GetProperty(kDBStats, "rocksdb.dbstats", value, cfd)) { + return false; } - // Stats summary across levels - if (level_printed > 1) { - PrintLevelStats(buf, sizeof(buf), "Sum", total_files, total_file_size, - 0, total_stall_us, total_stalls, stats_total); - value->append(buf); - } - - uint64_t total_bytes_read = - stats_total.bytes_readn + stats_total.bytes_readnp1; - uint64_t total_bytes_written = stats_total.bytes_written; - - interval_bytes_new = user_bytes_written - last_stats_.ingest_bytes_; - interval_bytes_read = - total_bytes_read - last_stats_.compaction_bytes_read_; - interval_bytes_written = - total_bytes_written - last_stats_.compaction_bytes_written_; - interval_seconds_up = seconds_up - last_stats_.seconds_up_; - - snprintf(buf, sizeof(buf), "\nUptime(secs): %.1f total, %.1f interval\n", - seconds_up, interval_seconds_up); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Writes cumulative: %llu total, %llu batches, " - "%.1f per batch, %.2f ingest GB\n", - (unsigned long long)(write_other + write_self), - (unsigned long long)write_self, - (write_other + write_self) / (double)(write_self + 1), - user_bytes_written / kGB); - value->append(buf); - - snprintf(buf, sizeof(buf), - "WAL cumulative: %llu WAL writes, %llu WAL syncs, " - "%.2f writes per sync, %.2f GB written\n", - (unsigned long long)write_with_wal, - (unsigned long long)wal_synced, - write_with_wal / (double)(wal_synced + 1), - wal_bytes / kGB); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Compaction IO cumulative (GB): " - "%.2f new, %.2f read, %.2f write, %.2f read+write\n", - user_bytes_written / kGB, - total_bytes_read / kGB, - total_bytes_written / kGB, - (total_bytes_read + total_bytes_written) / kGB); - value->append(buf); - - snprintf( - buf, sizeof(buf), - "Compaction IO cumulative (MB/sec): " - "%.1f new, %.1f read, %.1f write, %.1f read+write\n", - user_bytes_written / kMB / seconds_up, - total_bytes_read / kMB / seconds_up, - total_bytes_written / kMB / seconds_up, - (total_bytes_read + total_bytes_written) / kMB / seconds_up); - value->append(buf); - - // +1 to avoid divide by 0 and NaN - snprintf( - buf, sizeof(buf), - "Amplification cumulative: %.1f write, %.1f compaction\n", - (double)(total_bytes_written + wal_bytes) / (user_bytes_written + 1), - (double)(total_bytes_written + total_bytes_read + wal_bytes) / - (user_bytes_written + 1)); - value->append(buf); - - uint64_t interval_write_other = write_other - last_stats_.write_other_; - uint64_t interval_write_self = write_self - last_stats_.write_self_; - - snprintf(buf, sizeof(buf), - "Writes interval: %llu total, %llu batches, " - "%.1f per batch, %.1f ingest MB\n", - (unsigned long long)(interval_write_other + interval_write_self), - (unsigned long long)interval_write_self, - (double)(interval_write_other + interval_write_self) / - (interval_write_self + 1), - (user_bytes_written - last_stats_.ingest_bytes_) / kMB); - value->append(buf); - - uint64_t interval_write_with_wal = - write_with_wal - last_stats_.write_with_wal_; - - uint64_t interval_wal_synced = wal_synced - last_stats_.wal_synced_; - uint64_t interval_wal_bytes = wal_bytes - last_stats_.wal_bytes_; - - snprintf(buf, sizeof(buf), - "WAL interval: %llu WAL writes, %llu WAL syncs, " - "%.2f writes per sync, %.2f MB written\n", - (unsigned long long)interval_write_with_wal, - (unsigned long long)interval_wal_synced, - interval_write_with_wal / (double)(interval_wal_synced + 1), - interval_wal_bytes / kGB); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Compaction IO interval (MB): " - "%.2f new, %.2f read, %.2f write, %.2f read+write\n", - interval_bytes_new / kMB, interval_bytes_read / kMB, - interval_bytes_written / kMB, - (interval_bytes_read + interval_bytes_written) / kMB); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Compaction IO interval (MB/sec): " - "%.1f new, %.1f read, %.1f write, %.1f read+write\n", - interval_bytes_new / kMB / interval_seconds_up, - interval_bytes_read / kMB / interval_seconds_up, - interval_bytes_written / kMB / interval_seconds_up, - (interval_bytes_read + interval_bytes_written) / kMB / - interval_seconds_up); - value->append(buf); - - // +1 to avoid divide by 0 and NaN - snprintf( - buf, sizeof(buf), - "Amplification interval: %.1f write, %.1f compaction\n", - (double)(interval_bytes_written + interval_wal_bytes) / - (interval_bytes_new + 1), - (double)(interval_bytes_written + interval_bytes_read + interval_wal_bytes) / - (interval_bytes_new + 1)); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, " - "%.3f memtable_compaction, %.3f leveln_slowdown\n", - stall_micros_[LEVEL0_SLOWDOWN] / 1000000.0, - stall_micros_[LEVEL0_NUM_FILES] / 1000000.0, - stall_micros_[MEMTABLE_COMPACTION] / 1000000.0, - total_slowdown / 1000000.0); - value->append(buf); - - snprintf(buf, sizeof(buf), - "Stalls(count): %lu level0_slowdown, %lu level0_numfiles, " - "%lu memtable_compaction, %lu leveln_slowdown\n", - (unsigned long)stall_counts_[LEVEL0_SLOWDOWN], - (unsigned long)stall_counts_[LEVEL0_NUM_FILES], - (unsigned long)stall_counts_[MEMTABLE_COMPACTION], - (unsigned long)total_slowdown_count); - value->append(buf); - - last_stats_.compaction_bytes_read_ = total_bytes_read; - last_stats_.compaction_bytes_written_ = total_bytes_written; - last_stats_.ingest_bytes_ = user_bytes_written; - last_stats_.seconds_up_ = seconds_up; - last_stats_.wal_bytes_ = wal_bytes; - last_stats_.wal_synced_ = wal_synced; - last_stats_.write_with_wal_ = write_with_wal; - last_stats_.write_other_ = write_other; - last_stats_.write_self_ = write_self; - + return true; + } + case kCFStats: { + DumpCFStats(value, cfd); + return true; + } + case kDBStats: { + DumpDBStats(value); return true; } case kSsTables: @@ -414,4 +213,212 @@ bool InternalStats::GetProperty(DBPropertyType property_type, } } +void InternalStats::DumpDBStats(std::string* value) { + char buf[1000]; + // DB-level stats, only available from default column family + double seconds_up = (env_->NowMicros() - started_at_ + 1) / 1000000.0; + double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; + snprintf(buf, sizeof(buf), + "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", + seconds_up, interval_seconds_up); + value->append(buf); + // Cumulative + uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN]; + uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER]; + uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF]; + uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; + uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; + uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; + // Data + snprintf(buf, sizeof(buf), + "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, " + "%.1f writes per batch, %.2f GB user ingest\n", + write_other + write_self, write_self, + (write_other + write_self) / static_cast(write_self + 1), + user_bytes_written / kGB); + value->append(buf); + // WAL + snprintf(buf, sizeof(buf), + "Cumulative WAL: %" PRIu64 " writes, %" PRIu64 " syncs, " + "%.2f writes per sync, %.2f GB written\n", + write_with_wal, wal_synced, + write_with_wal / static_cast(wal_synced + 1), + wal_bytes / kGB); + value->append(buf); + + // Interval + uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other; + uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self; + snprintf(buf, sizeof(buf), + "Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, " + "%.1f writes per batch, %.1f MB user ingest\n", + interval_write_other + interval_write_self, + interval_write_self, + static_cast(interval_write_other + interval_write_self) / + (interval_write_self + 1), + (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB); + value->append(buf); + + uint64_t interval_write_with_wal = + write_with_wal - db_stats_snapshot_.write_with_wal; + uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced; + uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes; + + snprintf(buf, sizeof(buf), + "Interval WAL: %" PRIu64 " writes, %" PRIu64 " syncs, " + "%.2f writes per sync, %.2f MB written\n", + interval_write_with_wal, + interval_wal_synced, + interval_write_with_wal / + static_cast(interval_wal_synced + 1), + interval_wal_bytes / kGB); + value->append(buf); + + db_stats_snapshot_.seconds_up = seconds_up; + db_stats_snapshot_.ingest_bytes = user_bytes_written; + db_stats_snapshot_.write_other = write_other; + db_stats_snapshot_.write_self = write_self; + db_stats_snapshot_.wal_bytes = wal_bytes; + db_stats_snapshot_.wal_synced = wal_synced; + db_stats_snapshot_.write_with_wal = write_with_wal; +} + +void InternalStats::DumpCFStats(std::string* value, ColumnFamilyData* cfd) { + Version* current = cfd->current(); + + int num_levels_to_check = + (cfd->options()->compaction_style != kCompactionStyleUniversal && + cfd->options()->compaction_style != kCompactionStyleFIFO) + ? current->NumberLevels() - 1 + : 1; + // Compaction scores are sorted base on its value. Restore them to the + // level order + std::vector compaction_score(number_levels_, 0); + for (int i = 0; i < num_levels_to_check; ++i) { + compaction_score[current->compaction_level_[i]] = + current->compaction_score_[i]; + } + // Count # of files being compacted for each level + std::vector files_being_compacted(number_levels_, 0); + for (int level = 0; level < num_levels_to_check; ++level) { + for (auto* f : current->files_[level]) { + if (f->being_compacted) { + ++files_being_compacted[level]; + } + } + } + + char buf[1000]; + // Per-ColumnFamily stats + PrintLevelStatsHeader(buf, sizeof(buf), cfd->GetName()); + value->append(buf); + + CompactionStats stats_sum(0); + int total_files = 0; + int total_files_being_compacted = 0; + double total_file_size = 0; + uint64_t total_slowdown_soft = 0; + uint64_t total_slowdown_count_soft = 0; + uint64_t total_slowdown_hard = 0; + uint64_t total_slowdown_count_hard = 0; + uint64_t total_stall_count = 0; + double total_stall_us = 0; + for (int level = 0; level < number_levels_; level++) { + int files = current->NumLevelFiles(level); + total_files += files; + total_files_being_compacted += files_being_compacted[level]; + if (comp_stats_[level].micros > 0 || files > 0) { + uint64_t stalls = level == 0 ? + (cf_stats_count_[LEVEL0_SLOWDOWN] + + cf_stats_count_[LEVEL0_NUM_FILES] + + cf_stats_count_[MEMTABLE_COMPACTION]) + : (stall_leveln_slowdown_count_soft_[level] + + stall_leveln_slowdown_count_hard_[level]); + + double stall_us = level == 0 ? + (cf_stats_value_[LEVEL0_SLOWDOWN] + + cf_stats_value_[LEVEL0_NUM_FILES] + + cf_stats_value_[MEMTABLE_COMPACTION]) + : (stall_leveln_slowdown_soft_[level] + + stall_leveln_slowdown_hard_[level]); + + stats_sum.Add(comp_stats_[level]); + total_file_size += current->NumLevelBytes(level); + total_stall_us += stall_us; + total_stall_count += stalls; + total_slowdown_soft += stall_leveln_slowdown_soft_[level]; + total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level]; + total_slowdown_hard += stall_leveln_slowdown_hard_[level]; + total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level]; + int64_t bytes_read = comp_stats_[level].bytes_readn + + comp_stats_[level].bytes_readnp1; + double rw_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 + : (comp_stats_[level].bytes_written + bytes_read) / + static_cast(comp_stats_[level].bytes_readn); + double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 + : comp_stats_[level].bytes_written / + static_cast(comp_stats_[level].bytes_readn); + PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), + files, files_being_compacted[level], current->NumLevelBytes(level), + compaction_score[level], rw_amp, w_amp, stall_us, stalls, + comp_stats_[level]); + value->append(buf); + } + } + uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED]; + // Cumulative summary + double rw_amp = (stats_sum.bytes_written + stats_sum.bytes_readn + + stats_sum.bytes_readnp1) / static_cast(curr_ingest + 1); + double w_amp = stats_sum.bytes_written / static_cast(curr_ingest + 1); + // Stats summary across levels + PrintLevelStats(buf, sizeof(buf), "Sum", total_files, + total_files_being_compacted, total_file_size, 0, rw_amp, w_amp, + total_stall_us, total_stall_count, stats_sum); + value->append(buf); + // Interval summary + uint64_t interval_ingest = + curr_ingest - cf_stats_snapshot_.ingest_bytes + 1; + CompactionStats interval_stats(stats_sum); + interval_stats.Subtract(cf_stats_snapshot_.comp_stats); + rw_amp = (interval_stats.bytes_written + + interval_stats.bytes_readn + interval_stats.bytes_readnp1) / + static_cast(interval_ingest); + w_amp = interval_stats.bytes_written / static_cast(interval_ingest); + PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, + rw_amp, w_amp, total_stall_us - cf_stats_snapshot_.stall_us, + total_stall_count - cf_stats_snapshot_.stall_count, interval_stats); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Flush(GB): accumulative %.3f, interval %.3f\n", + curr_ingest / kGB, interval_ingest / kGB); + value->append(buf); + snprintf(buf, sizeof(buf), + "Stalls(secs): %.3f level0_slowdown, %.3f level0_numfiles, " + "%.3f memtable_compaction, %.3f leveln_slowdown_soft, " + "%.3f leveln_slowdown_hard\n", + cf_stats_value_[LEVEL0_SLOWDOWN] / 1000000.0, + cf_stats_value_[LEVEL0_NUM_FILES] / 1000000.0, + cf_stats_value_[MEMTABLE_COMPACTION] / 1000000.0, + total_slowdown_soft / 1000000.0, + total_slowdown_hard / 1000000.0); + value->append(buf); + + snprintf(buf, sizeof(buf), + "Stalls(count): %" PRIu64 " level0_slowdown, " + "%" PRIu64 " level0_numfiles, %" PRIu64 " memtable_compaction, " + "%" PRIu64 " leveln_slowdown_soft, " + "%" PRIu64 " leveln_slowdown_hard\n", + cf_stats_count_[LEVEL0_SLOWDOWN], + cf_stats_count_[LEVEL0_NUM_FILES], + cf_stats_count_[MEMTABLE_COMPACTION], + total_slowdown_count_soft, total_slowdown_count_hard); + value->append(buf); + + cf_stats_snapshot_.ingest_bytes = curr_ingest; + cf_stats_snapshot_.comp_stats = stats_sum; + cf_stats_snapshot_.stall_us = total_stall_us; + cf_stats_snapshot_.stall_count = total_stall_count; +} + } // namespace rocksdb diff --git a/db/internal_stats.h b/db/internal_stats.h index eae436473..28d84364c 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -9,8 +9,6 @@ // #pragma once -#include "rocksdb/statistics.h" -#include "util/statistics.h" #include "db/version_set.h" #include @@ -26,7 +24,9 @@ class DBImpl; enum DBPropertyType { kNumFilesAtLevel, // Number of files at a specific level kLevelStats, // Return number of files and total sizes of each level - kStats, // Return general statitistics of DB + kCFStats, // Return general statitistics of CF + kDBStats, // Return general statitistics of DB + kStats, // Return general statitistics of both DB and CF kSsTables, // Return a human readable string of current SST files kNumImmutableMemTable, // Return number of immutable mem tables kMemtableFlushPending, // Return 1 if mem table flushing is pending, @@ -45,38 +45,66 @@ extern DBPropertyType GetPropertyType(const Slice& property); class InternalStats { public: - enum WriteStallType { + enum InternalCFStatsType { LEVEL0_SLOWDOWN, MEMTABLE_COMPACTION, LEVEL0_NUM_FILES, WRITE_STALLS_ENUM_MAX, + BYTES_FLUSHED, + INTERNAL_CF_STATS_ENUM_MAX, }; - InternalStats(int num_levels, Env* env, Statistics* statistics) - : stats_(num_levels), - stall_micros_(WRITE_STALLS_ENUM_MAX, 0), - stall_counts_(WRITE_STALLS_ENUM_MAX, 0), - stall_leveln_slowdown_(num_levels, 0), - stall_leveln_slowdown_count_(num_levels, 0), + enum InternalDBStatsType { + WAL_FILE_BYTES, + WAL_FILE_SYNCED, + BYTES_WRITTEN, + WRITE_DONE_BY_OTHER, + WRITE_DONE_BY_SELF, + WRITE_WITH_WAL, + INTERNAL_DB_STATS_ENUM_MAX, + }; + + InternalStats(int num_levels, Env* env) + : db_stats_(INTERNAL_DB_STATS_ENUM_MAX), + cf_stats_value_(INTERNAL_CF_STATS_ENUM_MAX), + cf_stats_count_(INTERNAL_CF_STATS_ENUM_MAX), + comp_stats_(num_levels), + stall_leveln_slowdown_hard_(num_levels), + stall_leveln_slowdown_count_hard_(num_levels), + stall_leveln_slowdown_soft_(num_levels), + stall_leveln_slowdown_count_soft_(num_levels), bg_error_count_(0), number_levels_(num_levels), - statistics_(statistics), env_(env), - started_at_(env->NowMicros()) {} + started_at_(env->NowMicros()) { + for (int i = 0; i< INTERNAL_DB_STATS_ENUM_MAX; ++i) { + db_stats_[i] = 0; + } + for (int i = 0; i< INTERNAL_CF_STATS_ENUM_MAX; ++i) { + cf_stats_value_[i] = 0; + cf_stats_count_[i] = 0; + } + for (int i = 0; i < num_levels; ++i) { + stall_leveln_slowdown_hard_[i] = 0; + stall_leveln_slowdown_count_hard_[i] = 0; + stall_leveln_slowdown_soft_[i] = 0; + stall_leveln_slowdown_count_soft_[i] = 0; + } + } - // Per level compaction stats. stats_[level] stores the stats for + // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". struct CompactionStats { uint64_t micros; // Bytes read from level N during compaction between levels N and N+1 - int64_t bytes_readn; + uint64_t bytes_readn; // Bytes read from level N+1 during compaction between levels N and N+1 - int64_t bytes_readnp1; + uint64_t bytes_readnp1; // Total bytes written during compaction between levels N and N+1 - int64_t bytes_written; + uint64_t bytes_written; // Files read from level N during compaction between levels N and N+1 int files_in_leveln; @@ -100,6 +128,16 @@ class InternalStats { files_out_levelnp1(0), count(count) {} + explicit CompactionStats(const CompactionStats& c) + : micros(c.micros), + bytes_readn(c.bytes_readn), + bytes_readnp1(c.bytes_readnp1), + bytes_written(c.bytes_written), + files_in_leveln(c.files_in_leveln), + files_in_levelnp1(c.files_in_levelnp1), + files_out_levelnp1(c.files_out_levelnp1), + count(c.count) {} + void Add(const CompactionStats& c) { this->micros += c.micros; this->bytes_readn += c.bytes_readn; @@ -110,20 +148,40 @@ class InternalStats { this->files_out_levelnp1 += c.files_out_levelnp1; this->count += c.count; } + + void Subtract(const CompactionStats& c) { + this->micros -= c.micros; + this->bytes_readn -= c.bytes_readn; + this->bytes_readnp1 -= c.bytes_readnp1; + this->bytes_written -= c.bytes_written; + this->files_in_leveln -= c.files_in_leveln; + this->files_in_levelnp1 -= c.files_in_levelnp1; + this->files_out_levelnp1 -= c.files_out_levelnp1; + this->count -= c.count; + } }; void AddCompactionStats(int level, const CompactionStats& stats) { - stats_[level].Add(stats); + comp_stats_[level].Add(stats); } - void RecordWriteStall(WriteStallType write_stall_type, uint64_t micros) { - stall_micros_[write_stall_type] += micros; - ++stall_counts_[write_stall_type]; + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) { + if (soft) { + stall_leveln_slowdown_soft_[level] += micros; + ++stall_leveln_slowdown_count_soft_[level]; + } else { + stall_leveln_slowdown_hard_[level] += micros; + ++stall_leveln_slowdown_count_hard_[level]; + } } - void RecordLevelNSlowdown(int level, uint64_t micros) { - stall_leveln_slowdown_[level] += micros; - ++stall_leveln_slowdown_count_[level]; + void AddCFStats(InternalCFStatsType type, uint64_t value) { + cf_stats_value_[type] += value; + ++cf_stats_count_[type]; + } + + void AddDBStats(InternalDBStatsType type, uint64_t value) { + db_stats_[type] += value; } uint64_t GetBackgroundErrorCount() const { return bg_error_count_; } @@ -134,42 +192,58 @@ class InternalStats { std::string* value, ColumnFamilyData* cfd); private: - std::vector stats_; + void DumpDBStats(std::string* value); + void DumpCFStats(std::string* value, ColumnFamilyData* cfd); + + // Per-DB stats + std::vector db_stats_; + // Per-ColumnFamily stats + std::vector cf_stats_value_; + std::vector cf_stats_count_; + // Per-ColumnFamily/level compaction stats + std::vector comp_stats_; + // These count the number of microseconds for which MakeRoomForWrite stalls. + std::vector stall_leveln_slowdown_hard_; + std::vector stall_leveln_slowdown_count_hard_; + std::vector stall_leveln_slowdown_soft_; + std::vector stall_leveln_slowdown_count_soft_; // Used to compute per-interval statistics - struct StatsSnapshot { - uint64_t compaction_bytes_read_; // Bytes read by compaction - uint64_t compaction_bytes_written_; // Bytes written by compaction - uint64_t ingest_bytes_; // Bytes written by user - uint64_t wal_bytes_; // Bytes written to WAL - uint64_t wal_synced_; // Number of times WAL is synced - uint64_t write_with_wal_; // Number of writes that request WAL + struct CFStatsSnapshot { + // ColumnFamily-level stats + CompactionStats comp_stats; + uint64_t ingest_bytes; // Bytes written to L0 + uint64_t stall_us; // Stall time in micro-seconds + uint64_t stall_count; // Stall count + + CFStatsSnapshot() + : comp_stats(0), + ingest_bytes(0), + stall_us(0), + stall_count(0) {} + } cf_stats_snapshot_; + + struct DBStatsSnapshot { + // DB-level stats + uint64_t ingest_bytes; // Bytes written by user + uint64_t wal_bytes; // Bytes written to WAL + uint64_t wal_synced; // Number of times WAL is synced + uint64_t write_with_wal; // Number of writes that request WAL // These count the number of writes processed by the calling thread or // another thread. - uint64_t write_other_; - uint64_t write_self_; - double seconds_up_; + uint64_t write_other; + uint64_t write_self; + double seconds_up; - StatsSnapshot() - : compaction_bytes_read_(0), - compaction_bytes_written_(0), - ingest_bytes_(0), - wal_bytes_(0), - wal_synced_(0), - write_with_wal_(0), - write_other_(0), - write_self_(0), - seconds_up_(0) {} - }; - - // Counters from the previous time per-interval stats were computed - StatsSnapshot last_stats_; - - // These count the number of microseconds for which MakeRoomForWrite stalls. - std::vector stall_micros_; - std::vector stall_counts_; - std::vector stall_leveln_slowdown_; - std::vector stall_leveln_slowdown_count_; + DBStatsSnapshot() + : ingest_bytes(0), + wal_bytes(0), + wal_synced(0), + write_with_wal(0), + write_other(0), + write_self(0), + seconds_up(0) {} + } db_stats_snapshot_; // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. The failure can @@ -178,10 +252,9 @@ class InternalStats { // or compaction will cause the counter to increase too. uint64_t bg_error_count_; - int number_levels_; - Statistics* statistics_; + const int number_levels_; Env* env_; - uint64_t started_at_; + const uint64_t started_at_; }; } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 2ac2ed0e3..c73a1d60f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -248,6 +248,7 @@ class Version { friend class UniversalCompactionPicker; friend class FIFOCompactionPicker; friend class ForwardIterator; + friend class InternalStats; class LevelFileNumIterator; class LevelFileIteratorState;