Fix a major performance bug in 6.21 for cache entry stats (#8369)

Summary:
In final polishing of https://github.com/facebook/rocksdb/issues/8297 (after most manual testing), I
broke my own caching layer by sanitizing an input parameter with
std::min(0, x) instead of std::max(0, x). I resisted unit testing the
timing part of the result caching because historically, these tests
are either flaky or difficult to write, and this was not a correctness
issue. This bug is essentially unnoticeable with a small number
of column families but can explode background work with a
large number of column families.
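
To make the slip concrete, here is a minimal standalone snippet contrasting the
buggy and fixed sanitization from the diff below (the 180-second input is just
an arbitrary example, not a value from the codebase):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  int maximum_age_in_seconds = 180;  // arbitrary caller-supplied value

  // Buggy: clamps any non-negative input down to 0, so the computed TTL is
  // always 0 and every stats request triggers a fresh scan of the block cache.
  uint64_t buggy =
      static_cast<uint64_t>(std::min(maximum_age_in_seconds, 0)) * 1000000U;

  // Fixed: enforces a floor of 1 second instead.
  uint64_t fixed =
      static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;

  std::cout << "buggy max_age_micros: " << buggy << "\n";  // prints 0
  std::cout << "fixed max_age_micros: " << fixed << "\n";  // prints 180000000
  return 0;
}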

This change fixes the logical error, removes an unnecessary related
optimization, and adds mock time/sleeps to the unit test to ensure we
get a cache hit within the age limit.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8369

Test Plan: added time testing logic to existing unit test

Reviewed By: ajkr

Differential Revision: D28950892

Pulled By: pdillinger

fbshipit-source-id: e79cd4ff3eec68fd0119d994f1ed468c38026c3b
Author: Peter Dillinger (2021-06-08 05:02:29 -07:00), committed by Facebook GitHub Bot
Parent: 80a59a03a7
Commit: 2f93a3b809
3 changed files with 18 additions and 9 deletions

cache/cache_entry_stats.h

@@ -56,15 +56,11 @@ class CacheEntryStatsCollector {
     // Waits for any pending reader or writer (collector)
     std::lock_guard<std::mutex> lock(mutex_);
-    // Maximum allowed age is nominally given by the parameter
+    // Maximum allowed age is nominally given by the parameter, but
+    // to limit the possibility of accidental repeated scans, impose
+    // a minimum TTL of 1 second.
     uint64_t max_age_micros =
-        static_cast<uint64_t>(std::min(maximum_age_in_seconds, 0)) * 1000000U;
-    // But we will re-scan more frequently if it means scanning < 1%
-    // of the time and no more than once per second.
-    max_age_micros = std::min(
-        max_age_micros,
-        std::max(uint64_t{1000000},
-                 100U * (last_end_time_micros_ - last_start_time_micros_)));
+        static_cast<uint64_t>(std::max(maximum_age_in_seconds, 1)) * 1000000U;
     uint64_t start_time_micros = clock_->NowMicros();
     if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
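
For illustration, a stripped-down sketch of the age-gating pattern the hunk
above implements. SimpleStatsCache, its members, and the clock/scan callbacks
are hypothetical stand-ins, not RocksDB's CacheEntryStatsCollector API:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <mutex>
#include <utility>

class SimpleStatsCache {
 public:
  explicit SimpleStatsCache(std::function<uint64_t()> now_micros)
      : now_micros_(std::move(now_micros)) {}

  // Returns the cached value if it is younger than max_age_seconds,
  // otherwise recomputes it with the (expensive) scan function.
  uint64_t GetStats(int max_age_seconds,
                    const std::function<uint64_t()>& expensive_scan) {
    std::lock_guard<std::mutex> lock(mutex_);
    // Impose a 1-second floor, mirroring std::max(maximum_age_in_seconds, 1).
    uint64_t max_age_micros =
        static_cast<uint64_t>(std::max(max_age_seconds, 1)) * 1000000U;
    uint64_t now = now_micros_();
    if (now - last_end_time_micros_ > max_age_micros) {
      cached_value_ = expensive_scan();
      last_end_time_micros_ = now_micros_();
    }
    return cached_value_;
  }

 private:
  std::function<uint64_t()> now_micros_;
  std::mutex mutex_;
  uint64_t last_end_time_micros_ = 0;
  uint64_t cached_value_ = 0;
};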

db/db_block_cache_test.cc

@@ -930,9 +930,9 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       ++iterations_tested;
       Options options = CurrentOptions();
+      SetTimeElapseOnlySleepOnReopen(&options);
       options.create_if_missing = true;
       options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.stats_dump_period_sec = 0;
       options.max_open_files = 13;
       options.table_cache_numshardbits = 0;
@@ -972,12 +972,21 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
       expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
       EXPECT_EQ(expected, GetCacheEntryRoleCounts());
+      std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
       // First access only filters
       ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
       expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
       if (partition) {
         expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
       }
+      // Within some time window, we will get cached entry stats
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+      // Not enough to force a miss
+      env_->MockSleepForSeconds(10);
+      EXPECT_EQ(prev_expected, GetCacheEntryRoleCounts());
+      // Enough to force a miss
+      env_->MockSleepForSeconds(1000);
       EXPECT_EQ(expected, GetCacheEntryRoleCounts());
       // Now access index and data block
@@ -988,6 +997,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
         expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
       }
       expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      env_->MockSleepForSeconds(1000);
       EXPECT_EQ(expected, GetCacheEntryRoleCounts());
       // The same for other file
@@ -998,6 +1008,7 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
         expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
       }
       expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+      env_->MockSleepForSeconds(1000);
       EXPECT_EQ(expected, GetCacheEntryRoleCounts());
       // Also check the GetProperty interface
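
A hypothetical test sketch in the same spirit as the mock sleeps above: it
assumes the SimpleStatsCache sketch from earlier is pasted into the same file
and drives it with a manually advanced clock to cover the three cases the hunk
checks (hit within the age limit, still a hit after a short sleep, miss after
a long sleep):

#include <cassert>
#include <cstdint>

int main() {
  // Pretend wall-clock time in microseconds, advanced manually like the
  // mock sleeps in the test above.
  uint64_t fake_now = 1000000000000ULL;
  int scans = 0;
  SimpleStatsCache stats_cache([&fake_now] { return fake_now; });
  auto scan = [&scans]() -> uint64_t { return static_cast<uint64_t>(++scans); };

  stats_cache.GetStats(/*max_age_seconds=*/100, scan);  // first call must scan
  assert(scans == 1);

  fake_now += 10ULL * 1000000;    // +10 s: still within the age limit
  stats_cache.GetStats(100, scan);
  assert(scans == 1);             // served from cache, no rescan

  fake_now += 1000ULL * 1000000;  // +1000 s: past the age limit
  stats_cache.GetStats(100, scan);
  assert(scans == 2);             // forced a fresh scan
  return 0;
}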

db/internal_stats.cc

@@ -537,6 +537,8 @@ Status InternalStats::CollectCacheEntryStats() {
   std::shared_ptr<Collector> collector;
   Status s = Collector::GetShared(block_cache, clock_, &collector);
   if (s.ok()) {
+    // TODO: use a max age like stats_dump_period_sec / 2, but it's
+    // difficult to access that setting from here with just cfd_
     collector->GetStats(&cache_entry_stats);
   } else {
     // Block cache likely under pressure. Scanning could make it worse,
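
Finally, a hypothetical caller-side sketch of the pattern in this hunk
(GetSharedCollector, StatsCollector, and the stats_dump_period_sec wiring are
illustrative assumptions, not the PR's code): only request stats when a shared
collector is available, and, as the TODO suggests, derive the max age from the
stats dump period:

#include <cstdint>
#include <memory>

struct CacheEntryStats {
  // Per-role entry counts and sizes would live here.
};

class StatsCollector {
 public:
  // Serves cached stats unless they are older than max_age_seconds.
  void GetStats(CacheEntryStats* out, int max_age_seconds) {
    (void)out;
    (void)max_age_seconds;
  }
};

// May return nullptr, e.g. when the block cache is under heavy pressure and
// setting up a collector entry in it fails.
std::shared_ptr<StatsCollector> GetSharedCollector() {
  return std::make_shared<StatsCollector>();
}

CacheEntryStats CollectCacheEntryStats(uint64_t stats_dump_period_sec) {
  CacheEntryStats stats;
  if (std::shared_ptr<StatsCollector> collector = GetSharedCollector()) {
    // Per the TODO in the hunk: tie freshness to how often stats are
    // reported; the collector enforces its own 1-second minimum.
    collector->GetStats(&stats, static_cast<int>(stats_dump_period_sec / 2));
  }
  // Otherwise return what we have; forcing a scan could make pressure worse.
  return stats;
}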