diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index d779e54b4..b63afc1e0 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -79,31 +79,52 @@ static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
 DEFINE_bool(use_clock_cache, false, "");
 
 // ## BEGIN stress_cache_key sub-tool options ##
+// See class StressCacheKey below.
 DEFINE_bool(stress_cache_key, false,
             "If true, run cache key stress test instead");
-DEFINE_uint32(sck_files_per_day, 2500000,
-              "(-stress_cache_key) Simulated files generated per day");
-DEFINE_uint32(sck_duration, 90,
+DEFINE_uint32(
+    sck_files_per_day, 2500000,
+    "(-stress_cache_key) Simulated files generated per simulated day");
+// NOTE: Giving each run a specified lifetime, rather than e.g. "until
+// first collision", ensures equal skew from start-up, when collisions are
+// less likely.
+DEFINE_uint32(sck_days_per_run, 90,
               "(-stress_cache_key) Number of days to simulate in each run");
+// NOTE: The number of observed collisions directly affects the relative
+// accuracy of the predicted probabilities. 15 observations should be well
+// within factor-of-2 accuracy.
 DEFINE_uint32(
     sck_min_collision, 15,
     "(-stress_cache_key) Keep running until this many collisions seen");
+// sck_file_size_mb can be thought of as average file size. The simulation is
+// not precise enough to care about the distribution of file sizes; other
+// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
+// indicate the distribution only makes a small difference (e.g. < 2x factor).
 DEFINE_uint32(
     sck_file_size_mb, 32,
     "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
 DEFINE_uint32(sck_reopen_nfiles, 100,
-              "(-stress_cache_key) Re-opens DB average every n files");
+              "(-stress_cache_key) Simulate DB re-open average every n files");
+DEFINE_uint32(sck_restarts_per_day, 24,
+              "(-stress_cache_key) Average simulated process restarts per day "
+              "(across DBs)");
 DEFINE_uint32(
-    sck_restarts_per_day, 24,
-    "(-stress_cache_key) Simulated process restarts per day (across DBs)");
-DEFINE_uint32(sck_db_count, 100,
-              "(-stress_cache_key) Parallel DBs in operation");
-DEFINE_uint32(sck_table_bits, 20,
-              "(-stress_cache_key) Log2 number of tracked files");
-DEFINE_uint32(sck_keep_bits, 50,
-              "(-stress_cache_key) Number of cache key bits to keep");
+    sck_db_count, 100,
+    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
+DEFINE_uint32(
+    sck_table_bits, 20,
+    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
+// sck_keep_bits being well below full 128 bits amplifies the collision
+// probability so that the true probability can be estimated through observed
+// collisions. (More explanation below.)
+DEFINE_uint32(
+    sck_keep_bits, 50,
+    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
+// sck_randomize is used to validate whether cache key is performing "better
+// than random." Even with this setting, file offsets are not randomized.
 DEFINE_bool(sck_randomize, false,
             "(-stress_cache_key) Randomize (hash) cache key");
+// See https://github.com/facebook/rocksdb/pull/9058
 DEFINE_bool(sck_footer_unique_id, false,
             "(-stress_cache_key) Simulate using proposed footer unique id");
 // ## END stress_cache_key sub-tool options ##
@@ -583,20 +604,97 @@ class CacheBench {
   }
 };
 
-// TODO: better description (see PR #9126 for some info)
+// cache_bench -stress_cache_key is an independent embedded tool for
+// estimating the probability of CacheKey collisions through simulation.
+// At a high level, it simulates generating SST files over many months,
+// keeping them in the DB and/or cache for some lifetime while staying
+// under resource caps, and checking for any cache key collisions that
+// arise among the set of live files. For efficient simulation, we make
+// some simplifying "pessimistic" assumptions (that only increase the
+// chance of the simulation reporting a collision relative to the chance
+// of collision in practice):
+// * Every generated file has a cache entry for every byte offset in the
+//   file (contiguous range of cache keys)
+// * All of every file is cached for its entire lifetime. (Here "lifetime"
+//   is technically the union of DB and Cache lifetime, though we only
+//   model a generous DB lifetime, where space usage is always maximized.
+//   In an effective Cache, lifetime in cache can only substantially exceed
+//   lifetime in DB if there is little cache activity; cache activity is
+//   required to hit cache key collisions.)
+//
+// It would be possible to track an exact set of cache key ranges for the
+// set of live files, but we would have no hope of observing collisions
+// (overlap in live files) in our simulation. We need to employ some way
+// of amplifying collision probability that allows us to predict the real
+// collision probability by extrapolation from observed collisions. Our
+// basic approach is to reduce each cache key range down to some smaller
+// number of bits, limited to bits that are shared over the whole range.
+// Now we can observe collisions using a set of smaller stripped-down
+// (reduced) cache keys. Let's do some case analysis to understand why this
+// works:
+// * No collision in reduced key - because the reduction is a pure function,
+//   this implies no collision in the full keys
+// * Collision detected between two reduced keys - either
+//   * The reduction has dropped some structured uniqueness info (from one of
+//     session counter or file number; file offsets are never materialized
+//     here). This can only artificially inflate the observed and
+//     extrapolated collision probabilities. We only have to worry about this
+//     in designing the reduction.
+//   * The reduction has preserved all the structured uniqueness in the cache
+//     key, which means either
+//     * REJECTED: We have a uniqueness bug in generating cache keys, where
+//       structured uniqueness info should have been different but isn't. In
+//       such a case, increasing by 1 the number of bits kept after reduction
+//       would not reduce observed probabilities by half. (In our
+//       observations, the probabilities are reduced approximately by half.)
+//     * ACCEPTED: The lost unstructured uniqueness in the key determines the
+//       probability that an observed collision would imply an overlap in
+//       ranges. In short, dropping n bits from the key would increase
+//       collision probability by 2**n, assuming those n bits have full
+//       entropy in unstructured uniqueness.
+//
+// But we also have to account for the key ranges based on file size.
+// If file sizes are roughly 2**b offsets, using XOR in 128-bit cache keys
+// for "ranges", we know from other simulations (see
+// https://github.com/pdillinger/unique_id/) that this is roughly equivalent
+// to (less than 2x higher collision probability) using a cache key of size
+// 128 - b bits for the whole file. (This is the only place we make an
+// "optimistic" assumption, which is more than offset by the real
+// implementation stripping off 2 lower bits from block byte offsets for
+// cache keys. The simulation assumes byte offsets, which is net pessimistic.)
+//
+// So to accept the extrapolation as valid, we need to be confident that all
+// "lost" bits, excluding those covered by file offset, are full entropy.
+// Recall that we have assumed (verifiably, safely) that other structured data
+// (file number and session counter) are kept, not lost. Based on the
+// implementation comments for OffsetableCacheKey, the only potential hole
+// here is that we only have ~103 bits of entropy in "all new" session IDs,
+// and in extreme cases, there might be only 1 DB ID. However, because the
+// upper ~39 bits of session ID are hashed, the combination of file number and
+// file offset only has to add to 25 bits (or more) to ensure full entropy in
+// unstructured uniqueness lost in the reduction. A typical file size of 32MB
+// suffices (at least for simulation purposes, where we assume each file
+// offset occupies a cache key).
+//
+// Example results in comments on OffsetableCacheKey.
 class StressCacheKey {
  public:
   void Run() {
     if (FLAGS_sck_footer_unique_id) {
+      // Proposed footer unique IDs are DB-independent and session-independent
+      // (but process-dependent), which is most easily simulated here by
+      // assuming 1 DB and (later below) no session resets without process
+      // reset.
       FLAGS_sck_db_count = 1;
     }
 
+    // Describe the simulated workload
     uint64_t mb_per_day =
         uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
     printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n",
            FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
                std::pow(2.0, FLAGS_sck_table_bits),
            mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
+    // For extrapolating probability of any collisions from a number of
+    // observed collisions
     multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
                   (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
     printf(
@@ -606,6 +704,9 @@ class StressCacheKey {
     restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;
     double without_ejection =
         std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
+    // This should be a lower bound for -sck_randomize, usually a terribly
+    // rough lower bound.
+    // If observation is worse than this, then something has gone wrong.
     printf(
         "Without ejection, expect random collision after %g days (%g "
         "corrected)\n",
@@ -613,30 +714,36 @@ class StressCacheKey {
     double with_full_table =
         std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
         FLAGS_sck_files_per_day;
+    // This is an alternate lower bound for -sck_randomize, usually pretty
+    // accurate. Our cache keys should usually perform "better than random"
+    // but always no worse. (If observation is substantially worse than this,
+    // then something has gone wrong.)
     printf(
         "With ejection and full table, expect random collision after %g "
         "days (%g corrected)\n",
         with_full_table, with_full_table * multiplier_);
     collisions_ = 0;
+    // Run until sufficient number of observed collisions.
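+    // (As a rough check on the reference numbers above, with the default
+    // settings of sck_keep_bits=50, sck_table_bits=20, and 2.5M files/day:
+    // without_ejection is about 2^25 / 2.5e6 ~= 13.4 days and with_full_table
+    // is about 2^30 / 2.5e6 ~= 429 days, before the "corrected" multiplier.)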
     for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
       RunOnce();
       if (collisions_ == 0) {
         printf(
             "No collisions after %d x %u days "
             " \n",
-            i, FLAGS_sck_duration);
+            i, FLAGS_sck_days_per_run);
       } else {
-        double est = 1.0 * i * FLAGS_sck_duration / collisions_;
+        double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
         printf("%" PRIu64
                " collisions after %d x %u days, est %g days between (%g "
                "corrected) \n",
-               collisions_, i, FLAGS_sck_duration, est, est * multiplier_);
+               collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
       }
     }
   }
 
   void RunOnce() {
+    // Re-initialize simulated state
     const size_t db_count = FLAGS_sck_db_count;
     dbs_.reset(new TableProperties[db_count]{});
     const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
@@ -644,7 +751,11 @@ class StressCacheKey {
     if (FLAGS_sck_keep_bits > 64) {
       FLAGS_sck_keep_bits = 64;
     }
+
+    // Details of which bits are dropped in reduction
     uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
+    // Shift away fewer potential file number bits (b) than potential
+    // session counter bits (a).
     uint32_t shift_away_b = shift_away / 3;
     uint32_t shift_away_a = shift_away - shift_away_b;
@@ -655,62 +766,78 @@ class StressCacheKey {
     Random64 r{std::random_device{}()};
     uint64_t max_file_count =
-        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_duration;
-    uint64_t file_count = 0;
+        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
+    uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
     uint32_t report_count = 0;
     uint32_t collisions_this_run = 0;
-    // Round robin through DBs
-    for (size_t db_i = 0;; ++db_i) {
+    size_t db_i = 0;
+
+    for (uint64_t file_count = 1; file_count <= max_file_count;
+         ++file_count, ++db_i) {
+      // Round-robin through DBs (this is faster than %)
       if (db_i >= db_count) {
        db_i = 0;
       }
-      if (file_count >= max_file_count) {
-        break;
-      }
+      // Any other periodic actions before simulating next file
       if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
        ResetSession(db_i);
      } else if (r.OneIn(restart_nfiles_)) {
        ResetProcess();
      }
+      // Simulate next file
       OffsetableCacheKey ock;
       dbs_[db_i].orig_file_number += 1;
-      // skip some file numbers, unless 1 DB so that that can simulate
-      // better (DB-independent) unique IDs
-      if (db_count > 1) {
+      // Skip some file numbers for other file kinds, except with footer
+      // unique IDs, where orig_file_number tracks the process-wide generated
+      // SST file count.
+      if (!FLAGS_sck_footer_unique_id) {
         dbs_[db_i].orig_file_number += (r.Next() & 3);
       }
-      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], "", 42, 42, &ock);
+      bool is_stable;
+      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
+                                         /* ignored */ 42, file_size, &ock,
+                                         &is_stable);
+      assert(is_stable);
+      // Get a representative cache key, which later we analytically
+      // generalize to a range.
       CacheKey ck = ock.WithOffset(0);
-      uint64_t stripped;
+      uint64_t reduced_key;
       if (FLAGS_sck_randomize) {
-        stripped = GetSliceHash64(ck.AsSlice()) >> shift_away;
+        reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
       } else if (FLAGS_sck_footer_unique_id) {
+        // Special case: keep only file number, not session counter
         uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
         uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-        stripped = (uint64_t{a} << 32) + b;
+        reduced_key = (uint64_t{a} << 32) + b;
       } else {
+        // Try to keep file number and session counter (shift away other bits)
         uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
         uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
-        stripped = (uint64_t{a} << 32) + b;
+        reduced_key = (uint64_t{a} << 32) + b;
       }
-      if (stripped == 0) {
-        // Unlikely, but we need to exclude tracking this value
+      if (reduced_key == 0) {
+        // Unlikely, but we need to exclude tracking this value because we
+        // use it to mean "empty" in the table. This case is OK as long as we
+        // don't hit it often.
        printf("Hit Zero! \n");
+        file_count--;
        continue;
      }
-      file_count++;
-      uint64_t h = NPHash64(reinterpret_cast<char*>(&stripped), 8);
-      // Skew lifetimes
+      uint64_t h =
+          NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
+      // Skew expected lifetimes, for high (super-Poisson) variance in actual
+      // lifetimes.
       size_t pos = std::min(Lower32of64(h) & table_mask,
                             Upper32of64(h) & table_mask);
-      if (table_[pos] == stripped) {
+      if (table_[pos] == reduced_key) {
         collisions_this_run++;
-        // To predict probability of no collisions, we have to get rid of
-        // correlated collisions, which this takes care of:
+        // Our goal is to predict probability of no collisions, not expected
+        // number of collisions. To make the distinction, we have to get rid
+        // of observing correlated collisions, which this takes care of:
         ResetProcess();
       } else {
-        // Replace
-        table_[pos] = stripped;
+        // Replace (end of lifetime for file that was in this slot)
+        table_[pos] = reduced_key;
       }
       if (++report_count == FLAGS_sck_files_per_day) {
@@ -748,6 +875,8 @@ class StressCacheKey {
       ResetSession(i);
     }
     if (FLAGS_sck_footer_unique_id) {
+      // For footer unique ID, this tracks process-wide generated SST file
+      // count.
       dbs_[0].orig_file_number = 0;
     }
   }
diff --git a/cache/cache_key.cc b/cache/cache_key.cc
index 17445924f..f99921b88 100644
--- a/cache/cache_key.cc
+++ b/cache/cache_key.cc
@@ -35,7 +35,8 @@ CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
 CacheKey CacheKey::CreateUniqueForProcessLifetime() {
   // To avoid colliding with CreateUniqueForCacheLifetime, assuming
   // Cache::NewId counts up from zero, here we count down from UINT64_MAX.
-  // If this ever becomes a point of contention, we could use CoreLocalArray.
+  // If this ever becomes a point of contention, we could sub-divide the
+  // space and use CoreLocalArray.
   static std::atomic<uint64_t> counter{UINT64_MAX};
   uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
   // Ensure we don't collide with CreateUniqueForCacheLifetime
@@ -118,9 +119,10 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // "structured" uniqueness hasn't been cloned. Using a static
 // SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
 // "all new" session id when a new process uses RocksDB. (Between processes,
-// we don't know if a DB or other persistent storage has been cloned.) Within
-// a process, only the session_lower of the db_session_id changes
-// incrementally ("structured" uniqueness).
+// we don't know if a DB or other persistent storage has been cloned. We
+// assume that if VM hot cloning is used, subsequently generated SST files
+// do not interact.) Within a process, only the session_lower of the
+// db_session_id changes incrementally ("structured" uniqueness).
 //
 // This basically means that our offsets, counters and file numbers allow us
 // to do somewhat "better than random" (birthday paradox) while in the
@@ -168,12 +170,83 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 // data from the last 180 days is in cache, but NOT the other assumption
 // for the 1 in a trillion estimate above).
 //
-// Conclusion: Burning through session IDs, particularly "all new" IDs that
-// only arise when a new process is started, is the only way to have a
-// plausible chance of cache key collision. When processes live for hours
-// or days, the chance of a cache key collision seems more plausibly due
-// to bad hardware than to bad luck in random session ID data.
 //
+// Collision probability estimation through simulation:
+// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
+// activity over many months, by making some pessimistic simplifying
+// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
+// Here is some sample output with
+// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
+//
+//   Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
+//   Multiply by 9.22337e+18 to correct for simulation losses (but still
+//   assume whole file cached)
+//
+// These come from default settings of 2.5M files per day of 32 MB each, and
+// `-sck_keep_bits=40` means that to represent a single file, we are only
+// keeping 40 bits of the 128-bit (base) cache key. With a file size of 2**25
+// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
+// about 9 billion billion times more prone to collision than reality.
+//
+// More default assumptions, relatively pessimistic:
+// * 100 DBs in same process (doesn't matter much)
+// * Re-open DB in same process (new session ID related to old session ID) on
+//   average every 100 files generated
+// * Restart process (all new session IDs unrelated to old) 24 times per day
+//
+// After enough data, we get a result at the end (-sck_keep_bits=40):
+//
+//   (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between
+//   (9.76592e+19 corrected)
+//
+// If we believe the (pessimistic) simulation and the mathematical
+// extrapolation, we would need to run a billion machines all for 97 billion
+// days to expect a cache key collision. To help verify that our extrapolation
+// ("corrected") is robust, we can make our simulation more precise with
+// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
+// collision data:
+//
+//   (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between
+//   (1.03763e+20 corrected)
+//   (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between
+//   (1.09224e+20 corrected)
+//
+// The extrapolated prediction is very close. If anything, we might have some
+// very small losses of structured data (see class StressCacheKey in
+// cache_bench_tool.cc) leading to a more accurate & more attractive
+// prediction with more bits kept.
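+//
+// (For reference, "corrected" is just the observed mean days between
+// collisions times the correction multiplier printed at start-up. With
+// -sck_keep_bits=40 and 32 MB files, that multiplier is
+// 2**(128-40) / 2**25 = 2**63 ~= 9.22337e+18, so the observed 10.5882 days
+// extrapolates to 10.5882 * 9.22337e+18 ~= 9.766e+19 days, matching the
+// 9.76592e+19 figure above.)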
+//
+// With the `-sck_randomize` option, we can see that typical workloads like
+// above have lower collision probability than "random" cache keys (note:
+// offsets still non-randomized) by a modest amount (roughly 20x less
+// collision prone than random), which should make us reasonably comfortable
+// even in "degenerate" cases (e.g. repeatedly launching a process to
+// generate 1 file with SstFileWriter):
+//
+//   (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days
+//   between (4.21372e+18 corrected)
+//
+// We can see that with more frequent process restarts (all new session IDs),
+// we get closer to the "random" cache key performance:
+//
+//   (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
+//   (5.92931e+18 corrected)
+//
+// Other tests have been run to validate that other conditions behave as
+// expected, never behaving "worse than random" unless we start chopping off
+// structured data.
+//
+// Conclusion: Even in extreme cases rapidly burning through the "all new"
+// IDs that only arise when a new process is started, the chance of any cache
+// key collisions in a giant fleet of machines is negligible. Especially when
+// processes live for hours or days, a cache key collision is more plausibly
+// due to bad hardware than to bad luck in random session ID data. Software
+// defects are surely more likely to cause corruption than either of those.
+//
+// TODO: Nevertheless / regardless, an efficient way to detect (and thus
+// quantify) block cache corruptions, including collisions, should be added.
 OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                        const std::string &db_session_id,
                                        uint64_t file_number,
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 71cf49f15..fc99adaaf 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -9,17 +9,26 @@
 #include
 #include
 #include
+#include
 #include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
 #include "cache/lru_cache.h"
 #include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
 #include "db/db_test_util.h"
+#include "env/unique_id_gen.h"
 #include "port/stack_trace.h"
 #include "rocksdb/persistent_cache.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/unique_id_impl.h"
 #include "util/compression.h"
 #include "util/defer.h"
+#include "util/hash.h"
+#include "util/math.h"
 #include "util/random.h"
 #include "utilities/fault_injection_fs.h"
@@ -1714,6 +1723,238 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+class CacheKeyTest : public testing::Test {
+ public:
+  void SetupStableBase() {
+    // Like SemiStructuredUniqueIdGen::GenerateNext
+    tp_.db_session_id = EncodeSessionId(
+        base_session_upper_, base_session_lower_ ^ session_counter_);
+    tp_.db_id = ToString(db_id_);
+    tp_.orig_file_number = file_number_;
+    bool is_stable;
+    std::string cur_session_id = "";  // ignored
+    uint64_t cur_file_number = 42;    // ignored
+    BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
+                                       file_size_, &base_cache_key_,
+                                       &is_stable);
+    ASSERT_TRUE(is_stable);
+  }
+  CacheKey WithOffset(uint64_t offset) {
+    return BlockBasedTable::GetCacheKey(base_cache_key_,
+                                        BlockHandle(offset, /*size*/ 5));
+  }
+
+ protected:
+  OffsetableCacheKey base_cache_key_;
+  TableProperties tp_;
+  uint64_t file_size_ = 0;
+  uint64_t base_session_upper_ = 0;
+  uint64_t base_session_lower_ = 0;
+  uint64_t session_counter_ = 0;
+  uint64_t file_number_ = 0;
+  uint64_t db_id_ = 0;
+};
+
+namespace {
+template <typename T>
+int CountBitsDifferent(const T& t1, const T& t2) {
+  int diff = 0;
+  const uint8_t* p1 = reinterpret_cast<const uint8_t*>(&t1);
+  const uint8_t* p2 = reinterpret_cast<const uint8_t*>(&t2);
+  static_assert(sizeof(*p1) == 1, "Expecting uint8_t byte");
+  for (size_t i = 0; i < sizeof(T); ++i) {
+    diff += BitsSetToOne(p1[i] ^ p2[i]);
+  }
+  return diff;
+}
+
+}  // namespace
+
+TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
+  // We have to generate our own session IDs for simulation purposes in other
+  // tests. Here we verify that the DBImpl implementation seems to match our
+  // construction here, which uses the lowest XORed-in bits as the "session
+  // counter."
+  std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+  uint64_t upper1, upper2, lower1, lower2;
+  ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
+  ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
+  // Because generated in same process
+  ASSERT_EQ(upper1, upper2);
+  // Unless we generate > 4 billion session IDs in this process...
+  ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
+  // But they must be different somewhere
+  ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
+}
+
+TEST_F(CacheKeyTest, StandardEncodingLimit) {
+  base_session_upper_ = 1234;
+  base_session_lower_ = 5678;
+  session_counter_ = 42;
+  file_number_ = 42;
+  db_id_ = 1234;
+
+  file_size_ = 42;
+  SetupStableBase();
+  CacheKey ck1;
+  ASSERT_TRUE(ck1.IsEmpty());
+  ck1 = WithOffset(0);
+  ASSERT_FALSE(ck1.IsEmpty());
+
+  // Should use same encoding
+  file_size_ = BlockBasedTable::kMaxFileSizeStandardEncoding;
+  SetupStableBase();
+  CacheKey ck2 = WithOffset(0);
+  ASSERT_EQ(CountBitsDifferent(ck1, ck2), 0);
+
+  // Should use different encoding
+  ++file_size_;
+  SetupStableBase();
+  CacheKey ck3 = WithOffset(0);
+  ASSERT_GT(CountBitsDifferent(ck2, ck3), 0);
+}
+
+TEST_F(CacheKeyTest, Encodings) {
+  // Claim from cache_key.cc:
+  // In fact, if our SST files are all < 4TB (see
+  // BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
+  // in a single process are guaranteed to have unique cache keys,
+  // unless/until number session ids * max file number = 2**86, e.g. 1
+  // trillion DB::Open in a single process and 64 trillion files generated.
+
+  // We can generalize that. For
+  // * z bits in maximum file size
+  // * n bits in maximum file number
+  // * s bits in maximum session counter
+  // uniqueness is guaranteed at least when all of these hold:
+  // * z + n + s <= 121 (128 - 2 meta + 2 offset trim - (8-1) byte granularity
+  //   in encoding)
+  // * n + s <= 86 (encoding limitation)
+  // * s <= 62 (because of 2-bit metadata)
+
+  // We can verify this indirectly, by checking how input bits get into the
+  // cache key, but we have to be mindful that for sufficiently large file
+  // sizes, different encodings might be used. For cases mixing large and
+  // small files, we also have to verify uniqueness between encodings.
+
+  // Going through all combinations would be a little expensive, so we test
+  // only one random "stripe" of the configuration space per run.
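+  // (With kStripeBits = 8 below, there are 256 stripes, so a single run
+  // covers roughly 1/256 of the configurations.)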
+  constexpr uint32_t kStripeBits = 8;
+  constexpr uint32_t kStripeMask = (uint32_t{1} << kStripeBits) - 1;
+
+  // Also cycle through stripes on repeated runs (not thread safe)
+  static uint32_t stripe =
+      static_cast<uint32_t>(std::random_device{}()) & kStripeMask;
+  stripe = (stripe + 1) & kStripeMask;
+
+  fprintf(stderr, "%u\n", stripe);
+
+  // We are going to randomly initialize some values which *should* not
+  // affect the result
+  Random64 r{std::random_device{}()};
+
+  int max_num_encodings = 0;
+  uint32_t config_num = 0;
+  uint32_t session_counter_bits, file_number_bits, max_file_size_bits;
+
+  // Inner loop body, used later in a loop over configurations
+  auto TestConfig = [&]() {
+    base_session_upper_ = r.Next();
+    base_session_lower_ = r.Next();
+    session_counter_ = r.Next();
+    if (session_counter_bits < 64) {
+      // Avoid shifting UB
+      session_counter_ = session_counter_ >> 1 >> (63 - session_counter_bits);
+    }
+    file_number_ = r.Next() >> (64 - file_number_bits);
+    // Need two bits set to avoid temporary zero below
+    if (BitsSetToOne(file_number_) < 2) {
+      file_number_ = 3;
+    }
+    db_id_ = r.Next();
+
+    // Work around clang-analyzer, which thinks an empty last_base is garbage
+    CacheKey last_base = CacheKey::CreateUniqueForProcessLifetime();
+
+    std::unordered_set<std::string> seen;
+    int num_encodings = 0;
+
+    // Loop over encodings by increasing file size bits
+    for (uint32_t file_size_bits = 1; file_size_bits <= max_file_size_bits;
+         ++file_size_bits) {
+      file_size_ = uint64_t{1} << (file_size_bits - 1);
+      SetupStableBase();
+      CacheKey new_base = WithOffset(0);
+      if (CountBitsDifferent(last_base, new_base) == 0) {
+        // Same as previous encoding
+        continue;
+      }
+
+      // New encoding
+      ++num_encodings;
+      ASSERT_TRUE(seen.insert(new_base.AsSlice().ToString()).second);
+      last_base = new_base;
+      for (uint32_t i = 0; i < file_size_bits; ++i) {
+        CacheKey ck = WithOffset(uint64_t{1} << i);
+        if (i < 2) {
+          // These cases are not relevant: the implementation drops the two
+          // lowest offset bits because there are always at least 5 bytes
+          // between blocks.
+          ASSERT_EQ(CountBitsDifferent(ck, new_base), 0);
+        } else {
+          // Normal case
+          // Exactly 1 bit different from the base and never seen before
+          // implies the bit is encoded into the cache key without
+          // overlapping other structured data.
+          ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
+          ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
+        }
+      }
+      for (uint32_t i = 0; i < session_counter_bits; ++i) {
+        SaveAndRestore<uint64_t> tmp(&session_counter_,
+                                     session_counter_ ^ (uint64_t{1} << i));
+        SetupStableBase();
+        CacheKey ck = WithOffset(0);
+        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
+        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
+      }
+      for (uint32_t i = 0; i < file_number_bits; ++i) {
+        SaveAndRestore<uint64_t> tmp(&file_number_,
+                                     file_number_ ^ (uint64_t{1} << i));
+        SetupStableBase();
+        CacheKey ck = WithOffset(0);
+        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
+        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
+      }
+      max_num_encodings = std::max(max_num_encodings, num_encodings);
+    }
+  };
+
+  // Loop over configurations and test those in stripe
+  for (session_counter_bits = 0; session_counter_bits <= 62;
+       ++session_counter_bits) {
+    uint32_t max_file_number_bits =
+        std::min(uint32_t{64}, uint32_t{86} - session_counter_bits);
+    // Start with 2 to avoid file_number_ == 0 in testing
+    for (file_number_bits = 2; file_number_bits <= max_file_number_bits;
+         ++file_number_bits) {
+      uint32_t max_max_file_size_bits =
+          std::min(uint32_t{64},
+                   uint32_t{121} - file_number_bits - session_counter_bits);
+      for (max_file_size_bits = 1;
+           max_file_size_bits <= max_max_file_size_bits;
+           ++max_file_size_bits) {
+        if ((config_num++ & kStripeMask) == stripe) {
+          TestConfig();
+        }
+      }
+    }
+  }
+
+  // Make sure the current implementation is exercised
+  ASSERT_EQ(max_num_encodings, 4);
+}
+
 INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
                         ::testing::Combine(::testing::Bool(),
                                            ::testing::Bool()));
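For readers who want to experiment with the amplification idea outside of RocksDB, here is a minimal, self-contained sketch of the collision-counting core that StressCacheKey::RunOnce() implements. It is illustrative only: the constants and names below are made up for the example, and structured uniqueness, session/process resets, and reporting are omitted.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Stand-ins for -sck_keep_bits, -sck_table_bits, and the number of
// simulated files; not RocksDB flags.
constexpr unsigned kKeepBits = 24;
constexpr unsigned kTableBits = 12;
constexpr uint64_t kSimulatedFiles = uint64_t{1} << 26;

int main() {
  std::mt19937_64 rng{std::random_device{}()};
  std::vector<uint64_t> table(size_t{1} << kTableBits, 0);
  const uint64_t table_mask = table.size() - 1;
  uint64_t collisions = 0;

  for (uint64_t i = 0; i < kSimulatedFiles; ++i) {
    // Stand-in for a reduced cache key: keep only kKeepBits bits of
    // (assumed full-entropy) unstructured uniqueness.
    uint64_t reduced_key = rng() >> (64 - kKeepBits);
    if (reduced_key == 0) {
      continue;  // 0 is reserved to mean "empty slot" in the table
    }
    // Each slot holds one live file; overwriting a slot ends the previous
    // file's lifetime. Finding the same reduced key already in the slot is
    // an observed (reduced) cache key collision between live files.
    uint64_t pos = reduced_key & table_mask;
    if (table[pos] == reduced_key) {
      ++collisions;
      // The real tool resets all simulated state here so that observed
      // collisions stay uncorrelated (the goal is P(no collision), not the
      // expected number of collisions).
      std::fill(table.begin(), table.end(), uint64_t{0});
    } else {
      table[pos] = reduced_key;
    }
  }
  printf("observed (reduced-key) collisions: %llu\n",
         static_cast<unsigned long long>(collisions));
  return 0;
}

The extrapolation step then multiplies the observed collision-free interval by roughly 2**(128 - kKeepBits) divided by the simulated file size in bytes, as described in the cache_key.cc comments above.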