Enhance new cache key testing & comments (#9329)
Summary: Follow-up to https://github.com/facebook/rocksdb/issues/9126 Added new unit tests to validate some of the claims of guaranteed uniqueness within certain large bounds. Also cleaned up the cache_bench -stress-cache-key tool with better comments and description. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9329 Test Plan: no changes to production code Reviewed By: mrambacher Differential Revision: D33269328 Pulled By: pdillinger fbshipit-source-id: 3a2b684a6b2b15f79dc872e563e3d16563be26de
This commit is contained in:
parent
42e0751b3a
commit
afc280fdfd
211
cache/cache_bench_tool.cc
vendored
211
cache/cache_bench_tool.cc
vendored
@ -79,31 +79,52 @@ static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
|
||||
DEFINE_bool(use_clock_cache, false, "");
|
||||
|
||||
// ## BEGIN stress_cache_key sub-tool options ##
|
||||
// See class StressCacheKey below.
|
||||
DEFINE_bool(stress_cache_key, false,
|
||||
"If true, run cache key stress test instead");
|
||||
DEFINE_uint32(sck_files_per_day, 2500000,
|
||||
"(-stress_cache_key) Simulated files generated per day");
|
||||
DEFINE_uint32(sck_duration, 90,
|
||||
DEFINE_uint32(
|
||||
sck_files_per_day, 2500000,
|
||||
"(-stress_cache_key) Simulated files generated per simulated day");
|
||||
// NOTE: Giving each run a specified lifetime, rather than e.g. "until
|
||||
// first collision" ensures equal skew from start-up, when collisions are
|
||||
// less likely.
|
||||
DEFINE_uint32(sck_days_per_run, 90,
|
||||
"(-stress_cache_key) Number of days to simulate in each run");
|
||||
// NOTE: The number of observed collisions directly affects the relative
|
||||
// accuracy of the predicted probabilities. 15 observations should be well
|
||||
// within factor-of-2 accuracy.
|
||||
DEFINE_uint32(
|
||||
sck_min_collision, 15,
|
||||
"(-stress_cache_key) Keep running until this many collisions seen");
|
||||
// sck_file_size_mb can be thought of as average file size. The simulation is
|
||||
// not precise enough to care about the distribution of file sizes; other
|
||||
// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
|
||||
// indicate the distribution only makes a small difference (e.g. < 2x factor)
|
||||
DEFINE_uint32(
|
||||
sck_file_size_mb, 32,
|
||||
"(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
|
||||
DEFINE_uint32(sck_reopen_nfiles, 100,
|
||||
"(-stress_cache_key) Re-opens DB average every n files");
|
||||
"(-stress_cache_key) Simulate DB re-open average every n files");
|
||||
DEFINE_uint32(sck_restarts_per_day, 24,
|
||||
"(-stress_cache_key) Average simulated process restarts per day "
|
||||
"(across DBs)");
|
||||
DEFINE_uint32(
|
||||
sck_restarts_per_day, 24,
|
||||
"(-stress_cache_key) Simulated process restarts per day (across DBs)");
|
||||
DEFINE_uint32(sck_db_count, 100,
|
||||
"(-stress_cache_key) Parallel DBs in operation");
|
||||
DEFINE_uint32(sck_table_bits, 20,
|
||||
"(-stress_cache_key) Log2 number of tracked files");
|
||||
DEFINE_uint32(sck_keep_bits, 50,
|
||||
"(-stress_cache_key) Number of cache key bits to keep");
|
||||
sck_db_count, 100,
|
||||
"(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
|
||||
DEFINE_uint32(
|
||||
sck_table_bits, 20,
|
||||
"(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
|
||||
// sck_keep_bits being well below full 128 bits amplifies the collision
|
||||
// probability so that the true probability can be estimated through observed
|
||||
// collisions. (More explanation below.)
|
||||
DEFINE_uint32(
|
||||
sck_keep_bits, 50,
|
||||
"(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
|
||||
// sck_randomize is used to validate whether cache key is performing "better
|
||||
// than random." Even with this setting, file offsets are not randomized.
|
||||
DEFINE_bool(sck_randomize, false,
|
||||
"(-stress_cache_key) Randomize (hash) cache key");
|
||||
// See https://github.com/facebook/rocksdb/pull/9058
|
||||
DEFINE_bool(sck_footer_unique_id, false,
|
||||
"(-stress_cache_key) Simulate using proposed footer unique id");
|
||||
// ## END stress_cache_key sub-tool options ##
|
||||
@ -583,20 +604,97 @@ class CacheBench {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: better description (see PR #9126 for some info)
|
||||
// cache_bench -stress_cache_key is an independent embedded tool for
|
||||
// estimating the probability of CacheKey collisions through simulation.
|
||||
// At a high level, it simulates generating SST files over many months,
|
||||
// keeping them in the DB and/or cache for some lifetime while staying
|
||||
// under resource caps, and checking for any cache key collisions that
|
||||
// arise among the set of live files. For efficient simulation, we make
|
||||
// some simplifying "pessimistic" assumptions (that only increase the
|
||||
// chance of the simulation reporting a collision relative to the chance
|
||||
// of collision in practice):
|
||||
// * Every generated file has a cache entry for every byte offset in the
|
||||
// file (contiguous range of cache keys)
|
||||
// * All of every file is cached for its entire lifetime. (Here "lifetime"
|
||||
// is technically the union of DB and Cache lifetime, though we only
|
||||
// model a generous DB lifetime, where space usage is always maximized.
|
||||
// In an effective Cache, lifetime in cache can only substantially exceed
|
||||
// lifetime in DB if there is little cache activity; cache activity is
|
||||
// required to hit cache key collisions.)
|
||||
//
|
||||
// It would be possible to track an exact set of cache key ranges for the
|
||||
// set of live files, but we would have no hope of observing collisions
|
||||
// (overlap in live files) in our simulation. We need to employ some way
|
||||
// of amplifying collision probability that allows us to predict the real
|
||||
// collision probability by extrapolation from observed collisions. Our
|
||||
// basic approach is to reduce each cache key range down to some smaller
|
||||
// number of bits, limited to bits that are shared over the whole
|
||||
// range. Now we can observe collisions using a set of smaller stripped-down
|
||||
// (reduced) cache keys. Let's do some case analysis to understand why this
|
||||
// works:
|
||||
// * No collision in reduced key - because the reduction is a pure function
|
||||
// this implies no collision in the full keys
|
||||
// * Collision detected between two reduced keys - either
|
||||
// * The reduction has dropped some structured uniqueness info (from one of
|
||||
// session counter or file number; file offsets are never materialized here).
|
||||
// This can only artificially inflate the observed and extrapolated collision
|
||||
// probabilities. We only have to worry about this in designing the reduction.
|
||||
// * The reduction has preserved all the structured uniqueness in the cache
|
||||
// key, which means either
|
||||
// * REJECTED: We have a uniqueness bug in generating cache keys, where
|
||||
// structured uniqueness info should have been different but isn't. In such a
|
||||
// case, increasing by 1 the number of bits kept after reduction would not
|
||||
// reduce observed probabilities by half. (In our observations, the
|
||||
// probabilities are reduced approximately by half.)
|
||||
// * ACCEPTED: The lost unstructured uniqueness in the key determines the
|
||||
// probability that an observed collision would imply an overlap in ranges.
|
||||
// In short, dropping n bits from key would increase collision probability by
|
||||
// 2**n, assuming those n bits have full entropy in unstructured uniqueness.
|
||||
//
|
||||
// But we also have to account for the key ranges based on file size. If file
|
||||
// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
|
||||
// "ranges", we know from other simulations (see
|
||||
// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to
|
||||
// (less than 2x higher collision probability) using a cache key of size
|
||||
// 128 - b bits for the whole file. (This is the only place we make an
|
||||
// "optimistic" assumption, which is more than offset by the real
|
||||
// implementation stripping off 2 lower bits from block byte offsets for cache
|
||||
// keys. The simulation assumes byte offsets, which is net pessimistic.)
|
||||
//
|
||||
// So to accept the extrapolation as valid, we need to be confident that all
|
||||
// "lost" bits, excluding those covered by file offset, are full entropy.
|
||||
// Recall that we have assumed (verifiably, safely) that other structured data
|
||||
// (file number and session counter) are kept, not lost. Based on the
|
||||
// implementation comments for OffsetableCacheKey, the only potential hole here
|
||||
// is that we only have ~103 bits of entropy in "all new" session IDs, and in
|
||||
// extreme cases, there might be only 1 DB ID. However, because the upper ~39
|
||||
// bits of session ID are hashed, the combination of file number and file
|
||||
// offset only has to add to 25 bits (or more) to ensure full entropy in
|
||||
// unstructured uniqueness lost in the reduction. Typical file size of 32MB
|
||||
// suffices (at least for simulation purposes where we assume each file offset
|
||||
// occupies a cache key).
|
||||
//
|
||||
// Example results in comments on OffsetableCacheKey.
|
||||
class StressCacheKey {
|
||||
public:
|
||||
void Run() {
|
||||
if (FLAGS_sck_footer_unique_id) {
|
||||
// Proposed footer unique IDs are DB-independent and session-independent
|
||||
// (but process-dependent) which is most easily simulated here by
|
||||
// assuming 1 DB and (later below) no session resets without process
|
||||
// reset.
|
||||
FLAGS_sck_db_count = 1;
|
||||
}
|
||||
|
||||
// Describe the simulated workload
|
||||
uint64_t mb_per_day =
|
||||
uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
|
||||
printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n",
|
||||
FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
|
||||
std::pow(2.0, FLAGS_sck_table_bits),
|
||||
mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
|
||||
// For extrapolating probability of any collisions from a number of
|
||||
// observed collisions
|
||||
multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
|
||||
(FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
|
||||
printf(
|
||||
@ -606,6 +704,9 @@ class StressCacheKey {
|
||||
restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;
|
||||
double without_ejection =
|
||||
std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
|
||||
// This should be a lower bound for -sck_randomize, usually a terribly
|
||||
// rough lower bound.
|
||||
// If observation is worse than this, then something has gone wrong.
|
||||
printf(
|
||||
"Without ejection, expect random collision after %g days (%g "
|
||||
"corrected)\n",
|
||||
@ -613,30 +714,36 @@ class StressCacheKey {
|
||||
double with_full_table =
|
||||
std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
|
||||
FLAGS_sck_files_per_day;
|
||||
// This is an alternate lower bound for -sck_randomize, usually pretty
|
||||
// accurate. Our cache keys should usually perform "better than random"
|
||||
// but always no worse. (If observation is substantially worse than this,
|
||||
// then something has gone wrong.)
|
||||
printf(
|
||||
"With ejection and full table, expect random collision after %g "
|
||||
"days (%g corrected)\n",
|
||||
with_full_table, with_full_table * multiplier_);
|
||||
collisions_ = 0;
|
||||
|
||||
// Run until sufficient number of observed collisions.
|
||||
for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
|
||||
RunOnce();
|
||||
if (collisions_ == 0) {
|
||||
printf(
|
||||
"No collisions after %d x %u days "
|
||||
" \n",
|
||||
i, FLAGS_sck_duration);
|
||||
i, FLAGS_sck_days_per_run);
|
||||
} else {
|
||||
double est = 1.0 * i * FLAGS_sck_duration / collisions_;
|
||||
double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
|
||||
printf("%" PRIu64
|
||||
" collisions after %d x %u days, est %g days between (%g "
|
||||
"corrected) \n",
|
||||
collisions_, i, FLAGS_sck_duration, est, est * multiplier_);
|
||||
collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RunOnce() {
|
||||
// Re-initialize simulated state
|
||||
const size_t db_count = FLAGS_sck_db_count;
|
||||
dbs_.reset(new TableProperties[db_count]{});
|
||||
const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
|
||||
@ -644,7 +751,11 @@ class StressCacheKey {
|
||||
if (FLAGS_sck_keep_bits > 64) {
|
||||
FLAGS_sck_keep_bits = 64;
|
||||
}
|
||||
|
||||
// Details of which bits are dropped in reduction
|
||||
uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
|
||||
// Shift away fewer potential file number bits (b) than potential
|
||||
// session counter bits (a).
|
||||
uint32_t shift_away_b = shift_away / 3;
|
||||
uint32_t shift_away_a = shift_away - shift_away_b;
|
||||
|
||||
@ -655,62 +766,78 @@ class StressCacheKey {
|
||||
Random64 r{std::random_device{}()};
|
||||
|
||||
uint64_t max_file_count =
|
||||
uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_duration;
|
||||
uint64_t file_count = 0;
|
||||
uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
|
||||
uint64_t file_size = FLAGS_sck_file_size_mb * uint64_t{1024} * 1024U;
|
||||
uint32_t report_count = 0;
|
||||
uint32_t collisions_this_run = 0;
|
||||
// Round robin through DBs
|
||||
for (size_t db_i = 0;; ++db_i) {
|
||||
size_t db_i = 0;
|
||||
|
||||
for (uint64_t file_count = 1; file_count <= max_file_count;
|
||||
++file_count, ++db_i) {
|
||||
// Round-robin through DBs (this is faster than %)
|
||||
if (db_i >= db_count) {
|
||||
db_i = 0;
|
||||
}
|
||||
if (file_count >= max_file_count) {
|
||||
break;
|
||||
}
|
||||
// Any other periodic actions before simulating next file
|
||||
if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
|
||||
ResetSession(db_i);
|
||||
} else if (r.OneIn(restart_nfiles_)) {
|
||||
ResetProcess();
|
||||
}
|
||||
// Simulate next file
|
||||
OffsetableCacheKey ock;
|
||||
dbs_[db_i].orig_file_number += 1;
|
||||
// skip some file numbers, unless 1 DB so that that can simulate
|
||||
// better (DB-independent) unique IDs
|
||||
if (db_count > 1) {
|
||||
// skip some file numbers for other file kinds, except in footer unique
|
||||
// ID, orig_file_number here tracks process-wide generated SST file
|
||||
// count.
|
||||
if (!FLAGS_sck_footer_unique_id) {
|
||||
dbs_[db_i].orig_file_number += (r.Next() & 3);
|
||||
}
|
||||
BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], "", 42, 42, &ock);
|
||||
bool is_stable;
|
||||
BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
|
||||
/* ignored */ 42, file_size, &ock,
|
||||
&is_stable);
|
||||
assert(is_stable);
|
||||
// Get a representative cache key, which later we analytically generalize
|
||||
// to a range.
|
||||
CacheKey ck = ock.WithOffset(0);
|
||||
uint64_t stripped;
|
||||
uint64_t reduced_key;
|
||||
if (FLAGS_sck_randomize) {
|
||||
stripped = GetSliceHash64(ck.AsSlice()) >> shift_away;
|
||||
reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
|
||||
} else if (FLAGS_sck_footer_unique_id) {
|
||||
// Special case: keep only file number, not session counter
|
||||
uint32_t a = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_a;
|
||||
uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
|
||||
stripped = (uint64_t{a} << 32) + b;
|
||||
reduced_key = (uint64_t{a} << 32) + b;
|
||||
} else {
|
||||
// Try to keep file number and session counter (shift away other bits)
|
||||
uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
|
||||
uint32_t b = DecodeFixed32(ck.AsSlice().data() + 12) >> shift_away_b;
|
||||
stripped = (uint64_t{a} << 32) + b;
|
||||
reduced_key = (uint64_t{a} << 32) + b;
|
||||
}
|
||||
if (stripped == 0) {
|
||||
// Unlikely, but we need to exclude tracking this value
|
||||
if (reduced_key == 0) {
|
||||
// Unlikely, but we need to exclude tracking this value because we
|
||||
// use it to mean "empty" in table. This case is OK as long as we
|
||||
// don't hit it often.
|
||||
printf("Hit Zero! \n");
|
||||
file_count--;
|
||||
continue;
|
||||
}
|
||||
file_count++;
|
||||
uint64_t h = NPHash64(reinterpret_cast<char*>(&stripped), 8);
|
||||
// Skew lifetimes
|
||||
uint64_t h =
|
||||
NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
|
||||
// Skew expected lifetimes, for high (super-Poisson) variance
|
||||
// in actual lifetimes.
|
||||
size_t pos =
|
||||
std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
|
||||
if (table_[pos] == stripped) {
|
||||
if (table_[pos] == reduced_key) {
|
||||
collisions_this_run++;
|
||||
// To predict probability of no collisions, we have to get rid of
|
||||
// correlated collisions, which this takes care of:
|
||||
// Our goal is to predict probability of no collisions, not expected
|
||||
// number of collisions. To make the distinction, we have to get rid
|
||||
// of observing correlated collisions, which this takes care of:
|
||||
ResetProcess();
|
||||
} else {
|
||||
// Replace
|
||||
table_[pos] = stripped;
|
||||
// Replace (end of lifetime for file that was in this slot)
|
||||
table_[pos] = reduced_key;
|
||||
}
|
||||
|
||||
if (++report_count == FLAGS_sck_files_per_day) {
|
||||
@ -748,6 +875,8 @@ class StressCacheKey {
|
||||
ResetSession(i);
|
||||
}
|
||||
if (FLAGS_sck_footer_unique_id) {
|
||||
// For footer unique ID, this tracks process-wide generated SST file
|
||||
// count.
|
||||
dbs_[0].orig_file_number = 0;
|
||||
}
|
||||
}
|
||||
|
91
cache/cache_key.cc
vendored
91
cache/cache_key.cc
vendored
@ -35,7 +35,8 @@ CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
|
||||
CacheKey CacheKey::CreateUniqueForProcessLifetime() {
|
||||
// To avoid colliding with CreateUniqueForCacheLifetime, assuming
|
||||
// Cache::NewId counts up from zero, here we count down from UINT64_MAX.
|
||||
// If this ever becomes a point of contention, we could use CoreLocalArray.
|
||||
// If this ever becomes a point of contention, we could sub-divide the
|
||||
// space and use CoreLocalArray.
|
||||
static std::atomic<uint64_t> counter{UINT64_MAX};
|
||||
uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
|
||||
// Ensure we don't collide with CreateUniqueForCacheLifetime
|
||||
@ -118,9 +119,10 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
|
||||
// "structured" uniqueness hasn't been cloned. Using a static
|
||||
// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
|
||||
// "all new" session id when a new process uses RocksDB. (Between processes,
|
||||
// we don't know if a DB or other persistent storage has been cloned.) Within
|
||||
// a process, only the session_lower of the db_session_id changes
|
||||
// incrementally ("structured" uniqueness).
|
||||
// we don't know if a DB or other persistent storage has been cloned. We
|
||||
// assume that if VM hot cloning is used, subsequently generated SST files
|
||||
// do not interact.) Within a process, only the session_lower of the
|
||||
// db_session_id changes incrementally ("structured" uniqueness).
|
||||
//
|
||||
// This basically means that our offsets, counters and file numbers allow us
|
||||
// to do somewhat "better than random" (birthday paradox) while in the
|
||||
@ -168,12 +170,83 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
|
||||
// data from the last 180 days is in cache, but NOT the other assumption
|
||||
// for the 1 in a trillion estimate above).
|
||||
//
|
||||
// Conclusion: Burning through session IDs, particularly "all new" IDs that
|
||||
// only arise when a new process is started, is the only way to have a
|
||||
// plausible chance of cache key collision. When processes live for hours
|
||||
// or days, the chance of a cache key collision seems more plausibly due
|
||||
// to bad hardware than to bad luck in random session ID data.
|
||||
//
|
||||
// Collision probability estimation through simulation:
|
||||
// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
|
||||
// activity over many months, by making some pessimistic simplifying
|
||||
// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
|
||||
// Here is some sample output with
|
||||
// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
|
||||
//
|
||||
// Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
|
||||
// Multiply by 9.22337e+18 to correct for simulation losses (but still
|
||||
// assume whole file cached)
|
||||
//
|
||||
// These come from default settings of 2.5M files per day of 32 MB each, and
|
||||
// `-sck_keep_bits=40` means that to represent a single file, we are only
|
||||
// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
|
||||
// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
|
||||
// about 9 billion billion times more prone to collision than reality.
|
||||
//
|
||||
// More default assumptions, relatively pessimistic:
|
||||
// * 100 DBs in same process (doesn't matter much)
|
||||
// * Re-open DB in same process (new session ID related to old session ID) on
|
||||
// average every 100 files generated
|
||||
// * Restart process (all new session IDs unrelated to old) 24 times per day
|
||||
//
|
||||
// After enough data, we get a result at the end (-sck_keep_bits=40):
|
||||
//
|
||||
// (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between
|
||||
// (9.76592e+19 corrected)
|
||||
//
|
||||
// If we believe the (pessimistic) simulation and the mathematical
|
||||
// extrapolation, we would need to run a billion machines all for 97 billion
|
||||
// days to expect a cache key collision. To help verify that our extrapolation
|
||||
// ("corrected") is robust, we can make our simulation more precise with
|
||||
// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
|
||||
// collision data:
|
||||
//
|
||||
// (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between
|
||||
// (1.03763e+20 corrected)
|
||||
// (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between
|
||||
// (1.09224e+20 corrected)
|
||||
//
|
||||
// The extrapolated prediction is very close. If anything, we might have some
|
||||
// very small losses of structured data (see class StressCacheKey in
|
||||
// cache_bench_tool.cc) leading to more accurate & more attractive prediction
|
||||
// with more bits kept.
|
||||
//
|
||||
// With the `-sck_randomize` option, we can see that typical workloads like
|
||||
// above have lower collision probability than "random" cache keys (note:
|
||||
// offsets still non-randomized) by a modest amount (roughly 20x less collision
|
||||
// prone than random), which should make us reasonably comfortable even in
|
||||
// "degenerate" cases (e.g. repeatedly launch a process to generate 1 file
|
||||
// with SstFileWriter):
|
||||
//
|
||||
// (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days between
|
||||
// (4.21372e+18 corrected)
|
||||
//
|
||||
// We can see that with more frequent process restarts (all new session IDs),
|
||||
// we get closer to the "random" cache key performance:
|
||||
//
|
||||
// (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
|
||||
// (5.92931e+18 corrected)
|
||||
//
|
||||
// Other tests have been run to validate other conditions behave as expected,
|
||||
// never behaving "worse than random" unless we start chopping off structured
|
||||
// data.
|
||||
//
|
||||
//
|
||||
// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
|
||||
// that only arise when a new process is started, the chance of any cache key
|
||||
// collisions in a giant fleet of machines is negligible. Especially when
|
||||
// processes live for hours or days, the chance of a cache key collision is
|
||||
// more plausibly due to bad hardware than to bad luck in random
|
||||
// session ID data. Software defects are surely more likely to cause corruption
|
||||
// than both of those.
|
||||
//
|
||||
// TODO: Nevertheless / regardless, an efficient way to detect (and thus
|
||||
// quantify) block cache corruptions, including collisions, should be added.
|
||||
OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
|
||||
const std::string &db_session_id,
|
||||
uint64_t file_number,
|
||||
|
@ -9,17 +9,26 @@
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "cache/cache_entry_roles.h"
|
||||
#include "cache/cache_key.h"
|
||||
#include "cache/lru_cache.h"
|
||||
#include "db/column_family.h"
|
||||
#include "db/db_impl/db_impl.h"
|
||||
#include "db/db_test_util.h"
|
||||
#include "env/unique_id_gen.h"
|
||||
#include "port/stack_trace.h"
|
||||
#include "rocksdb/persistent_cache.h"
|
||||
#include "rocksdb/statistics.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/block_based/block_based_table_reader.h"
|
||||
#include "table/unique_id_impl.h"
|
||||
#include "util/compression.h"
|
||||
#include "util/defer.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/math.h"
|
||||
#include "util/random.h"
|
||||
#include "utilities/fault_injection_fs.h"
|
||||
|
||||
@ -1714,6 +1723,238 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
|
||||
class CacheKeyTest : public testing::Test {
|
||||
public:
|
||||
void SetupStableBase() {
|
||||
// Like SemiStructuredUniqueIdGen::GenerateNext
|
||||
tp_.db_session_id = EncodeSessionId(base_session_upper_,
|
||||
base_session_lower_ ^ session_counter_);
|
||||
tp_.db_id = ToString(db_id_);
|
||||
tp_.orig_file_number = file_number_;
|
||||
bool is_stable;
|
||||
std::string cur_session_id = ""; // ignored
|
||||
uint64_t cur_file_number = 42; // ignored
|
||||
BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
|
||||
file_size_, &base_cache_key_,
|
||||
&is_stable);
|
||||
ASSERT_TRUE(is_stable);
|
||||
}
|
||||
CacheKey WithOffset(uint64_t offset) {
|
||||
return BlockBasedTable::GetCacheKey(base_cache_key_,
|
||||
BlockHandle(offset, /*size*/ 5));
|
||||
}
|
||||
|
||||
protected:
|
||||
OffsetableCacheKey base_cache_key_;
|
||||
TableProperties tp_;
|
||||
uint64_t file_size_ = 0;
|
||||
uint64_t base_session_upper_ = 0;
|
||||
uint64_t base_session_lower_ = 0;
|
||||
uint64_t session_counter_ = 0;
|
||||
uint64_t file_number_ = 0;
|
||||
uint64_t db_id_ = 0;
|
||||
};
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
int CountBitsDifferent(const T& t1, const T& t2) {
|
||||
int diff = 0;
|
||||
const uint8_t* p1 = reinterpret_cast<const uint8_t*>(&t1);
|
||||
const uint8_t* p2 = reinterpret_cast<const uint8_t*>(&t2);
|
||||
static_assert(sizeof(*p1) == 1, "Expecting uint8_t byte");
|
||||
for (size_t i = 0; i < sizeof(T); ++i) {
|
||||
diff += BitsSetToOne(p1[i] ^ p2[i]);
|
||||
}
|
||||
return diff;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
  // Other tests synthesize their own session IDs for simulation purposes.
  // This test checks that the DBImpl implementation appears to agree with
  // that construction: two IDs generated back-to-back should differ only in
  // the lowest XORed-in bits, which act as the "session counter."
  const std::string session_id1 =
      DBImpl::GenerateDbSessionId(/*env*/ nullptr);
  const std::string session_id2 =
      DBImpl::GenerateDbSessionId(/*env*/ nullptr);

  uint64_t upper1 = 0;
  uint64_t lower1 = 0;
  ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
  uint64_t upper2 = 0;
  uint64_t lower2 = 0;
  ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));

  // Because generated in same process
  ASSERT_EQ(upper1, upper2);
  // Unless we generate > 4 billion session IDs in this process...
  ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
  // But they must be different somewhere
  ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
}
|
||||
|
||||
TEST_F(CacheKeyTest, StandardEncodingLimit) {
  // Fixed, arbitrary identity for the simulated file; only the file size
  // varies below.
  base_session_upper_ = 1234;
  base_session_lower_ = 5678;
  session_counter_ = 42;
  file_number_ = 42;
  db_id_ = 1234;

  // Baseline: a small file
  file_size_ = 42;
  SetupStableBase();
  CacheKey ck1;
  ASSERT_TRUE(ck1.IsEmpty());
  ck1 = WithOffset(0);
  ASSERT_FALSE(ck1.IsEmpty());

  // Should use same encoding
  file_size_ = BlockBasedTable::kMaxFileSizeStandardEncoding;
  SetupStableBase();
  const CacheKey ck2 = WithOffset(0);
  ASSERT_EQ(CountBitsDifferent(ck1, ck2), 0);

  // Should use different encoding
  file_size_ += 1;
  SetupStableBase();
  const CacheKey ck3 = WithOffset(0);
  ASSERT_GT(CountBitsDifferent(ck2, ck3), 0);
}
|
||||
|
||||
TEST_F(CacheKeyTest, Encodings) {
  // Claim from cache_key.cc:
  // In fact, if our SST files are all < 4TB (see
  // BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
  // in a single process are guaranteed to have unique cache keys, unless/until
  // number session ids * max file number = 2**86, e.g. 1 trillion DB::Open in
  // a single process and 64 trillion files generated.

  // We can generalize that. For
  // * z bits in maximum file size
  // * n bits in maximum file number
  // * s bits in maximum session counter
  // uniqueness is guaranteed at least when all of these hold:
  // * z + n + s <= 121 (128 - 2 meta + 2 offset trim - (8-1) byte granularity
  //   in encoding)
  // * n + s <= 86 (encoding limitation)
  // * s <= 62 (because of 2-bit metadata)

  // We can verify this indirectly by how input bits get into the cache key,
  // but we have to be mindful that for sufficiently large file sizes,
  // different encodings might be used. So for cases mixing large and small
  // files, we also have to verify uniqueness between encodings.

  // Going through all combinations would be a little expensive, so we test
  // only one random "stripe" of the configuration space per run.
  constexpr uint32_t kStripeBits = 8;
  constexpr uint32_t kStripeMask = (uint32_t{1} << kStripeBits) - 1;

  // Also cycle through stripes on repeated runs (not thread safe)
  static uint32_t stripe =
      static_cast<uint32_t>(std::random_device{}()) & kStripeMask;
  stripe = (stripe + 1) & kStripeMask;

  // Print the chosen stripe so that a failing run can be identified
  fprintf(stderr, "%u\n", stripe);

  // We are going to randomly initialize some values which *should* not affect
  // result
  Random64 r{std::random_device{}()};

  int max_num_encodings = 0;
  uint32_t config_num = 0;
  // Current configuration; set by the loops below and read (by reference)
  // inside TestConfig.
  uint32_t session_counter_bits = 0;
  uint32_t file_number_bits = 0;
  uint32_t max_file_size_bits = 0;

  // Inner loop body, used later in a loop over configurations.
  // NOTE: a fatal assertion failure in here only returns from the lambda (or
  // from SetupStableBase()), so every call to a sub-routine that may assert
  // is wrapped in ASSERT_NO_FATAL_FAILURE to actually stop the test.
  auto TestConfig = [&]() {
    base_session_upper_ = r.Next();
    base_session_lower_ = r.Next();
    session_counter_ = r.Next();
    if (session_counter_bits < 64) {
      // Avoid shifting UB
      session_counter_ = session_counter_ >> 1 >> (63 - session_counter_bits);
    }
    file_number_ = r.Next() >> (64 - file_number_bits);
    // Need two bits set to avoid temporary zero below
    if (BitsSetToOne(file_number_) < 2) {
      file_number_ = 3;
    }
    db_id_ = r.Next();

    // Work-around clang-analyzer which thinks empty last_base is garbage
    CacheKey last_base = CacheKey::CreateUniqueForProcessLifetime();

    // Every distinct cache key produced for this configuration
    std::unordered_set<std::string> seen;
    int num_encodings = 0;

    // Loop over encodings by increasing file size bits
    for (uint32_t file_size_bits = 1; file_size_bits <= max_file_size_bits;
         ++file_size_bits) {
      file_size_ = uint64_t{1} << (file_size_bits - 1);
      ASSERT_NO_FATAL_FAILURE(SetupStableBase());
      CacheKey new_base = WithOffset(0);
      if (CountBitsDifferent(last_base, new_base) == 0) {
        // Same as previous encoding
        continue;
      }

      // New encoding
      ++num_encodings;
      ASSERT_TRUE(seen.insert(new_base.AsSlice().ToString()).second);
      last_base = new_base;

      // Flip each offset bit that fits in the current file size
      for (uint32_t i = 0; i < file_size_bits; ++i) {
        CacheKey ck = WithOffset(uint64_t{1} << i);
        if (i < 2) {
          // These cases are not relevant and optimized by dropping two
          // lowest bits because there's always at least 5 bytes between
          // blocks.
          ASSERT_EQ(CountBitsDifferent(ck, new_base), 0);
        } else {
          // Normal case
          // 1 bit different from base and never been seen implies the bit
          // is encoded into cache key without overlapping other structured
          // data.
          ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
          ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
        }
      }

      // Flip each session counter bit (restored at end of each iteration)
      for (uint32_t i = 0; i < session_counter_bits; ++i) {
        SaveAndRestore<uint64_t> tmp(&session_counter_,
                                     session_counter_ ^ (uint64_t{1} << i));
        ASSERT_NO_FATAL_FAILURE(SetupStableBase());
        CacheKey ck = WithOffset(0);
        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
      }

      // Flip each file number bit (restored at end of each iteration)
      for (uint32_t i = 0; i < file_number_bits; ++i) {
        SaveAndRestore<uint64_t> tmp(&file_number_,
                                     file_number_ ^ (uint64_t{1} << i));
        ASSERT_NO_FATAL_FAILURE(SetupStableBase());
        CacheKey ck = WithOffset(0);
        ASSERT_EQ(CountBitsDifferent(ck, new_base), 1);
        ASSERT_TRUE(seen.insert(ck.AsSlice().ToString()).second);
      }
    }
    max_num_encodings = std::max(max_num_encodings, num_encodings);
  };

  // Loop over configurations and test those in stripe
  for (session_counter_bits = 0; session_counter_bits <= 62;
       ++session_counter_bits) {
    uint32_t max_file_number_bits =
        std::min(uint32_t{64}, uint32_t{86} - session_counter_bits);
    // Start with 2 to avoid file_number_ == 0 in testing
    for (file_number_bits = 2; file_number_bits <= max_file_number_bits;
         ++file_number_bits) {
      uint32_t max_max_file_size_bits =
          std::min(uint32_t{64},
                   uint32_t{121} - file_number_bits - session_counter_bits);
      for (max_file_size_bits = 1; max_file_size_bits <= max_max_file_size_bits;
           ++max_file_size_bits) {
        if ((config_num++ & kStripeMask) == stripe) {
          // Stop at the first failing configuration rather than repeating
          // the failure for every remaining configuration in the stripe.
          ASSERT_NO_FATAL_FAILURE(TestConfig());
        }
      }
    }
  }

  // Make sure the current implementation is exercised
  ASSERT_EQ(max_num_encodings, 4);
}
|
||||
|
||||
// Instantiate DBBlockCacheKeyTest once for each combination of its two
// boolean test parameters.
INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
|
||||
::testing::Combine(::testing::Bool(),
|
||||
::testing::Bool()));
|
||||
|
Loading…
x
Reference in New Issue
Block a user