// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "cache/cache_key.h"

#include <algorithm>
#include <atomic>

#include "rocksdb/cache.h"
#include "table/unique_id_impl.h"
#include "util/hash.h"
#include "util/math.h"

namespace ROCKSDB_NAMESPACE {

// Value space plan for CacheKey:
//
// session_etc64_ | offset_etc64_ | Only generated by
// ---------------+---------------+------------------------------------------
//              0 |             0 | Reserved for "empty" CacheKey()
//              0 |  > 0, < 1<<63 | CreateUniqueForCacheLifetime
//              0 |      >= 1<<63 | CreateUniqueForProcessLifetime
//            > 0 |           any | OffsetableCacheKey.WithOffset

CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
  // +1 so that we can reserve all zeros for "unset" cache key
  uint64_t id = cache->NewId() + 1;
  // Ensure we don't collide with CreateUniqueForProcessLifetime
  assert((id >> 63) == 0U);
  return CacheKey(0, id);
}

CacheKey CacheKey::CreateUniqueForProcessLifetime() {
  // To avoid colliding with CreateUniqueForCacheLifetime, assuming
  // Cache::NewId counts up from zero, here we count down from UINT64_MAX.
  // If this ever becomes a point of contention, we could sub-divide the
  // space and use CoreLocalArray.
  static std::atomic<uint64_t> counter{UINT64_MAX};
  uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
  // Ensure we don't collide with CreateUniqueForCacheLifetime
  assert((id >> 63) == 1U);
  return CacheKey(0, id);
}
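// For a standalone illustration of the value space partitioning above (a
// hedged sketch, not part of RocksDB; the counters here merely stand in for
// Cache::NewId and the static counter above), two id streams converging from
// opposite ends of the uint64_t space remain disjoint as long as neither
// crosses the 1<<63 midpoint, which is what the top-bit asserts check:
//
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//     uint64_t up = 0;             // stand-in for Cache::NewId()
//     uint64_t down = UINT64_MAX;  // stand-in for the static counter
//     for (int i = 0; i < 1000000; ++i) {
//       uint64_t cache_lifetime_id = ++up;      // NewId() + 1, counting up
//       uint64_t process_lifetime_id = down--;  // counting down
//       assert((cache_lifetime_id >> 63) == 0U);    // lower half of space
//       assert((process_lifetime_id >> 63) == 1U);  // upper half of space
//       assert(cache_lifetime_id != process_lifetime_id);
//     }
//     return 0;
//   }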
// Value plan for CacheKeys from OffsetableCacheKey, assuming that
// db_session_ids are generated from a base_session_id and
// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
// in DBImpl::GenerateDbSessionId):
//
// Conceptual inputs:
//   db_id                (unstructured, from GenerateRawUniqueId or equiv)
//                        * could be shared between cloned DBs but rare
//                        * could be constant, if session id suffices
//   base_session_id      (unstructured, from GenerateRawUniqueId)
//   session_id_counter   (structured)
//                        * usually much smaller than 2**24
//   file_number          (structured)
//                        * usually smaller than 2**24
//   offset_in_file       (structured, might skip lots of values)
//                        * usually smaller than 2**32
//   max_offset determines placement of file_number to prevent
//   overlapping with offset
//
// Outputs come from bitwise-xor of the constituent pieces, low bits on left:
//
// |------------------------- session_etc64 -------------------------|
// | +++++++++++++++ base_session_id (lower 64 bits) +++++++++++++++ |
// |-----------------------------------------------------------------|
// | session_id_counter ...|                                         |
// |-----------------------------------------------------------------|
// |                                           | ... file_number     |
// |                                           |    overflow & meta  |
// |-----------------------------------------------------------------|
//
//
// |------------------------- offset_etc64 --------------------------|
// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
// |  * base_session_id (upper ~39 bits)                             |
// |  * db_id (~122 bits entropy)                                    |
// |-----------------------------------------------------------------|
// | offset_in_file ............... |                                |
// |-----------------------------------------------------------------|
// |                                | file_number, 0-3               |
// |                                | lower bytes                    |
// |-----------------------------------------------------------------|
//
// Based on max_offset, a maximal number of bytes, 0..3, is chosen for
// inclusion from the lower bits of file_number in offset_etc64. The choice
// is encoded in two bits of metadata going into session_etc64, though
// the common case of 3 bytes is encoded as 0 so that session_etc64
// is unmodified by file_number concerns in the common case.
//
// There is nothing preventing "file number overflow & meta" from meeting
// and overlapping with session_id_counter, but reaching such a case requires
// an intractable combination of large file offsets (thus at least some large
// files), large file numbers (thus a large number of files generated), and
// a large number of session IDs generated in a single process. A trillion
// each (2**40) of session ids, offsets, and file numbers comes to 120 bits.
// With two bits of metadata and byte granularity, this is on the verge of
// overlap, but even in the overlap case, it doesn't seem likely that
// a file from billions of files or session ids ago will still be live
// or cached.
//
// In fact, if our SST files are all < 4TB (see
// BlockBasedTable::kMaxFileSizeStandardEncoding), then SST files generated
// in a single process are guaranteed to have unique cache keys, unless/until
// number of session ids * max file number = 2**86, e.g. 1 trillion DB::Open
// calls in a single process and 64 trillion files generated. Even at that
// point, to see a collision we would need a miraculous re-synchronization of
// session id and file number, along with a live file or stale cache entry
// from trillions of files ago.
//
// How https://github.com/pdillinger/unique_id applies here:
// Every bit of output always includes "unstructured" uniqueness bits and
// often combines with "structured" uniqueness bits. The "unstructured" bits
// change infrequently: only when we cannot guarantee our state tracking for
// "structured" uniqueness hasn't been cloned. Using a static
// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
// "all new" session id when a new process uses RocksDB. (Between processes,
// we don't know if a DB or other persistent storage has been cloned. We
// assume that if VM hot cloning is used, subsequently generated SST files
// do not interact.) Within a process, only the session_lower of the
// db_session_id changes incrementally ("structured" uniqueness).
//
// This basically means that our offsets, counters and file numbers allow us
// to do somewhat "better than random" (birthday paradox) while in the
// degenerate case of a completely new session for each tiny file, we still
// have strong uniqueness properties from the birthday paradox, with ~103
// bit session IDs or up to 128 bits entropy with different DB IDs sharing a
// cache.
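// To put a number on the birthday-paradox argument above, here is a hedged
// back-of-the-envelope sketch (standalone, not part of RocksDB): among n
// random b-bit keys, the chance of any collision is roughly n^2 / 2^(b+1)
// while that quantity is far below 1. The figures of 2**16 session ids and
// 71 distinguishing bits come from the analysis that follows below.
//
//   #include <cstdio>
//
//   int main() {
//     double n_bits = 16;  // log2 of "all new" session ids in 180 days
//     double b = 71;       // bits distinguishing session IDs (see below)
//     // log2 of collision probability: 2*log2(n) - (b + 1)
//     std::printf("P(collision) ~= 2**%.0f\n", 2 * n_bits - (b + 1));
//     return 0;
//   }
//
// This prints "P(collision) ~= 2**-40", i.e. roughly 1 in a trillion,
// matching the estimate below.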
// More collision probability analysis:
// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD)
// with average process/session lifetime of (pessimistically) 4 minutes.
// In 180 days (generous allowable data lifespan), we generate 31 million GB
// of data, or 2^55 bytes, and 2^16 "all new" session IDs.
//
// First, suppose this is in a single DB (lifetime 180 days):
//   128 bits cache key size
// -  55 <- ideal size for byte offsets + file numbers
// -   2 <- bits for offsets and file numbers not exactly powers of two
// -   2 <- bits for file number encoding metadata
// +   2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
// ----
//   71 <- bits remaining for distinguishing session IDs
// The probability of a collision in 71 bits of session ID data is less than
// 1 in 2**(71 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
// data from the last 180 days is in cache for potential collision, and that
// cache keys under each session id exhaustively cover the remaining 57 bits
// while in reality they'll only cover a small fraction of it.
//
// Although data could be transferred between hosts, each host has its own
// cache and we are already assuming a high rate of "all new" session ids.
// So this doesn't really change the collision calculation. Across a fleet
// of 1 million, each with <1 in a trillion collision possibility,
// fleetwide collision probability is <1 in a million.
//
// Now suppose we have many DBs per host, say 2**10, with the same host-wide
// write rate and process/session lifetime. File numbers will be ~10 bits
// smaller and we will have 2**10 times as many session IDs because of
// simultaneous lifetimes. So now the collision chance is less than
// 1 in 2**(81 - (2 * 26)), or roughly 1 in a billion.
//
// Suppose instead we generated random or hashed cache keys for each
// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys
// in 180 days. Collision probability is more easily estimated at roughly
// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all
// data from the last 180 days is in cache, but NOT the other assumption
// for the 1 in a trillion estimate above).
//
//
// Collision probability estimation through simulation:
// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
// activity over many months, by making some pessimistic simplifying
// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
// Here is some sample output with
// `./cache_bench -stress_cache_key -sck_keep_bits=40`:
//
//   Total cache or DBs size: 32TiB  Writing 925.926 MiB/s or 76.2939TiB/day
//   Multiply by 9.22337e+18 to correct for simulation losses (but still
//   assume whole file cached)
//
// These come from default settings of 2.5M files per day of 32 MB each, and
// `-sck_keep_bits=40` means that to represent a single file, we are only
// keeping 40 bits of the 128-bit (base) cache key. With file size of 2**25
// contiguous keys (pessimistic), our simulation is about 2**(128-40-25) or
// about 9 billion billion times more prone to collision than reality.
//
// More default assumptions, relatively pessimistic:
// * 100 DBs in same process (doesn't matter much)
// * Re-open DB in same process (new session ID related to old session ID) on
//   average every 100 files generated
// * Restart process (all new session IDs unrelated to old) 24 times per day
//
// After enough data, we get a result at the end (-sck_keep_bits=40):
//
//   (keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between
//   (9.76592e+19 corrected)
//
// If we believe the (pessimistic) simulation and the mathematical
// extrapolation, we would need to run a billion machines all for 97 billion
// days to expect a cache key collision. To help verify that our extrapolation
// ("corrected") is robust, we can make our simulation more precise with
// `-sck_keep_bits=41` and `42`, which takes more running time to get enough
// collision data:
//
//   (keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between
//   (1.03763e+20 corrected)
//   (keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between
//   (1.09224e+20 corrected)
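// As a sanity check on the correction factor quoted above, a hedged
// standalone sketch (not part of RocksDB) reproducing the arithmetic: with
// 128-bit base keys, -sck_keep_bits=40 kept, and 2**25 contiguous offsets
// per 32 MB file, the simulation under-counts key space by 2**(128-40-25):
//
//   #include <cmath>
//   #include <cstdio>
//
//   int main() {
//     double correction = std::pow(2.0, 128 - 40 - 25);  // 2**63
//     std::printf("correction factor = %g\n", correction);
//     return 0;
//   }
//
// This prints "correction factor = 9.22337e+18", matching the tool output
// quoted above.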
// The extrapolated prediction is very close. If anything, we might have some
// very small losses of structured data (see class StressCacheKey in
// cache_bench_tool.cc) leading to a more accurate & more attractive
// prediction with more bits kept.
//
// With the `-sck_randomize` option, we can see that typical workloads like
// above have lower collision probability than "random" cache keys (note:
// offsets still non-randomized) by a modest amount (roughly 20x less
// collision prone than random), which should make us reasonably comfortable
// even in "degenerate" cases (e.g. repeatedly launching a process to
// generate 1 file with SstFileWriter):
//
//   (rand 40 bits) 197 collisions after 1 x 90 days, est 0.456853 days between
//   (4.21372e+18 corrected)
//
// We can see that with more frequent process restarts (all new session IDs),
// we get closer to the "random" cache key performance:
//
//   (-sck_restarts_per_day=5000): 140 collisions after 1 x 90 days, ...
//   (5.92931e+18 corrected)
//
// Other tests have been run to validate that other conditions behave as
// expected, never behaving "worse than random" unless we start chopping off
// structured data.
//
//
// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
// that only arise when a new process is started, the chance of any cache key
// collisions in a giant fleet of machines is negligible. Especially when
// processes live for hours or days, the chance of a cache key collision is
// likely more plausibly due to bad hardware than to bad luck in random
// session ID data. Software defects are surely more likely to cause
// corruption than either of those.
//
// TODO: Nevertheless / regardless, an efficient way to detect (and thus
// quantify) block cache corruptions, including collisions, should be added.

OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
                                       const std::string &db_session_id,
                                       uint64_t file_number,
                                       uint64_t max_offset) {
#ifndef NDEBUG
  max_offset_ = max_offset;
#endif
  // Closely related to GetSstInternalUniqueId, but we only need 128 bits and
  // need to include an offset within the file.
  // See also https://github.com/pdillinger/unique_id for background.
  uint64_t session_upper = 0;  // Assignment to appease clang-analyze
  uint64_t session_lower = 0;  // Assignment to appease clang-analyze
  {
    Status s = DecodeSessionId(db_session_id, &session_upper, &session_lower);
    if (!s.ok()) {
      // A reasonable fallback in case the session id is malformed
      Hash2x64(db_session_id.data(), db_session_id.size(), &session_upper,
               &session_lower);
    }
  }

  // Hash the session upper (~39 bits entropy) and DB id (120+ bits entropy)
  // for more global uniqueness entropy.
  // (It is possible that many DBs descended from one common DB id are copied
  // around and proliferate, in which case session id is critical, but it is
  // more common for different DBs to have different DB ids.)
  uint64_t db_hash = Hash64(db_id.data(), db_id.size(), session_upper);

  // This establishes the db+session id part of the cache key.
  //
  // Exactly preserve (in common cases; see modifiers below) session lower to
  // ensure that session ids generated during the same process lifetime are
  // guaranteed unique.
  //
  // We put this first for CommonPrefixSlice(), so that a small-ish set of
  // cache key prefixes can cover the entries relevant to any DB.
  session_etc64_ = session_lower;
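  // Why exactly preserving session_lower matters, in a hedged standalone
  // sketch (not RocksDB code): values combined with a fixed base by xor
  // remain distinct, so distinct session_lower values (or, below, distinct
  // file_number/offset modifiers under one session) cannot collide with
  // each other, because (base ^ a) == (base ^ b) implies a == b:
  //
  //   #include <cassert>
  //   #include <cstdint>
  //
  //   int main() {
  //     uint64_t base = 0x9e3779b97f4a7c15ULL;  // arbitrary stand-in hash
  //     for (uint64_t a = 0; a < 256; ++a) {
  //       for (uint64_t b = a + 1; b < 256; ++b) {
  //         assert((base ^ a) != (base ^ b));  // xor with base is injective
  //       }
  //     }
  //     return 0;
  //   }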
  // This provides extra entropy in case of different DB id or process
  // generating a session id, but is also partly/variably obscured by
  // file_number and offset (see below).
  offset_etc64_ = db_hash;

  // Into offset_etc64_ we are (eventually) going to pack & xor in an offset
  // and a file_number, but we might need the file_number to overflow into
  // session_etc64_. (There must only be one session_etc64_ value per
  // file, and preferably shared among many files.)
  //
  // Figure out how many bytes of file_number we are going to be able to
  // pack in with max_offset, though our encoding will only support packing
  // in up to 3 bytes of file_number. (16M file numbers is enough for a new
  // file number every second for half a year.)
  int file_number_bytes_in_offset_etc =
      (63 - FloorLog2(max_offset | 0x100000000U)) / 8;
  int file_number_bits_in_offset_etc = file_number_bytes_in_offset_etc * 8;

  // Assert the byte count fits in two bits of metadata
  assert(file_number_bytes_in_offset_etc >= 0 &&
         file_number_bytes_in_offset_etc <= 3);
  // Assert we couldn't have used a larger allowed number of bytes (shift
  // would chop off bytes).
  assert(file_number_bytes_in_offset_etc == 3 ||
         (max_offset << (file_number_bits_in_offset_etc + 8) >>
          (file_number_bits_in_offset_etc + 8)) != max_offset);

  uint64_t mask = (uint64_t{1} << (file_number_bits_in_offset_etc)) - 1;
  // Pack into high bits of etc so that offset can go in low bits of etc
  // TODO: could be EndianSwapValue?
  uint64_t offset_etc_modifier = ReverseBits(file_number & mask);
  assert(offset_etc_modifier << file_number_bits_in_offset_etc == 0U);

  // Overflow and (3 - byte count) (likely both zero) go into session_id part
  uint64_t session_etc_modifier =
      (file_number >> file_number_bits_in_offset_etc << 2) |
      static_cast<uint64_t>(3 - file_number_bytes_in_offset_etc);
  // Packed into high bits to minimize interference with session id counter.
  session_etc_modifier = ReverseBits(session_etc_modifier);

  // Assert session_id part is only modified in extreme cases
  assert(session_etc_modifier == 0 || file_number > /*3 bytes*/ 0xffffffU ||
         max_offset > /*5 bytes*/ 0xffffffffffU);

  // Xor in the modifiers
  session_etc64_ ^= session_etc_modifier;
  offset_etc64_ ^= offset_etc_modifier;

  // Although DBImpl guarantees (in recent versions) that session_lower is not
  // zero, that's not entirely sufficient to guarantee that session_etc64_ is
  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*).
  if (session_etc64_ == 0U) {
    session_etc64_ = session_upper | 1U;
  }
  assert(session_etc64_ != 0);
}

}  // namespace ROCKSDB_NAMESPACE
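// A hedged standalone illustration (not part of RocksDB; MyFloorLog2 is a
// portable stand-in for FloorLog2 in util/math.h) of the byte-count
// computation in the constructor above, showing how max_offset selects how
// many low bytes of file_number fit into offset_etc64_:
//
//   #include <cstdint>
//   #include <cstdio>
//
//   static int MyFloorLog2(uint64_t v) {
//     int r = -1;
//     while (v) {
//       ++r;
//       v >>= 1;
//     }
//     return r;  // position of highest set bit
//   }
//
//   int main() {
//     const uint64_t samples[] = {uint64_t{1} << 31, uint64_t{1} << 39,
//                                 uint64_t{1} << 40, uint64_t{1} << 48,
//                                 uint64_t{1} << 56};
//     for (uint64_t max_offset : samples) {
//       int bytes = (63 - MyFloorLog2(max_offset | 0x100000000U)) / 8;
//       std::printf("max_offset=2**%d -> %d byte(s) of file_number\n",
//                   MyFloorLog2(max_offset), bytes);
//     }
//     return 0;
//   }
//
// This prints 3, 3, 2, 1, and 0 bytes respectively: offsets under 2**40
// leave room for the common 3 bytes (encoded as metadata 0), and the count
// shrinks by one byte for each additional 8 bits of max_offset.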