2021-08-24 12:42:31 -07:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <atomic>
|
|
|
|
#include <cstddef>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <memory>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "cache/cache_entry_roles.h"
|
|
|
|
#include "rocksdb/cache.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
2021-11-09 12:04:51 -08:00
|
|
|
template <CacheEntryRole R>
|
|
|
|
class CacheReservationHandle;
|
|
|
|
|
2021-08-24 12:42:31 -07:00
|
|
|
// CacheReservationManager is for reserving cache space for the memory used
|
|
|
|
// through inserting/releasing dummy entries in the cache.
|
2021-11-09 08:15:29 -08:00
|
|
|
//
|
|
|
|
// This class is NOT thread-safe, except that GetTotalReservedCacheSize()
|
|
|
|
// can be called without external synchronization.
|
2021-11-09 12:04:51 -08:00
|
|
|
class CacheReservationManager
|
|
|
|
: public std::enable_shared_from_this<CacheReservationManager> {
|
2021-08-24 12:42:31 -07:00
|
|
|
public:
|
|
|
|
// Construct a CacheReservationManager
|
|
|
|
// @param cache The cache where dummy entries are inserted and released for
|
|
|
|
// reserving cache space
|
|
|
|
// @param delayed_decrease If set true, then dummy entries won't be released
|
|
|
|
// immediately when memory usage decreases.
|
|
|
|
// Instead, it will be released when the memory usage
|
|
|
|
// decreases to 3/4 of what we have reserved so far.
|
|
|
|
// This is for saving some future dummy entry
|
|
|
|
// insertion when memory usage increases are likely to
|
|
|
|
// happen in the near future.
|
|
|
|
explicit CacheReservationManager(std::shared_ptr<Cache> cache,
|
|
|
|
bool delayed_decrease = false);
|
|
|
|
|
|
|
|
// no copy constructor, copy assignment, move constructor, move assignment
|
|
|
|
CacheReservationManager(const CacheReservationManager &) = delete;
|
|
|
|
CacheReservationManager &operator=(const CacheReservationManager &) = delete;
|
|
|
|
CacheReservationManager(CacheReservationManager &&) = delete;
|
|
|
|
CacheReservationManager &operator=(CacheReservationManager &&) = delete;
|
|
|
|
|
|
|
|
~CacheReservationManager();
|
|
|
|
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
|
2021-11-09 12:04:51 -08:00
|
|
|
// One of the two ways of reserving/releasing cache,
|
|
|
|
// see CacheReservationManager::MakeCacheReservation() for the other.
|
|
|
|
// Use ONLY one of them to prevent unexpected behavior.
|
|
|
|
//
|
2021-08-24 12:42:31 -07:00
|
|
|
// Insert and release dummy entries in the cache to
|
2021-11-09 08:15:29 -08:00
|
|
|
// match the size of total dummy entries with the least multiple of
|
|
|
|
// kSizeDummyEntry greater than or equal to new_mem_used
|
2021-08-24 12:42:31 -07:00
|
|
|
//
|
|
|
|
// Insert dummy entries if new_memory_used > cache_allocated_size_;
|
|
|
|
//
|
|
|
|
// Release dummy entries if new_memory_used < cache_allocated_size_
|
|
|
|
// (and new_memory_used < cache_allocated_size_ * 3/4
|
|
|
|
// when delayed_decrease is set true);
|
|
|
|
//
|
|
|
|
// Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_
|
|
|
|
// or (2) new_memory_used is in the interval of
|
|
|
|
// [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease
|
|
|
|
// is set true.
|
|
|
|
//
|
2021-09-09 15:24:15 -07:00
|
|
|
// @param new_memory_used The number of bytes used by new memory
|
2021-11-09 08:15:29 -08:00
|
|
|
// The most recent new_memoy_used passed in will be returned
|
|
|
|
// in GetTotalMemoryUsed() even when the call return non-ok status.
|
|
|
|
//
|
|
|
|
// Since the class is NOT thread-safe, external synchronization on the
|
|
|
|
// order of calling UpdateCacheReservation() is needed if you want
|
|
|
|
// GetTotalMemoryUsed() indeed returns the latest memory used.
|
|
|
|
//
|
2021-09-09 15:24:15 -07:00
|
|
|
// @return On inserting dummy entries, it returns Status::OK() if all dummy
|
2021-11-09 08:15:29 -08:00
|
|
|
// entry insertions succeed.
|
|
|
|
// Otherwise, it returns the first non-ok status;
|
|
|
|
// On releasing dummy entries, it always returns Status::OK().
|
|
|
|
// On keeping dummy entries the same, it always returns Status::OK().
|
2021-08-24 12:42:31 -07:00
|
|
|
Status UpdateCacheReservation(std::size_t new_memory_used);
|
2021-11-09 08:15:29 -08:00
|
|
|
|
2021-11-09 12:04:51 -08:00
|
|
|
// One of the two ways of reserving/releasing cache,
|
|
|
|
// see CacheReservationManager::UpdateCacheReservation() for the other.
|
|
|
|
// Use ONLY one of them to prevent unexpected behavior.
|
|
|
|
//
|
|
|
|
// Insert dummy entries in the cache for the incremental memory usage
|
|
|
|
// to match the size of total dummy entries with the least multiple of
|
|
|
|
// kSizeDummyEntry greater than or equal to the total memory used.
|
|
|
|
//
|
|
|
|
// A CacheReservationHandle is returned as an output parameter.
|
|
|
|
// The reserved dummy entries are automatically released on the destruction of
|
|
|
|
// this handle, which achieves better RAII per cache reservation.
|
|
|
|
//
|
|
|
|
// WARNING: Deallocate all the handles of the CacheReservationManager object
|
|
|
|
// before deallocating the object to prevent unexpected behavior.
|
|
|
|
//
|
|
|
|
// @param incremental_memory_used The number of bytes increased in memory
|
|
|
|
// usage.
|
|
|
|
//
|
|
|
|
// Calling GetTotalMemoryUsed() afterward will return the total memory
|
|
|
|
// increased by this number, even when calling MakeCacheReservation()
|
|
|
|
// returns non-ok status.
|
|
|
|
//
|
|
|
|
// Since the class is NOT thread-safe, external synchronization in
|
|
|
|
// calling MakeCacheReservation() is needed if you want
|
|
|
|
// GetTotalMemoryUsed() indeed returns the latest memory used.
|
|
|
|
//
|
|
|
|
// @param handle An pointer to std::unique_ptr<CacheReservationHandle<R>> that
|
|
|
|
// manages the lifetime of the handle and its cache reservation.
|
|
|
|
//
|
|
|
|
// @return It returns Status::OK() if all dummy
|
|
|
|
// entry insertions succeed.
|
|
|
|
// Otherwise, it returns the first non-ok status;
|
|
|
|
//
|
|
|
|
// REQUIRES: handle != nullptr
|
|
|
|
// REQUIRES: The CacheReservationManager object is NOT managed by
|
|
|
|
// std::unique_ptr as CacheReservationHandle needs to
|
|
|
|
// shares ownership to the CacheReservationManager object.
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
Status MakeCacheReservation(
|
|
|
|
std::size_t incremental_memory_used,
|
|
|
|
std::unique_ptr<CacheReservationHandle<R>> *handle);
|
|
|
|
|
2021-11-09 08:15:29 -08:00
|
|
|
// Returns the amount of cache space (always a multiple of kSizeDummyEntry)
// successfully reserved by calling UpdateCacheReservation().
//
// After UpdateCacheReservation() has returned a non-ok status, the number
// returned here might be slightly smaller than the cache space actually
// reserved: the returned number is always a multiple of kSizeDummyEntry,
// whereas the cache may become full in the middle of inserting a dummy entry.
//
// Per the class-level comment, this is the one member that can be called
// without external synchronization.
std::size_t GetTotalReservedCacheSize();
|
|
|
|
|
2021-11-09 08:15:29 -08:00
|
|
|
// Returns the latest total memory used, as indicated by the most recent call
// of UpdateCacheReservation(std::size_t new_memory_used).
//
// NOTE: the MakeCacheReservation() documentation states that each such call
// increases the total reported here by its incremental_memory_used.
std::size_t GetTotalMemoryUsed();
|
|
|
|
|
2021-11-01 14:42:11 -07:00
|
|
|
// Returns kSizeDummyEntry, the size of a single dummy entry. The reserved
// cache size is documented as always being a multiple of this value.
static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
|
|
|
|
|
Account Bloom/Ribbon filter construction memory in global memory limit (#9073)
Summary:
Note: This PR is the 4th part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073) and will rebase/merge only after the first three PRs (https://github.com/facebook/rocksdb/pull/9070, https://github.com/facebook/rocksdb/pull/9071, https://github.com/facebook/rocksdb/pull/9130) merge.
**Context:**
Similar to https://github.com/facebook/rocksdb/pull/8428, this PR is to track memory usage during (new) Bloom Filter (i.e,FastLocalBloom) and Ribbon Filter (i.e, Ribbon128) construction, moving toward the goal of [single global memory limit using block cache capacity](https://github.com/facebook/rocksdb/wiki/Projects-Being-Developed#improving-memory-efficiency). It also constrains the size of the banding portion of Ribbon Filter during construction by falling back to Bloom Filter if that banding is, at some point, larger than the available space in the cache under `LRUCacheOptions::strict_capacity_limit=true`.
The option to turn on this feature is `BlockBasedTableOptions::reserve_table_builder_memory = true` which by default is set to `false`. We [decided](https://github.com/facebook/rocksdb/pull/9073#discussion_r741548409) not to have separate option for separate memory user in table building therefore their memory accounting are all bundled under one general option.
**Summary:**
- Reserved/released cache for creation/destruction of three main memory users with the passed-in `FilterBuildingContext::cache_res_mgr` during filter construction:
- hash entries (i.e`hash_entries`.size(), we bucket-charge hash entries during insertion for performance),
- banding (Ribbon Filter only, `bytes_coeff_rows` +`bytes_result_rows` + `bytes_backtrack`),
- final filter (i.e, `mutable_buf`'s size).
- Implementation details: in order to use `CacheReservationManager::CacheReservationHandle` to account final filter's memory, we have to store the `CacheReservationManager` object and `CacheReservationHandle` for final filter in `XXPH3BitsFilterBuilder` as well as explicitly delete the filter bits builder when done with the final filter in block based table.
- Added an option to run `filter_bench` with this memory reservation feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9073
Test Plan:
- Added new tests in `db_bloom_filter_test` to verify filter construction peak cache reservation under combination of `BlockBasedTable::Rep::FilterType` (e.g, `kFullFilter`, `kPartitionedFilter`), `BloomFilterPolicy::Mode`(e.g, `kFastLocalBloom`, `kStandard128Ribbon`, `kDeprecatedBlock`) and `BlockBasedTableOptions::reserve_table_builder_memory`
- To address the concern for slow test: tests with memory reservation under `kFullFilter` + `kStandard128Ribbon` and `kPartitionedFilter` take around **3000 - 6000 ms** and others take around **1500 - 2000 ms**, in total adding **20000 - 25000 ms** to the test suite when running locally
- Added new test in `bloom_test` to verify Ribbon Filter fallback on large banding in FullFilter
- Added test in `filter_bench` to verify that this feature does not significantly slow down Bloom/Ribbon Filter construction speed. Local result averaged over **20** run as below:
- FastLocalBloom
- baseline `./filter_bench -impl=2 -quick -runs 20 | grep 'Build avg'`:
- **Build avg ns/key: 29.56295** (DEBUG_LEVEL=1), **29.98153** (DEBUG_LEVEL=0)
- new feature (expected to be similar as above)`./filter_bench -impl=2 -quick -runs 20 -reserve_table_builder_memory=true | grep 'Build avg'`:
- **Build avg ns/key: 30.99046** (DEBUG_LEVEL=1), **30.48867** (DEBUG_LEVEL=0)
- new feature of RibbonFilter with fallback (expected to be similar as above) `./filter_bench -impl=2 -quick -runs 20 -reserve_table_builder_memory=true -strict_capacity_limit=true | grep 'Build avg'` :
- **Build avg ns/key: 31.146975** (DEBUG_LEVEL=1), **30.08165** (DEBUG_LEVEL=0)
- Ribbon128
- baseline `./filter_bench -impl=3 -quick -runs 20 | grep 'Build avg'`:
- **Build avg ns/key: 129.17585** (DEBUG_LEVEL=1), **130.5225** (DEBUG_LEVEL=0)
- new feature (expected to be similar as above) `./filter_bench -impl=3 -quick -runs 20 -reserve_table_builder_memory=true | grep 'Build avg' `:
- **Build avg ns/key: 131.61645** (DEBUG_LEVEL=1), **132.98075** (DEBUG_LEVEL=0)
- new feature of RibbonFilter with fallback (expected to be a lot faster than above due to fallback) `./filter_bench -impl=3 -quick -runs 20 -reserve_table_builder_memory=true -strict_capacity_limit=true | grep 'Build avg'` :
- **Build avg ns/key: 52.032965** (DEBUG_LEVEL=1), **52.597825** (DEBUG_LEVEL=0)
- And the warning message of `"Cache reservation for Ribbon filter banding failed due to cache full"` is indeed logged to console.
Reviewed By: pdillinger
Differential Revision: D31991348
Pulled By: hx235
fbshipit-source-id: 9336b2c60f44d530063da518ceaf56dac5f9df8e
2021-11-18 09:41:10 -08:00
|
|
|
// For testing only - it is to help ensure the NoopDeleterForRole<R>
|
|
|
|
// accessed from CacheReservationManager and the one accessed from the test
|
|
|
|
// are from the same translation units
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
static Cache::DeleterFn TEST_GetNoopDeleterForRole();
|
|
|
|
|
2021-08-24 12:42:31 -07:00
|
|
|
private:
|
|
|
|
// Size of a single dummy entry: 256 * 1024 (presumably bytes, matching the
// byte counts taken by the public methods - TODO confirm). Exposed publicly
// through GetDummyEntrySize().
static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
|
|
|
|
|
|
|
|
Slice GetNextCacheKey();
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
Status IncreaseCacheReservation(std::size_t new_mem_used);
|
|
|
|
Status DecreaseCacheReservation(std::size_t new_mem_used);
|
|
|
|
|
|
|
|
std::shared_ptr<Cache> cache_;
|
|
|
|
bool delayed_decrease_;
|
|
|
|
std::atomic<std::size_t> cache_allocated_size_;
|
2021-11-09 08:15:29 -08:00
|
|
|
std::size_t memory_used_;
|
2021-08-24 12:42:31 -07:00
|
|
|
std::vector<Cache::Handle *> dummy_handles_;
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
// NOTE(review): presumably the key state from which GetNextCacheKey() derives
// dummy-entry keys - confirm. CacheKey is not declared by any header this
// file includes by name in view; presumably it arrives transitively.
CacheKey cache_key_;
|
2021-08-24 12:42:31 -07:00
|
|
|
};
|
2021-11-09 12:04:51 -08:00
|
|
|
|
|
|
|
// CacheReservationHandle is for managing the lifetime of a cache reservation
|
|
|
|
// This class is NOT thread-safe
|
|
|
|
template <CacheEntryRole R>
|
|
|
|
class CacheReservationHandle {
|
|
|
|
public:
|
|
|
|
// REQUIRES: cache_res_mgr != nullptr
|
|
|
|
explicit CacheReservationHandle(
|
|
|
|
std::size_t incremental_memory_used,
|
|
|
|
std::shared_ptr<CacheReservationManager> cache_res_mgr);
|
|
|
|
|
|
|
|
~CacheReservationHandle();
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::size_t incremental_memory_used_;
|
|
|
|
std::shared_ptr<CacheReservationManager> cache_res_mgr_;
|
|
|
|
};
|
2021-11-01 14:42:11 -07:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|