2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 14:59:46 -07:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
#include <assert.h>
|
2013-11-19 16:29:42 -08:00
|
|
|
#include <stdio.h>
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
#include <atomic>
|
2017-02-06 16:29:29 -08:00
|
|
|
#include <list>
|
2014-02-28 18:19:07 -08:00
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
#include <numeric>
|
2014-05-15 14:09:03 -07:00
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
2014-08-15 15:05:09 -07:00
|
|
|
#include <utility>
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2021-09-08 12:34:35 -07:00
|
|
|
#include "cache/cache_entry_roles.h"
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
#include "cache/cache_key.h"
|
2021-09-08 12:34:35 -07:00
|
|
|
#include "cache/cache_reservation_manager.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "db/dbformat.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "index_builder.h"
|
2021-09-29 04:01:57 -07:00
|
|
|
#include "logging/logging.h"
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
#include "memory/memory_allocator.h"
|
2013-09-01 23:23:40 -07:00
|
|
|
#include "rocksdb/cache.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/comparator.h"
|
|
|
|
#include "rocksdb/env.h"
|
2021-04-29 06:59:53 -07:00
|
|
|
#include "rocksdb/filter_policy.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "rocksdb/flush_block_policy.h"
|
2016-04-21 10:16:28 -07:00
|
|
|
#include "rocksdb/merge_operator.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "rocksdb/table.h"
|
2021-09-15 15:32:07 -07:00
|
|
|
#include "rocksdb/types.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block.h"
|
2019-05-30 17:39:43 -07:00
|
|
|
#include "table/block_based/block_based_filter_block.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/block_builder.h"
|
2021-08-03 12:42:22 -07:00
|
|
|
#include "table/block_based/block_like_traits.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/filter_block.h"
|
New Bloom filter implementation for full and partitioned filters (#6007)
Summary:
Adds an improved, replacement Bloom filter implementation (FastLocalBloom) for full and partitioned filters in the block-based table. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single filter.
Speed
The improved speed, at least on recent x86_64, comes from
* Using fastrange instead of modulo (%)
* Using our new hash function (XXH3 preview, added in a previous commit), which is much faster for large keys and only *slightly* slower on keys around 12 bytes if hashing the same size many thousands of times in a row.
* Optimizing the Bloom filter queries with AVX2 SIMD operations. (Added AVX2 to the USE_SSE=1 build.) Careful design was required to support (a) SIMD-optimized queries, (b) compatible non-SIMD code that's simple and efficient, (c) flexible choice of number of probes, and (d) essentially maximized accuracy for a cache-local Bloom filter. Probes are made eight at a time, so any number of probes up to 8 is the same speed, then up to 16, etc.
* Prefetching cache lines when building the filter. Although this optimization could be applied to the old structure as well, it seems to balance out the small added cost of accumulating 64 bit hashes for adding to the filter rather than 32 bit hashes.
Here's nominal speed data from filter_bench (200MB in filters, about 10k keys each, 10 bits filter data / key, 6 probes, avg key size 24 bytes, includes hashing time) on Skylake DE (relatively low clock speed):
$ ./filter_bench -quick -impl=2 -net_includes_hashing # New Bloom filter
Build avg ns/key: 47.7135
Mixed inside/outside queries...
Single filter net ns/op: 26.2825
Random filter net ns/op: 150.459
Average FP rate %: 0.954651
$ ./filter_bench -quick -impl=0 -net_includes_hashing # Old Bloom filter
Build avg ns/key: 47.2245
Mixed inside/outside queries...
Single filter net ns/op: 63.2978
Random filter net ns/op: 188.038
Average FP rate %: 1.13823
Similar build time but dramatically faster query times on hot data (63 ns to 26 ns), and somewhat faster on stale data (188 ns to 150 ns). Performance differences on batched and skewed query loads are between these extremes as expected.
The only other interesting thing about speed is "inside" (query key was added to filter) vs. "outside" (query key was not added to filter) query times. The non-SIMD implementations are substantially slower when most queries are "outside" vs. "inside". This goes against what one might expect or would have observed years ago, as "outside" queries only need about two probes on average, due to short-circuiting, while "inside" always have num_probes (say 6). The problem is probably the nastily unpredictable branch. The SIMD implementation has few branches (very predictable) and has pretty consistent running time regardless of query outcome.
Accuracy
The generally improved accuracy (re: Issue https://github.com/facebook/rocksdb/issues/5857) comes from a better design for probing indices
within a cache line (re: Issue https://github.com/facebook/rocksdb/issues/4120) and improved accuracy for millions of keys in a single filter from using a 64-bit hash function (XXH3p). Design details in code comments.
Accuracy data (generalizes, except old impl gets worse with millions of keys):
Memory bits per key: FP rate percent old impl -> FP rate percent new impl
6: 5.70953 -> 5.69888
8: 2.45766 -> 2.29709
10: 1.13977 -> 0.959254
12: 0.662498 -> 0.411593
16: 0.353023 -> 0.0873754
24: 0.261552 -> 0.0060971
50: 0.225453 -> ~0.00003 (less than 1 in a million queries are FP)
Fixes https://github.com/facebook/rocksdb/issues/5857
Fixes https://github.com/facebook/rocksdb/issues/4120
Unlike the old implementation, this implementation has a fixed cache line size (64 bytes). At 10 bits per key, the accuracy of this new implementation is very close to the old implementation with 128-byte cache line size. If there's sufficient demand, this implementation could be generalized.
Compatibility
Although old releases would see the new structure as corrupt filter data and read the table as if there's no filter, we've decided only to enable the new Bloom filter with new format_version=5. This provides a smooth path for automatic adoption over time, with an option for early opt-in.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6007
Test Plan: filter_bench has been used thoroughly to validate speed, accuracy, and correctness. Unit tests have been carefully updated to exercise new and old implementations, as well as the logic to select an implementation based on context (format_version).
Differential Revision: D18294749
Pulled By: pdillinger
fbshipit-source-id: d44c9db3696e4d0a17caaec47075b7755c262c5f
2019-11-13 16:31:26 -08:00
|
|
|
#include "table/block_based/filter_policy_internal.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/full_filter_block.h"
|
|
|
|
#include "table/block_based/partitioned_filter_block.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "table/format.h"
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilities
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existent
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
#include "table/meta_blocks.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "table/table_builder.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "util/coding.h"
|
2015-01-09 12:57:11 -08:00
|
|
|
#include "util/compression.h"
|
2013-06-17 10:11:10 -07:00
|
|
|
#include "util/stop_watch.h"
|
2018-07-20 14:34:07 -07:00
|
|
|
#include "util/string_util.h"
|
2020-04-01 16:37:54 -07:00
|
|
|
#include "util/work_queue.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-05-15 14:09:03 -07:00
|
|
|
// Declaration only; the string is defined elsewhere (definition not visible in
// this section). NOTE(review): presumably the name of the hash-index prefixes
// meta block -- confirm at the definition.
extern const std::string kHashIndexPrefixesBlock;
|
|
|
|
// Declaration only; the string is defined elsewhere (definition not visible in
// this section). NOTE(review): presumably the name of the hash-index prefixes
// metadata block -- confirm at the definition.
extern const std::string kHashIndexPrefixesMetadataBlock;
|
2013-10-10 11:43:24 -07:00
|
|
|
|
2014-04-10 14:19:43 -07:00
|
|
|
|
2014-11-13 14:39:30 -05:00
|
|
|
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
|
|
|
|
namespace {
|
|
|
|
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilities
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existent
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
// Shorthand, local to this anonymous namespace, for
// BlockBasedTable::kBlockTrailerSize.
constexpr size_t kBlockTrailerSize = BlockBasedTable::kBlockTrailerSize;
|
|
|
|
|
2017-03-07 13:48:02 -08:00
|
|
|
// Create a filter block builder based on its type.
|
|
|
|
FilterBlockBuilder* CreateFilterBlockBuilder(
|
2018-05-21 14:33:55 -07:00
|
|
|
const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt,
|
2019-11-26 18:18:29 -08:00
|
|
|
const FilterBuildingContext& context,
|
2018-08-09 16:49:45 -07:00
|
|
|
const bool use_delta_encoding_for_index_values,
|
2017-03-07 13:48:02 -08:00
|
|
|
PartitionedIndexBuilder* const p_index_builder) {
|
2019-11-26 18:18:29 -08:00
|
|
|
const BlockBasedTableOptions& table_opt = context.table_options;
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 13:49:24 -07:00
|
|
|
assert(table_opt.filter_policy); // precondition
|
2014-09-08 10:37:05 -07:00
|
|
|
|
|
|
|
FilterBitsBuilder* filter_bits_builder =
|
2019-11-26 18:18:29 -08:00
|
|
|
BloomFilterPolicy::GetBuilderFromContext(context);
|
2014-09-08 10:37:05 -07:00
|
|
|
if (filter_bits_builder == nullptr) {
|
2022-02-08 13:54:29 -08:00
|
|
|
return nullptr;
|
2014-09-08 10:37:05 -07:00
|
|
|
} else {
|
2022-02-08 13:54:29 -08:00
|
|
|
// Check for backdoor deprecated block-based bloom config
|
|
|
|
size_t starting_est = filter_bits_builder->EstimateEntriesAdded();
|
2022-02-16 08:27:37 -08:00
|
|
|
constexpr auto kSecretStart =
|
|
|
|
DeprecatedBlockBasedBloomFilterPolicy::kSecretBitsPerKeyStart;
|
2022-02-08 13:54:29 -08:00
|
|
|
if (starting_est >= kSecretStart && starting_est < kSecretStart + 100) {
|
|
|
|
int bits_per_key = static_cast<int>(starting_est - kSecretStart);
|
|
|
|
delete filter_bits_builder;
|
|
|
|
return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(),
|
|
|
|
table_opt, bits_per_key);
|
|
|
|
}
|
|
|
|
// END check for backdoor deprecated block-based bloom config
|
2017-03-07 13:48:02 -08:00
|
|
|
if (table_opt.partition_filters) {
|
|
|
|
assert(p_index_builder != nullptr);
|
2017-07-02 10:36:10 -07:00
|
|
|
// Since after partition cut request from filter builder it takes time
|
2020-12-11 22:17:08 -08:00
|
|
|
// until index builder actully cuts the partition, until the end of a
|
|
|
|
// data block potentially with many keys, we take the lower bound as
|
|
|
|
// partition size.
|
2017-07-02 10:36:10 -07:00
|
|
|
assert(table_opt.block_size_deviation <= 100);
|
2019-03-27 16:13:08 -07:00
|
|
|
auto partition_size =
|
|
|
|
static_cast<uint32_t>(((table_opt.metadata_block_size *
|
|
|
|
(100 - table_opt.block_size_deviation)) +
|
|
|
|
99) /
|
|
|
|
100);
|
2017-07-12 09:27:12 -07:00
|
|
|
partition_size = std::max(partition_size, static_cast<uint32_t>(1));
|
2017-03-07 13:48:02 -08:00
|
|
|
return new PartitionedFilterBlockBuilder(
|
2018-05-21 14:33:55 -07:00
|
|
|
mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
|
2017-03-07 13:48:02 -08:00
|
|
|
filter_bits_builder, table_opt.index_block_restart_interval,
|
2018-08-09 16:49:45 -07:00
|
|
|
use_delta_encoding_for_index_values, p_index_builder, partition_size);
|
2017-03-07 13:48:02 -08:00
|
|
|
} else {
|
2018-05-21 14:33:55 -07:00
|
|
|
return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
|
2017-03-07 13:48:02 -08:00
|
|
|
table_opt.whole_key_filtering,
|
|
|
|
filter_bits_builder);
|
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
// Returns true when compression saved more than 12.5% of the raw size, i.e.
// when `compressed_size` is strictly smaller than `raw_size` minus one eighth
// of `raw_size` (integer division).
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  const size_t one_eighth = raw_size / 8u;
  // Cannot underflow: one_eighth never exceeds raw_size.
  const size_t threshold = raw_size - one_eighth;
  return compressed_size < threshold;
}
|
|
|
|
|
2019-03-18 12:07:35 -07:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// format_version is the block format as defined in include/rocksdb/table.h
|
|
|
|
Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
|
|
|
|
CompressionType* type, uint32_t format_version,
|
|
|
|
bool do_sample, std::string* compressed_output,
|
|
|
|
std::string* sampled_output_fast,
|
|
|
|
std::string* sampled_output_slow) {
|
2020-08-12 18:24:27 -07:00
|
|
|
assert(type);
|
|
|
|
assert(compressed_output);
|
|
|
|
assert(compressed_output->empty());
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2019-03-18 12:07:35 -07:00
|
|
|
// If requested, we sample one in every N block with a
|
|
|
|
// fast and slow compression algorithm and report the stats.
|
|
|
|
// The users can use these stats to decide if it is worthwhile
|
|
|
|
// enabling compression and they also get a hint about which
|
|
|
|
// compression algorithm wil be beneficial.
|
|
|
|
if (do_sample && info.SampleForCompression() &&
|
2020-08-12 18:24:27 -07:00
|
|
|
Random::GetTLSInstance()->OneIn(
|
|
|
|
static_cast<int>(info.SampleForCompression()))) {
|
2019-03-18 12:07:35 -07:00
|
|
|
// Sampling with a fast compression algorithm
|
2020-08-12 18:24:27 -07:00
|
|
|
if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
|
2019-03-18 12:07:35 -07:00
|
|
|
CompressionType c =
|
|
|
|
LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
|
|
|
|
CompressionContext context(c);
|
|
|
|
CompressionOptions options;
|
|
|
|
CompressionInfo info_tmp(options, context,
|
|
|
|
CompressionDict::GetEmptyDict(), c,
|
|
|
|
info.SampleForCompression());
|
|
|
|
|
2020-08-12 18:24:27 -07:00
|
|
|
CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version),
|
|
|
|
sampled_output_fast);
|
2019-03-18 12:07:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Sampling with a slow but high-compression algorithm
|
2020-08-12 18:24:27 -07:00
|
|
|
if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
|
2019-03-18 12:07:35 -07:00
|
|
|
CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
|
|
|
|
CompressionContext context(c);
|
|
|
|
CompressionOptions options;
|
|
|
|
CompressionInfo info_tmp(options, context,
|
|
|
|
CompressionDict::GetEmptyDict(), c,
|
|
|
|
info.SampleForCompression());
|
2020-08-12 18:24:27 -07:00
|
|
|
|
|
|
|
CompressData(raw, info_tmp, GetCompressFormatForVersion(format_version),
|
|
|
|
sampled_output_slow);
|
2019-03-18 12:07:35 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-12 18:24:27 -07:00
|
|
|
if (info.type() == kNoCompression) {
|
|
|
|
*type = kNoCompression;
|
|
|
|
return raw;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Actually compress the data; if the compression method is not supported,
|
|
|
|
// or the compression fails etc., just fall back to uncompressed
|
|
|
|
if (!CompressData(raw, info, GetCompressFormatForVersion(format_version),
|
|
|
|
compressed_output)) {
|
|
|
|
*type = kNoCompression;
|
|
|
|
return raw;
|
2019-03-18 12:07:35 -07:00
|
|
|
}
|
|
|
|
|
2020-08-12 18:24:27 -07:00
|
|
|
// Check the compression ratio; if it's not good enough, just fall back to
|
|
|
|
// uncompressed
|
|
|
|
if (!GoodCompressionRatio(compressed_output->size(), raw.size())) {
|
|
|
|
*type = kNoCompression;
|
|
|
|
return raw;
|
|
|
|
}
|
|
|
|
|
|
|
|
*type = info.type();
|
|
|
|
return *compressed_output;
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
|
2013-12-04 16:35:48 -08:00
|
|
|
// kBlockBasedTableMagicNumber was picked by running
|
2014-05-01 14:09:32 -04:00
|
|
|
// echo rocksdb.table.block_based | sha1sum
|
2013-12-04 15:09:41 -08:00
|
|
|
// and taking the leading 64 bits.
|
2015-07-13 12:11:05 -07:00
|
|
|
// Please note that kBlockBasedTableMagicNumber may also be accessed by other
// .cc files. For that reason we declare it extern in the header, but to get
// the space allocated it must be non-extern in one place.
|
|
|
|
const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
|
2014-05-01 14:09:32 -04:00
|
|
|
// We also support reading and writing legacy block based table format (for
|
|
|
|
// backwards compatibility)
|
2015-07-01 16:13:49 -07:00
|
|
|
const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
|
2013-12-04 15:09:41 -08:00
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
// A collector that collects properties of interest to block-based table.
|
|
|
|
// For now this class looks heavy-weight since we only write one additional
|
|
|
|
// property.
|
2015-04-25 18:14:27 +09:00
|
|
|
// But in the foreseeable future, we will add more and more properties that are
|
2014-02-28 18:19:07 -08:00
|
|
|
// specific to block-based table.
|
|
|
|
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 10:04:30 -07:00
|
|
|
: public IntTblPropCollector {
|
2014-02-28 18:19:07 -08:00
|
|
|
public:
|
2014-05-15 14:09:03 -07:00
|
|
|
explicit BlockBasedTablePropertiesCollector(
|
2015-02-04 17:03:57 -08:00
|
|
|
BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
|
|
|
|
bool prefix_filtering)
|
|
|
|
: index_type_(index_type),
|
|
|
|
whole_key_filtering_(whole_key_filtering),
|
|
|
|
prefix_filtering_(prefix_filtering) {}
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
|
|
|
|
uint64_t /*file_size*/) override {
|
2014-02-28 18:19:07 -08:00
|
|
|
// Intentionally left blank. Have no interest in collecting stats for
|
|
|
|
// individual key/value pairs.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2021-03-25 14:58:23 -07:00
|
|
|
virtual void BlockAdd(uint64_t /* block_raw_bytes */,
|
|
|
|
uint64_t /* block_compressed_bytes_fast */,
|
|
|
|
uint64_t /* block_compressed_bytes_slow */) override {
|
2019-03-18 12:07:35 -07:00
|
|
|
// Intentionally left blank. No interest in collecting stats for
|
|
|
|
// blocks.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
Status Finish(UserCollectedProperties* properties) override {
|
2014-02-28 18:19:07 -08:00
|
|
|
std::string val;
|
|
|
|
PutFixed32(&val, static_cast<uint32_t>(index_type_));
|
|
|
|
properties->insert({BlockBasedTablePropertyNames::kIndexType, val});
|
2015-02-04 17:03:57 -08:00
|
|
|
properties->insert({BlockBasedTablePropertyNames::kWholeKeyFiltering,
|
|
|
|
whole_key_filtering_ ? kPropTrue : kPropFalse});
|
|
|
|
properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
|
|
|
|
prefix_filtering_ ? kPropTrue : kPropFalse});
|
2014-02-28 18:19:07 -08:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// The name of the properties collector can be used for debugging purpose.
|
2019-02-14 13:52:47 -08:00
|
|
|
const char* Name() const override {
|
2014-02-28 18:19:07 -08:00
|
|
|
return "BlockBasedTablePropertiesCollector";
|
|
|
|
}
|
|
|
|
|
2019-02-14 13:52:47 -08:00
|
|
|
UserCollectedProperties GetReadableProperties() const override {
|
2014-02-28 18:19:07 -08:00
|
|
|
// Intentionally left blank.
|
|
|
|
return UserCollectedProperties();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
BlockBasedTableOptions::IndexType index_type_;
|
2015-02-04 17:03:57 -08:00
|
|
|
bool whole_key_filtering_;
|
|
|
|
bool prefix_filtering_;
|
2014-02-28 18:19:07 -08:00
|
|
|
};
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
struct BlockBasedTableBuilder::Rep {
|
2021-05-05 13:59:21 -07:00
|
|
|
const ImmutableOptions ioptions;
|
2018-05-21 14:33:55 -07:00
|
|
|
const MutableCFOptions moptions;
|
2014-08-25 14:22:05 -07:00
|
|
|
const BlockBasedTableOptions table_options;
|
2014-01-27 13:53:22 -08:00
|
|
|
const InternalKeyComparator& internal_comparator;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 16:16:11 -07:00
|
|
|
WritableFileWriter* file;
|
2020-04-02 16:13:44 -07:00
|
|
|
std::atomic<uint64_t> offset;
|
2018-03-26 20:14:24 -07:00
|
|
|
size_t alignment;
|
2011-03-18 22:37:00 +00:00
|
|
|
BlockBuilder data_block;
|
2021-04-23 12:44:11 -07:00
|
|
|
// Buffers uncompressed data blocks to replay later. Needed when
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
// compression dictionary is enabled so we can finalize the dictionary before
|
|
|
|
// compressing any data blocks.
|
2021-04-23 12:44:11 -07:00
|
|
|
std::vector<std::string> data_block_buffers;
|
2016-08-19 15:10:31 -07:00
|
|
|
BlockBuilder range_del_block;
|
2014-05-15 14:09:03 -07:00
|
|
|
|
|
|
|
InternalKeySliceTransform internal_prefix_transform;
|
2014-02-28 18:19:07 -08:00
|
|
|
std::unique_ptr<IndexBuilder> index_builder;
|
2017-06-13 10:59:22 -07:00
|
|
|
PartitionedIndexBuilder* p_index_builder_ = nullptr;
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string last_key;
|
2020-04-01 16:37:54 -07:00
|
|
|
const Slice* first_key_in_next_block = nullptr;
|
2019-01-18 19:10:17 -08:00
|
|
|
CompressionType compression_type;
|
2019-03-18 12:07:35 -07:00
|
|
|
uint64_t sample_for_compression;
|
2021-03-31 18:20:44 -07:00
|
|
|
std::atomic<uint64_t> compressible_input_data_bytes;
|
|
|
|
std::atomic<uint64_t> uncompressible_input_data_bytes;
|
|
|
|
std::atomic<uint64_t> sampled_input_data_bytes;
|
|
|
|
std::atomic<uint64_t> sampled_output_slow_data_bytes;
|
|
|
|
std::atomic<uint64_t> sampled_output_fast_data_bytes;
|
2019-01-18 19:10:17 -08:00
|
|
|
CompressionOptions compression_opts;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
std::unique_ptr<CompressionDict> compression_dict;
|
2020-04-01 16:37:54 -07:00
|
|
|
std::vector<std::unique_ptr<CompressionContext>> compression_ctxs;
|
|
|
|
std::vector<std::unique_ptr<UncompressionContext>> verify_ctxs;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
std::unique_ptr<UncompressionDict> verify_dict;
|
|
|
|
|
|
|
|
size_t data_begin_offset = 0;
|
|
|
|
|
2013-11-19 16:29:42 -08:00
|
|
|
TableProperties props;
|
2013-10-10 11:43:24 -07:00
|
|
|
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
// States of the builder.
|
|
|
|
//
|
|
|
|
// - `kBuffered`: This is the initial state where zero or more data blocks are
|
|
|
|
// accumulated uncompressed in-memory. From this state, call
|
|
|
|
// `EnterUnbuffered()` to finalize the compression dictionary if enabled,
|
|
|
|
// compress/write out any buffered blocks, and proceed to the `kUnbuffered`
|
|
|
|
// state.
|
|
|
|
//
|
|
|
|
// - `kUnbuffered`: This is the state when compression dictionary is finalized
|
|
|
|
// either because it wasn't enabled in the first place or it's been created
|
|
|
|
// from sampling previously buffered data. In this state, blocks are simply
|
|
|
|
// compressed/written out as they fill up. From this state, call `Finish()`
|
|
|
|
// to complete the file (write meta-blocks, etc.), or `Abandon()` to delete
|
|
|
|
// the partially created file.
|
|
|
|
//
|
|
|
|
// - `kClosed`: This indicates either `Finish()` or `Abandon()` has been
|
|
|
|
// called, so the table builder is no longer usable. We must be in this
|
|
|
|
// state by the time the destructor runs.
|
|
|
|
enum class State {
|
|
|
|
kBuffered,
|
|
|
|
kUnbuffered,
|
|
|
|
kClosed,
|
|
|
|
};
|
|
|
|
State state;
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
// `kBuffered` state is allowed only as long as the buffering of uncompressed
|
2021-04-23 12:44:11 -07:00
|
|
|
// data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
uint64_t buffer_limit;
|
2021-11-01 14:26:50 -07:00
|
|
|
std::unique_ptr<CacheReservationManager>
|
|
|
|
compression_dict_buffer_cache_res_mgr;
|
2018-08-09 16:49:45 -07:00
|
|
|
const bool use_delta_encoding_for_index_values;
|
2017-03-07 13:48:02 -08:00
|
|
|
std::unique_ptr<FilterBlockBuilder> filter_builder;
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
OffsetableCacheKey base_cache_key;
|
2021-09-15 15:32:07 -07:00
|
|
|
const TableFileCreationReason reason;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
BlockHandle pending_handle; // Handle to add to index block
|
|
|
|
|
|
|
|
std::string compressed_output;
|
2013-11-07 21:27:21 -08:00
|
|
|
std::unique_ptr<FlushBlockPolicy> flush_block_policy;
|
2020-06-17 10:55:42 -07:00
|
|
|
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 10:04:30 -07:00
|
|
|
std::vector<std::unique_ptr<IntTblPropCollector>> table_properties_collectors;
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 12:30:55 -07:00
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
std::unique_ptr<ParallelCompressionRep> pc_rep;
|
|
|
|
|
2020-04-02 16:13:44 -07:00
|
|
|
// Returns the current value of `offset`, read with relaxed memory ordering.
uint64_t get_offset() {
  const uint64_t current_offset = offset.load(std::memory_order_relaxed);
  return current_offset;
}
|
|
|
|
// Overwrites the `offset` counter with `o` using a relaxed atomic store.
void set_offset(uint64_t o) {
  offset.store(o, std::memory_order_relaxed);
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
bool IsParallelCompressionEnabled() const {
|
|
|
|
return compression_opts.parallel_threads > 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns Status::OK() while no failure has been recorded; otherwise returns
// a copy of the stored non-OK status.
Status GetStatus() {
  // A relaxed load is enough here: status_ok is only flipped to false while
  // status_mutex is held, and CopyStatus() acquires that same mutex before
  // reading `status`, which is what makes the stored value visible.
  if (!status_ok.load(std::memory_order_relaxed)) {
    return CopyStatus();
  }
  return Status::OK();
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
// Returns a copy of `status`, taken while holding status_mutex.
Status CopyStatus() {
  std::lock_guard<std::mutex> status_guard(status_mutex);
  return status;
}
|
|
|
|
|
|
|
|
// Returns IOStatus::OK() while no I/O failure has been recorded; otherwise
// returns a copy of the stored non-OK I/O status.
IOStatus GetIOStatus() {
  // A relaxed load is enough here: io_status_ok is only flipped to false
  // while io_status_mutex is held, and CopyIOStatus() acquires that same
  // mutex before reading `io_status`, which is what makes the stored value
  // visible.
  if (!io_status_ok.load(std::memory_order_relaxed)) {
    return CopyIOStatus();
  }
  return IOStatus::OK();
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
// Returns a copy of `io_status`, taken while holding io_status_mutex.
IOStatus CopyIOStatus() {
  std::lock_guard<std::mutex> io_status_guard(io_status_mutex);
  return io_status;
}
|
|
|
|
|
|
|
|
// Never erase an existing status that is not OK.
|
|
|
|
void SetStatus(Status s) {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
|
2020-04-30 15:34:43 -07:00
|
|
|
// Locking is an overkill for non compression_opts.parallel_threads
|
|
|
|
// case but since it's unlikely that s is not OK, we take this cost
|
|
|
|
// to be simplicity.
|
|
|
|
std::lock_guard<std::mutex> lock(status_mutex);
|
2020-10-22 11:03:10 -07:00
|
|
|
status = s;
|
|
|
|
status_ok.store(false, std::memory_order_relaxed);
|
2020-04-30 15:34:43 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Never erase an existing I/O status that is not OK.
|
2022-01-26 10:14:56 -08:00
|
|
|
// Calling this will also SetStatus(ios)
|
2020-04-30 15:34:43 -07:00
|
|
|
void SetIOStatus(IOStatus ios) {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
|
2020-04-30 15:34:43 -07:00
|
|
|
// Locking is an overkill for non compression_opts.parallel_threads
|
|
|
|
// case but since it's unlikely that s is not OK, we take this cost
|
|
|
|
// to be simplicity.
|
2020-10-22 11:03:10 -07:00
|
|
|
std::lock_guard<std::mutex> lock(io_status_mutex);
|
|
|
|
io_status = ios;
|
|
|
|
io_status_ok.store(false, std::memory_order_relaxed);
|
2020-04-30 15:34:43 -07:00
|
|
|
}
|
2022-01-26 10:14:56 -08:00
|
|
|
SetStatus(ios);
|
2020-04-30 15:34:43 -07:00
|
|
|
}
|
|
|
|
|
2021-04-29 06:59:53 -07:00
|
|
|
Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
|
|
|
|
WritableFileWriter* f)
|
|
|
|
: ioptions(tbo.ioptions),
|
|
|
|
moptions(tbo.moptions),
|
2014-08-25 14:22:05 -07:00
|
|
|
table_options(table_opt),
|
2021-04-29 06:59:53 -07:00
|
|
|
internal_comparator(tbo.internal_comparator),
|
2011-03-18 22:37:00 +00:00
|
|
|
file(f),
|
2020-04-02 16:13:44 -07:00
|
|
|
offset(0),
|
2018-03-26 20:14:24 -07:00
|
|
|
alignment(table_options.block_align
|
2022-01-14 11:57:12 -08:00
|
|
|
? std::min(static_cast<size_t>(table_options.block_size),
|
|
|
|
kDefaultPageSize)
|
2018-03-26 20:14:24 -07:00
|
|
|
: 0),
|
2015-12-16 12:08:30 -08:00
|
|
|
data_block(table_options.block_restart_interval,
|
2018-08-15 14:27:47 -07:00
|
|
|
table_options.use_delta_encoding,
|
|
|
|
false /* use_value_delta_encoding */,
|
2021-04-29 06:59:53 -07:00
|
|
|
tbo.internal_comparator.user_comparator()
|
2018-08-15 14:27:47 -07:00
|
|
|
->CanKeysWithDifferentByteContentsBeEqual()
|
|
|
|
? BlockBasedTableOptions::kDataBlockBinarySearch
|
|
|
|
: table_options.data_block_index_type,
|
|
|
|
table_options.data_block_hash_table_util_ratio),
|
2018-05-21 09:42:49 -07:00
|
|
|
range_del_block(1 /* block_restart_interval */),
|
2021-04-29 06:59:53 -07:00
|
|
|
internal_prefix_transform(tbo.moptions.prefix_extractor.get()),
|
|
|
|
compression_type(tbo.compression_type),
|
|
|
|
sample_for_compression(tbo.moptions.sample_for_compression),
|
2021-03-31 18:20:44 -07:00
|
|
|
compressible_input_data_bytes(0),
|
|
|
|
uncompressible_input_data_bytes(0),
|
|
|
|
sampled_input_data_bytes(0),
|
|
|
|
sampled_output_slow_data_bytes(0),
|
|
|
|
sampled_output_fast_data_bytes(0),
|
2021-04-29 06:59:53 -07:00
|
|
|
compression_opts(tbo.compression_opts),
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
compression_dict(),
|
2021-04-29 06:59:53 -07:00
|
|
|
compression_ctxs(tbo.compression_opts.parallel_threads),
|
|
|
|
verify_ctxs(tbo.compression_opts.parallel_threads),
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
verify_dict(),
|
2021-04-29 06:59:53 -07:00
|
|
|
state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered
|
|
|
|
: State::kUnbuffered),
|
2018-08-09 16:49:45 -07:00
|
|
|
use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
|
|
|
|
!table_opt.block_align),
|
2021-09-15 15:32:07 -07:00
|
|
|
reason(tbo.reason),
|
2014-08-25 14:22:05 -07:00
|
|
|
flush_block_policy(
|
|
|
|
table_options.flush_block_policy_factory->NewFlushBlockPolicy(
|
2016-04-06 23:10:32 -07:00
|
|
|
table_options, data_block)),
|
2020-10-22 11:03:10 -07:00
|
|
|
status_ok(true),
|
|
|
|
io_status_ok(true) {
|
2021-04-29 06:59:53 -07:00
|
|
|
if (tbo.target_file_size == 0) {
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
buffer_limit = compression_opts.max_dict_buffer_bytes;
|
|
|
|
} else if (compression_opts.max_dict_buffer_bytes == 0) {
|
2021-04-29 06:59:53 -07:00
|
|
|
buffer_limit = tbo.target_file_size;
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
} else {
|
2021-04-29 06:59:53 -07:00
|
|
|
buffer_limit = std::min(tbo.target_file_size,
|
|
|
|
compression_opts.max_dict_buffer_bytes);
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
}
|
2021-11-01 14:26:50 -07:00
|
|
|
if (table_options.no_block_cache || table_options.block_cache == nullptr) {
|
|
|
|
compression_dict_buffer_cache_res_mgr.reset(nullptr);
|
2021-09-08 12:34:35 -07:00
|
|
|
} else {
|
2021-11-01 14:26:50 -07:00
|
|
|
compression_dict_buffer_cache_res_mgr.reset(
|
2021-09-08 12:34:35 -07:00
|
|
|
new CacheReservationManager(table_options.block_cache));
|
|
|
|
}
|
2020-04-01 16:37:54 -07:00
|
|
|
for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
|
|
|
|
compression_ctxs[i].reset(new CompressionContext(compression_type));
|
|
|
|
}
|
2017-03-07 13:48:02 -08:00
|
|
|
if (table_options.index_type ==
|
|
|
|
BlockBasedTableOptions::kTwoLevelIndexSearch) {
|
2017-06-13 10:59:22 -07:00
|
|
|
p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
|
2018-08-09 16:49:45 -07:00
|
|
|
&internal_comparator, use_delta_encoding_for_index_values,
|
|
|
|
table_options);
|
2017-06-13 10:59:22 -07:00
|
|
|
index_builder.reset(p_index_builder_);
|
2017-03-07 13:48:02 -08:00
|
|
|
} else {
|
|
|
|
index_builder.reset(IndexBuilder::CreateIndexBuilder(
|
|
|
|
table_options.index_type, &internal_comparator,
|
2018-08-09 16:49:45 -07:00
|
|
|
&this->internal_prefix_transform, use_delta_encoding_for_index_values,
|
|
|
|
table_options));
|
2017-03-07 13:48:02 -08:00
|
|
|
}
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 13:49:24 -07:00
|
|
|
if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
|
|
|
|
// Apply optimize_filters_for_hits setting here when applicable by
|
|
|
|
// skipping filter generation
|
|
|
|
filter_builder.reset();
|
|
|
|
} else if (tbo.skip_filters) {
|
|
|
|
// For SstFileWriter skip_filters
|
|
|
|
filter_builder.reset();
|
|
|
|
} else if (!table_options.filter_policy) {
|
|
|
|
// Null filter_policy -> no filter
|
|
|
|
filter_builder.reset();
|
2017-03-07 13:48:02 -08:00
|
|
|
} else {
|
2021-04-29 06:59:53 -07:00
|
|
|
FilterBuildingContext filter_context(table_options);
|
|
|
|
|
|
|
|
filter_context.info_log = ioptions.logger;
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 13:49:24 -07:00
|
|
|
filter_context.column_family_name = tbo.column_family_name;
|
2021-09-15 15:32:07 -07:00
|
|
|
filter_context.reason = reason;
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 13:49:24 -07:00
|
|
|
|
|
|
|
// Only populate other fields if known to be in LSM rather than
|
|
|
|
// generating external SST file
|
2021-09-15 15:32:07 -07:00
|
|
|
if (reason != TableFileCreationReason::kMisc) {
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 13:49:24 -07:00
|
|
|
filter_context.compaction_style = ioptions.compaction_style;
|
|
|
|
filter_context.num_levels = ioptions.num_levels;
|
|
|
|
filter_context.level_at_creation = tbo.level_at_creation;
|
|
|
|
filter_context.is_bottommost = tbo.is_bottommost;
|
|
|
|
assert(filter_context.level_at_creation < filter_context.num_levels);
|
|
|
|
}
|
2021-04-29 06:59:53 -07:00
|
|
|
|
2018-05-21 14:33:55 -07:00
|
|
|
filter_builder.reset(CreateFilterBlockBuilder(
|
2021-04-29 06:59:53 -07:00
|
|
|
ioptions, moptions, filter_context,
|
|
|
|
use_delta_encoding_for_index_values, p_index_builder_));
|
2017-03-07 13:48:02 -08:00
|
|
|
}
|
|
|
|
|
2021-07-06 10:13:40 -07:00
|
|
|
assert(tbo.int_tbl_prop_collector_factories);
|
|
|
|
for (auto& factory : *tbo.int_tbl_prop_collector_factories) {
|
|
|
|
assert(factory);
|
2021-05-17 18:27:42 -07:00
|
|
|
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 12:30:55 -07:00
|
|
|
table_properties_collectors.emplace_back(
|
2021-09-28 12:33:03 -07:00
|
|
|
factory->CreateIntTblPropCollector(tbo.column_family_id,
|
|
|
|
tbo.level_at_creation));
|
TablePropertiesCollectorFactory
Summary:
This diff addresses task #4296714 and rethinks how users provide us with TablePropertiesCollectors as part of Options.
Here's description of task #4296714:
I'm debugging #4295529 and noticed that our count of user properties kDeletedKeys is wrong. We're sharing one single InternalKeyPropertiesCollector with all Table Builders. In LOG Files, we're outputting number of kDeletedKeys as connected with a single table, while it's actually the total count of deleted keys since creation of the DB.
For example, this table has 3155 entries and 1391828 deleted keys.
The problem with current approach that we call methods on a single TablePropertiesCollector for all the tables we create. Even worse, we could do it from multiple threads at the same time and TablePropertiesCollector has no way of knowing which table we're calling it for.
Good part: Looks like nobody inside Facebook is using Options::table_properties_collectors. This means we should be able to painfully change the API.
In this change, I introduce TablePropertiesCollectorFactory. For every table we create, we call `CreateTablePropertiesCollector`, which creates a TablePropertiesCollector for a single table. We then use it sequentially from a single thread, which means it doesn't have to be thread-safe.
Test Plan:
Added a test in table_properties_collector_test that fails on master (build two tables, assert that kDeletedKeys count is correct for the second one).
Also, all other tests
Reviewers: sdong, dhruba, haobo, kailiu
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18579
2014-05-13 12:30:55 -07:00
|
|
|
}
|
|
|
|
table_properties_collectors.emplace_back(
|
2015-02-04 17:03:57 -08:00
|
|
|
new BlockBasedTablePropertiesCollector(
|
|
|
|
table_options.index_type, table_options.whole_key_filtering,
|
2021-04-29 06:59:53 -07:00
|
|
|
moptions.prefix_extractor != nullptr));
|
2021-11-19 11:36:03 -08:00
|
|
|
const Comparator* ucmp = tbo.internal_comparator.user_comparator();
|
|
|
|
assert(ucmp);
|
|
|
|
if (ucmp->timestamp_size() > 0) {
|
|
|
|
table_properties_collectors.emplace_back(
|
|
|
|
new TimestampTablePropertiesCollector(ucmp));
|
|
|
|
}
|
2018-06-04 12:04:52 -07:00
|
|
|
if (table_options.verify_compression) {
|
2020-04-01 16:37:54 -07:00
|
|
|
for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
|
2020-05-12 09:25:21 -07:00
|
|
|
verify_ctxs[i].reset(new UncompressionContext(compression_type));
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2018-06-04 12:04:52 -07:00
|
|
|
}
|
2020-10-19 11:37:05 -07:00
|
|
|
|
2021-08-20 20:39:52 -07:00
|
|
|
// These are only needed for populating table properties
|
|
|
|
props.column_family_id = tbo.column_family_id;
|
|
|
|
props.column_family_name = tbo.column_family_name;
|
|
|
|
props.creation_time = tbo.creation_time;
|
|
|
|
props.oldest_key_time = tbo.oldest_key_time;
|
|
|
|
props.file_creation_time = tbo.file_creation_time;
|
|
|
|
props.orig_file_number = tbo.cur_file_num;
|
|
|
|
props.db_id = tbo.db_id;
|
|
|
|
props.db_session_id = tbo.db_session_id;
|
|
|
|
props.db_host_id = ioptions.db_host_id;
|
|
|
|
if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
|
2020-10-19 11:37:05 -07:00
|
|
|
}
|
2018-06-04 12:04:52 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
Rep(const Rep&) = delete;
|
|
|
|
Rep& operator=(const Rep&) = delete;
|
|
|
|
|
2020-04-30 15:34:43 -07:00
|
|
|
private:
|
2020-10-22 11:03:10 -07:00
|
|
|
// Synchronize status & io_status accesses across threads from main thread,
|
|
|
|
// compression thread and write thread in parallel compression.
|
|
|
|
std::mutex status_mutex;
|
|
|
|
std::atomic<bool> status_ok;
|
2020-04-30 15:34:43 -07:00
|
|
|
Status status;
|
2020-10-22 11:03:10 -07:00
|
|
|
std::mutex io_status_mutex;
|
|
|
|
std::atomic<bool> io_status_ok;
|
2020-04-30 15:34:43 -07:00
|
|
|
IOStatus io_status;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
// State shared between the block-building thread, the compression threads and
// the write thread when parallel compression is enabled. It owns a fixed pool
// of BlockRep objects plus the queues used to hand them from one stage to the
// next.
struct BlockBasedTableBuilder::ParallelCompressionRep {
  // Keys is a wrapper of vector of strings avoiding
  // releasing string memories during vector clear()
  // in order to save memory allocation overhead
  class Keys {
   public:
    Keys() : keys_(kKeysInitSize), size_(0) {}

    // Appends a copy of `key`, reusing a previously allocated string when one
    // is available past the logical end.
    void PushBack(const Slice& key) {
      if (size_ == keys_.size()) {
        keys_.emplace_back(key.data(), key.size());
      } else {
        keys_[size_].assign(key.data(), key.size());
      }
      size_++;
    }

    // Exchanges the stored strings with `keys`; the logical size becomes the
    // size `keys` had on entry.
    void SwapAssign(std::vector<std::string>& keys) {
      size_ = keys.size();
      std::swap(keys_, keys);
    }

    // Logically empties the container while keeping the strings allocated.
    void Clear() { size_ = 0; }

    size_t Size() const { return size_; }

    // Returns the last logical element. Must not be called when empty.
    std::string& Back() {
      assert(size_ > 0);
      return keys_[size_ - 1];
    }

    std::string& operator[](size_t idx) {
      assert(idx < size_);
      return keys_[idx];
    }

   private:
    // Number of strings pre-allocated by the constructor. A class-level
    // constant rather than a per-instance const member, so it costs no
    // storage and does not block implicit assignment.
    static constexpr size_t kKeysInitSize = 32;
    // Backing storage; may hold more strings than size_.
    std::vector<std::string> keys_;
    // Number of logically valid entries at the front of keys_.
    size_t size_;
  };
  std::unique_ptr<Keys> curr_block_keys;

  class BlockRepSlot;

  // BlockRep instances are fetched from and recycled to
  // block_rep_pool during parallel compression.
  struct BlockRep {
    Slice contents;
    Slice compressed_contents;
    std::unique_ptr<std::string> data;
    std::unique_ptr<std::string> compressed_data;
    CompressionType compression_type;
    // Null when the block was prepared without a following key (see
    // PrepareBlockInternal).
    std::unique_ptr<std::string> first_key_in_next_block;
    std::unique_ptr<Keys> keys;
    std::unique_ptr<BlockRepSlot> slot;
    Status status;
  };
  // Use a vector of BlockRep as a buffer for a determined number
  // of BlockRep structures. All data referenced by pointers in
  // BlockRep will be freed when this vector is destructed.
  using BlockRepBuffer = std::vector<BlockRep>;
  BlockRepBuffer block_rep_buf;
  // Use a thread-safe queue for concurrent access from block
  // building thread and writer thread.
  using BlockRepPool = WorkQueue<BlockRep*>;
  BlockRepPool block_rep_pool;

  // Use BlockRepSlot to keep block order in write thread.
  // slot_ will pass references to BlockRep
  class BlockRepSlot {
   public:
    BlockRepSlot() : slot_(1) {}

    template <typename T>
    void Fill(T&& rep) {
      slot_.push(std::forward<T>(rep));
    }

    void Take(BlockRep*& rep) { slot_.pop(rep); }

   private:
    // slot_ will pass references to BlockRep in block_rep_buf,
    // and those references are always valid before the destruction of
    // block_rep_buf.
    WorkQueue<BlockRep*> slot_;
  };

  // Compression queue will pass references to BlockRep in block_rep_buf,
  // and those references are always valid before the destruction of
  // block_rep_buf.
  using CompressQueue = WorkQueue<BlockRep*>;
  CompressQueue compress_queue;
  std::vector<port::Thread> compress_thread_pool;

  // Write queue will pass references to BlockRep::slot in block_rep_buf,
  // and those references are always valid before the corresponding
  // BlockRep::slot is destructed, which is before the destruction of
  // block_rep_buf.
  using WriteQueue = WorkQueue<BlockRepSlot*>;
  WriteQueue write_queue;
  std::unique_ptr<port::Thread> write_thread;

  // Estimate output file size when parallel compression is enabled. This is
  // necessary because compression & flush are no longer synchronized,
  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
  // memory_order_relaxed suffices because accurate statistics is not required.
  class FileSizeEstimator {
   public:
    explicit FileSizeEstimator()
        : raw_bytes_compressed(0),
          raw_bytes_curr_block(0),
          raw_bytes_curr_block_set(false),
          raw_bytes_inflight(0),
          blocks_inflight(0),
          curr_compression_ratio(0),
          estimated_file_size(0) {}

    // Estimate file size when a block is about to be emitted to
    // compression thread
    void EmitBlock(uint64_t raw_block_size, uint64_t curr_file_size) {
      uint64_t new_raw_bytes_inflight =
          raw_bytes_inflight.fetch_add(raw_block_size,
                                       std::memory_order_relaxed) +
          raw_block_size;

      uint64_t new_blocks_inflight =
          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;

      StoreEstimate(curr_file_size, new_raw_bytes_inflight,
                    new_blocks_inflight);
    }

    // Estimate file size when a block is already reaped from
    // compression thread
    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
      // SetCurrBlockRawSize() must have been called for this block.
      assert(raw_bytes_curr_block_set);

      uint64_t new_raw_bytes_compressed =
          raw_bytes_compressed + raw_bytes_curr_block;
      assert(new_raw_bytes_compressed > 0);

      // Fold this block into the running compressed/raw ratio.
      curr_compression_ratio.store(
          (curr_compression_ratio.load(std::memory_order_relaxed) *
               raw_bytes_compressed +
           compressed_block_size) /
              static_cast<double>(new_raw_bytes_compressed),
          std::memory_order_relaxed);
      raw_bytes_compressed = new_raw_bytes_compressed;

      uint64_t new_raw_bytes_inflight =
          raw_bytes_inflight.fetch_sub(raw_bytes_curr_block,
                                       std::memory_order_relaxed) -
          raw_bytes_curr_block;

      uint64_t new_blocks_inflight =
          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;

      StoreEstimate(curr_file_size, new_raw_bytes_inflight,
                    new_blocks_inflight);

      raw_bytes_curr_block_set = false;
    }

    void SetEstimatedFileSize(uint64_t size) {
      estimated_file_size.store(size, std::memory_order_relaxed);
    }

    uint64_t GetEstimatedFileSize() const {
      return estimated_file_size.load(std::memory_order_relaxed);
    }

    // Records the raw size of the block the next ReapBlock() call accounts
    // for.
    void SetCurrBlockRawSize(uint64_t size) {
      raw_bytes_curr_block = size;
      raw_bytes_curr_block_set = true;
    }

   private:
    // Publishes the estimate: bytes already in the file, plus in-flight raw
    // bytes scaled by the current compression ratio, plus one block trailer
    // per in-flight block.
    void StoreEstimate(uint64_t curr_file_size, uint64_t raw_inflight,
                       uint64_t num_blocks_inflight) {
      estimated_file_size.store(
          curr_file_size +
              static_cast<uint64_t>(
                  static_cast<double>(raw_inflight) *
                  curr_compression_ratio.load(std::memory_order_relaxed)) +
              num_blocks_inflight * kBlockTrailerSize,
          std::memory_order_relaxed);
    }

    // Raw bytes compressed so far.
    // NOTE(review): not atomic -- presumably only touched from one thread;
    // confirm against callers.
    uint64_t raw_bytes_compressed;
    // Size of current block being appended.
    uint64_t raw_bytes_curr_block;
    // Whether raw_bytes_curr_block has been set for next
    // ReapBlock call.
    bool raw_bytes_curr_block_set;
    // Raw bytes under compression and not appended yet.
    std::atomic<uint64_t> raw_bytes_inflight;
    // Number of blocks under compression and not appended yet.
    std::atomic<uint64_t> blocks_inflight;
    // Current compression ratio, maintained by BGWorkWriteRawBlock.
    std::atomic<double> curr_compression_ratio;
    // Estimated SST file size.
    std::atomic<uint64_t> estimated_file_size;
  };
  FileSizeEstimator file_size_estimator;

  // Facilities used for waiting first block completion. Need to Wait for
  // the completion of first block compression and flush to get a non-zero
  // compression ratio.
  std::atomic<bool> first_block_processed;
  std::condition_variable first_block_cond;
  std::mutex first_block_mutex;

  // Allocates `parallel_threads` BlockRep objects, sizes every queue to the
  // same capacity, and places all BlockReps in the free pool.
  explicit ParallelCompressionRep(uint32_t parallel_threads)
      : curr_block_keys(new Keys()),
        block_rep_buf(parallel_threads),
        block_rep_pool(parallel_threads),
        compress_queue(parallel_threads),
        write_queue(parallel_threads),
        first_block_processed(false) {
    for (BlockRep& rep : block_rep_buf) {
      rep.contents = Slice();
      rep.compressed_contents = Slice();
      rep.data.reset(new std::string());
      rep.compressed_data.reset(new std::string());
      rep.compression_type = CompressionType();
      rep.first_key_in_next_block.reset(new std::string());
      rep.keys.reset(new Keys());
      rep.slot.reset(new BlockRepSlot());
      rep.status = Status::OK();
      block_rep_pool.push(&rep);
    }
  }

  ~ParallelCompressionRep() { block_rep_pool.finish(); }

  // Make a block prepared to be emitted to compression thread
  // Used in non-buffered mode
  BlockRep* PrepareBlock(CompressionType compression_type,
                         const Slice* first_key_in_next_block,
                         BlockBuilder* data_block) {
    BlockRep* block_rep =
        PrepareBlockInternal(compression_type, first_key_in_next_block);
    assert(block_rep != nullptr);
    data_block->SwapAndReset(*(block_rep->data));
    block_rep->contents = *(block_rep->data);
    // Hand the keys collected for this block to the BlockRep and take its
    // recycled Keys object for the next block.
    std::swap(block_rep->keys, curr_block_keys);
    curr_block_keys->Clear();
    return block_rep;
  }

  // Used in EnterUnbuffered
  BlockRep* PrepareBlock(CompressionType compression_type,
                         const Slice* first_key_in_next_block,
                         std::string* data_block,
                         std::vector<std::string>* keys) {
    BlockRep* block_rep =
        PrepareBlockInternal(compression_type, first_key_in_next_block);
    assert(block_rep != nullptr);
    std::swap(*(block_rep->data), *data_block);
    block_rep->contents = *(block_rep->data);
    block_rep->keys->SwapAssign(*keys);
    return block_rep;
  }

  // Emit a block to compression thread
  // Returns early if either queue rejects the push. On success, blocks the
  // caller until the first block has been reaped (see ReapBlock).
  void EmitBlock(BlockRep* block_rep) {
    assert(block_rep != nullptr);
    assert(block_rep->status.ok());
    if (!write_queue.push(block_rep->slot.get())) {
      return;
    }
    if (!compress_queue.push(block_rep)) {
      return;
    }

    if (!first_block_processed.load(std::memory_order_relaxed)) {
      std::unique_lock<std::mutex> lock(first_block_mutex);
      first_block_cond.wait(lock, [this] {
        return first_block_processed.load(std::memory_order_relaxed);
      });
    }
  }

  // Reap a block from compression thread
  // Returns the BlockRep to the free pool and, the first time through, wakes
  // the thread waiting in EmitBlock.
  void ReapBlock(BlockRep* block_rep) {
    assert(block_rep != nullptr);
    block_rep->compressed_data->clear();
    block_rep_pool.push(block_rep);

    if (!first_block_processed.load(std::memory_order_relaxed)) {
      std::lock_guard<std::mutex> lock(first_block_mutex);
      first_block_processed.store(true, std::memory_order_relaxed);
      first_block_cond.notify_one();
    }
  }

 private:
  // Takes a free BlockRep from the pool and records the compression type and
  // the first key of the next block. A null `first_key_in_next_block` leaves
  // BlockRep::first_key_in_next_block null.
  BlockRep* PrepareBlockInternal(CompressionType compression_type,
                                 const Slice* first_key_in_next_block) {
    BlockRep* block_rep = nullptr;
    block_rep_pool.pop(block_rep);
    assert(block_rep != nullptr);

    assert(block_rep->data);

    block_rep->compression_type = compression_type;

    if (first_key_in_next_block == nullptr) {
      block_rep->first_key_in_next_block.reset(nullptr);
    } else {
      // A recycled BlockRep may have had its string released by the branch
      // above; re-create it instead of dereferencing a null pointer.
      if (!block_rep->first_key_in_next_block) {
        block_rep->first_key_in_next_block.reset(new std::string());
      }
      block_rep->first_key_in_next_block->assign(
          first_key_in_next_block->data(), first_key_in_next_block->size());
    }

    return block_rep;
  }
};
|
|
|
|
|
2013-11-19 22:00:48 -08:00
|
|
|
BlockBasedTableBuilder::BlockBasedTableBuilder(
|
2021-04-29 06:59:53 -07:00
|
|
|
const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo,
|
|
|
|
WritableFileWriter* file) {
|
2015-01-13 14:33:04 -08:00
|
|
|
BlockBasedTableOptions sanitized_table_options(table_options);
|
|
|
|
if (sanitized_table_options.format_version == 0 &&
|
|
|
|
sanitized_table_options.checksum != kCRC32c) {
|
2017-03-15 19:22:52 -07:00
|
|
|
ROCKS_LOG_WARN(
|
2021-04-29 06:59:53 -07:00
|
|
|
tbo.ioptions.logger,
|
2015-01-13 14:33:04 -08:00
|
|
|
"Silently converting format_version to 1 because checksum is "
|
|
|
|
"non-default");
|
|
|
|
// silently convert format_version to 1 to keep consistent with current
|
|
|
|
// behavior
|
|
|
|
sanitized_table_options.format_version = 1;
|
|
|
|
}
|
|
|
|
|
2021-04-29 06:59:53 -07:00
|
|
|
rep_ = new Rep(sanitized_table_options, tbo, file);
|
2015-02-17 08:03:45 -08:00
|
|
|
|
2017-03-07 13:48:02 -08:00
|
|
|
if (rep_->filter_builder != nullptr) {
|
|
|
|
rep_->filter_builder->StartBlock(0);
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
2021-06-17 21:55:42 -07:00
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
|
|
|
|
const_cast<TableProperties*>(&rep_->props));
|
|
|
|
|
|
|
|
// Extremely large files use atypical cache key encoding, and we don't
|
|
|
|
// know ahead of time how big the file will be. But assuming it's less
|
|
|
|
// than 4TB, we will correctly predict the cache keys.
|
|
|
|
BlockBasedTable::SetupBaseCacheKey(
|
|
|
|
&rep_->props, tbo.db_session_id, tbo.cur_file_num,
|
|
|
|
BlockBasedTable::kMaxFileSizeStandardEncoding, &rep_->base_cache_key);
|
2020-04-01 16:37:54 -07:00
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
if (rep_->IsParallelCompressionEnabled()) {
|
|
|
|
StartParallelCompression();
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
BlockBasedTableBuilder::~BlockBasedTableBuilder() {
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
// Catch errors where caller forgot to call Finish()
|
|
|
|
assert(rep_->state == Rep::State::kClosed);
|
2011-03-18 22:37:00 +00:00
|
|
|
delete rep_;
|
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
2011-03-18 22:37:00 +00:00
|
|
|
Rep* r = rep_;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
assert(rep_->state != Rep::State::kClosed);
|
2011-03-18 22:37:00 +00:00
|
|
|
if (!ok()) return;
|
2016-08-19 15:10:31 -07:00
|
|
|
ValueType value_type = ExtractValueType(key);
|
|
|
|
if (IsValueType(value_type)) {
|
2019-01-02 15:05:41 -08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
if (r->props.num_entries > r->props.num_range_deletions) {
|
2016-08-19 15:10:31 -07:00
|
|
|
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
|
2013-11-07 21:27:21 -08:00
|
|
|
}
|
2020-09-29 18:21:49 -07:00
|
|
|
#endif // !NDEBUG
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2016-08-19 15:10:31 -07:00
|
|
|
auto should_flush = r->flush_block_policy->Update(key, value);
|
|
|
|
if (should_flush) {
|
|
|
|
assert(!r->data_block.empty());
|
2020-04-01 16:37:54 -07:00
|
|
|
r->first_key_in_next_block = &key;
|
2016-08-19 15:10:31 -07:00
|
|
|
Flush();
|
2021-09-08 12:34:35 -07:00
|
|
|
if (r->state == Rep::State::kBuffered) {
|
|
|
|
bool exceeds_buffer_limit =
|
|
|
|
(r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
|
2021-11-01 14:26:50 -07:00
|
|
|
bool exceeds_global_block_cache_limit = false;
|
2021-09-08 12:34:35 -07:00
|
|
|
|
|
|
|
// Increase cache reservation for the last buffered data block
|
|
|
|
// only if the block is not going to be unbuffered immediately
|
|
|
|
// and there exists a cache reservation manager
|
2021-11-01 14:26:50 -07:00
|
|
|
if (!exceeds_buffer_limit &&
|
|
|
|
r->compression_dict_buffer_cache_res_mgr != nullptr) {
|
|
|
|
Status s =
|
|
|
|
r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation<
|
|
|
|
CacheEntryRole::kCompressionDictionaryBuildingBuffer>(
|
|
|
|
r->data_begin_offset);
|
|
|
|
exceeds_global_block_cache_limit = s.IsIncomplete();
|
2021-09-08 12:34:35 -07:00
|
|
|
}
|
2016-08-19 15:10:31 -07:00
|
|
|
|
2021-11-01 14:26:50 -07:00
|
|
|
if (exceeds_buffer_limit || exceeds_global_block_cache_limit) {
|
2021-09-08 12:34:35 -07:00
|
|
|
EnterUnbuffered();
|
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
|
|
|
|
2016-08-19 15:10:31 -07:00
|
|
|
// Add item to index block.
|
|
|
|
// We do not emit the index entry for a block until we have seen the
|
|
|
|
// first key for the next data block. This allows us to use shorter
|
|
|
|
// keys in the index block. For example, consider a block boundary
|
|
|
|
// between the keys "the quick brown fox" and "the who". We can use
|
|
|
|
// "the r" as the key for the index block entry since it is >= all
|
|
|
|
// entries in the first block and < all entries in subsequent
|
|
|
|
// blocks.
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
if (ok() && r->state == Rep::State::kUnbuffered) {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (r->IsParallelCompressionEnabled()) {
|
2020-04-01 16:37:54 -07:00
|
|
|
r->pc_rep->curr_block_keys->Clear();
|
|
|
|
} else {
|
|
|
|
r->index_builder->AddIndexEntry(&r->last_key, &key,
|
|
|
|
r->pending_handle);
|
|
|
|
}
|
2016-08-19 15:10:31 -07:00
|
|
|
}
|
|
|
|
}
|
2012-04-17 08:36:46 -07:00
|
|
|
|
2017-03-07 13:48:02 -08:00
|
|
|
// Note: PartitionedFilterBlockBuilder requires key being added to filter
|
|
|
|
// builder after being added to index builder.
|
2020-04-01 16:37:54 -07:00
|
|
|
if (r->state == Rep::State::kUnbuffered) {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (r->IsParallelCompressionEnabled()) {
|
2020-04-01 16:37:54 -07:00
|
|
|
r->pc_rep->curr_block_keys->PushBack(key);
|
|
|
|
} else {
|
|
|
|
if (r->filter_builder != nullptr) {
|
|
|
|
size_t ts_sz =
|
|
|
|
r->internal_comparator.user_comparator()->timestamp_size();
|
|
|
|
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
|
|
|
|
}
|
|
|
|
}
|
2016-08-19 15:10:31 -07:00
|
|
|
}
|
2013-10-16 11:50:50 -07:00
|
|
|
|
2021-10-19 12:35:17 -07:00
|
|
|
r->data_block.AddWithLastKey(key, value, r->last_key);
|
2016-08-19 15:10:31 -07:00
|
|
|
r->last_key.assign(key.data(), key.size());
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
if (r->state == Rep::State::kBuffered) {
|
2021-04-23 12:44:11 -07:00
|
|
|
// Buffered keys will be replayed from data_block_buffers during
|
|
|
|
// `Finish()` once compression dictionary has been finalized.
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
} else {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (!r->IsParallelCompressionEnabled()) {
|
2020-04-01 16:37:54 -07:00
|
|
|
r->index_builder->OnKeyAdded(key);
|
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
2020-04-02 16:13:44 -07:00
|
|
|
// TODO offset passed in is not accurate for parallel compression case
|
|
|
|
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
|
2016-08-19 15:10:31 -07:00
|
|
|
r->table_properties_collectors,
|
2021-04-26 12:43:02 -07:00
|
|
|
r->ioptions.logger);
|
2016-08-19 15:10:31 -07:00
|
|
|
|
|
|
|
} else if (value_type == kTypeRangeDeletion) {
|
|
|
|
r->range_del_block.Add(key, value);
|
2020-04-02 16:13:44 -07:00
|
|
|
// TODO offset passed in is not accurate for parallel compression case
|
|
|
|
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
|
2016-09-12 14:14:40 -07:00
|
|
|
r->table_properties_collectors,
|
2021-04-26 12:43:02 -07:00
|
|
|
r->ioptions.logger);
|
2016-08-19 15:10:31 -07:00
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
}
|
2019-01-02 15:05:41 -08:00
|
|
|
|
|
|
|
r->props.num_entries++;
|
|
|
|
r->props.raw_key_size += key.size();
|
|
|
|
r->props.raw_value_size += value.size();
|
|
|
|
if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) {
|
|
|
|
r->props.num_deletions++;
|
|
|
|
} else if (value_type == kTypeRangeDeletion) {
|
|
|
|
r->props.num_deletions++;
|
|
|
|
r->props.num_range_deletions++;
|
|
|
|
} else if (value_type == kTypeMerge) {
|
|
|
|
r->props.num_merge_operands++;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTableBuilder::Flush() {
|
2011-03-18 22:37:00 +00:00
|
|
|
Rep* r = rep_;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
assert(rep_->state != Rep::State::kClosed);
|
2011-03-18 22:37:00 +00:00
|
|
|
if (!ok()) return;
|
|
|
|
if (r->data_block.empty()) return;
|
2020-10-22 11:03:10 -07:00
|
|
|
if (r->IsParallelCompressionEnabled() &&
|
2020-04-01 16:37:54 -07:00
|
|
|
r->state == Rep::State::kUnbuffered) {
|
|
|
|
r->data_block.Finish();
|
2020-10-22 11:03:10 -07:00
|
|
|
ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
|
|
|
|
r->compression_type, r->first_key_in_next_block, &(r->data_block));
|
|
|
|
assert(block_rep != nullptr);
|
|
|
|
r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
|
|
|
|
r->get_offset());
|
|
|
|
r->pc_rep->EmitBlock(block_rep);
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData);
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2012-06-27 23:41:33 -07:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
BlockHandle* handle,
|
2021-08-03 12:42:22 -07:00
|
|
|
BlockType block_type) {
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
block->Finish();
|
|
|
|
std::string raw_block_contents;
|
Two performance improvements in BlockBuilder (#9039)
Summary:
Primarily, this change reserves space in the std::string for building
the next block once a block is finished, using `block_size` as
reservation size. Note: also tried reusing same std::string in the
common "unbuffered" path but that showed no benefit or regression.
Secondarily, this slightly reduces the work in resetting `restarts_`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9039
Test Plan:
TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000
Compiled with DEBUG_LEVEL=0
Test vs. control runs simulaneous for better accuracy, units = ops/sec
Run 1, Primary change only: 292697 vs. 280267 (+4.4%)
Run 2, Primary change only: 288763 vs. 279621 (+3.3%)
Run 1, Secondary change only: 260065 vs. 254232 (+2.3%)
Run 2, Secondary change only: 275925 vs. 272248 (+1.4%)
Run 1, Both changes: 284890 vs. 270372 (+5.3%)
Run 2, Both changes: 263511 vs. 258188 (+2.0%)
Reviewed By: zhichao-cao
Differential Revision: D31701253
Pulled By: pdillinger
fbshipit-source-id: 7e40810afbb98e6b6446955e77bda59e69b19ffd
2021-10-18 08:34:45 -07:00
|
|
|
raw_block_contents.reserve(rep_->table_options.block_size);
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
block->SwapAndReset(raw_block_contents);
|
|
|
|
if (rep_->state == Rep::State::kBuffered) {
|
2021-08-03 12:42:22 -07:00
|
|
|
assert(block_type == BlockType::kData);
|
2021-04-23 12:44:11 -07:00
|
|
|
rep_->data_block_buffers.emplace_back(std::move(raw_block_contents));
|
|
|
|
rep_->data_begin_offset += rep_->data_block_buffers.back().size();
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
return;
|
|
|
|
}
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteBlock(raw_block_contents, handle, block_type);
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents,
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
BlockHandle* handle,
|
2021-08-03 12:42:22 -07:00
|
|
|
BlockType block_type) {
|
2020-04-01 16:37:54 -07:00
|
|
|
Rep* r = rep_;
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
assert(r->state == Rep::State::kUnbuffered);
|
2020-04-01 16:37:54 -07:00
|
|
|
Slice block_contents;
|
|
|
|
CompressionType type;
|
2020-04-30 15:34:43 -07:00
|
|
|
Status compress_status;
|
2021-08-03 12:42:22 -07:00
|
|
|
bool is_data_block = block_type == BlockType::kData;
|
2020-04-01 16:37:54 -07:00
|
|
|
CompressAndVerifyBlock(raw_block_contents, is_data_block,
|
|
|
|
*(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
|
2020-04-30 15:34:43 -07:00
|
|
|
&(r->compressed_output), &(block_contents), &type,
|
|
|
|
&compress_status);
|
|
|
|
r->SetStatus(compress_status);
|
2020-04-01 16:37:54 -07:00
|
|
|
if (!ok()) {
|
|
|
|
return;
|
|
|
|
}
|
2021-06-17 21:55:42 -07:00
|
|
|
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteRawBlock(block_contents, type, handle, block_type, &raw_block_contents);
|
2020-04-01 16:37:54 -07:00
|
|
|
r->compressed_output.clear();
|
|
|
|
if (is_data_block) {
|
|
|
|
if (r->filter_builder != nullptr) {
|
2020-04-02 16:13:44 -07:00
|
|
|
r->filter_builder->StartBlock(r->get_offset());
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2020-04-02 16:13:44 -07:00
|
|
|
r->props.data_size = r->get_offset();
|
2020-04-01 16:37:54 -07:00
|
|
|
++r->props.num_data_blocks;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTableBuilder::BGWorkCompression(
|
2020-10-22 11:03:10 -07:00
|
|
|
const CompressionContext& compression_ctx,
|
|
|
|
UncompressionContext* verify_ctx) {
|
|
|
|
ParallelCompressionRep::BlockRep* block_rep = nullptr;
|
2020-04-01 16:37:54 -07:00
|
|
|
while (rep_->pc_rep->compress_queue.pop(block_rep)) {
|
2020-10-22 11:03:10 -07:00
|
|
|
assert(block_rep != nullptr);
|
2020-04-01 16:37:54 -07:00
|
|
|
CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/
|
|
|
|
compression_ctx, verify_ctx,
|
2020-04-30 15:34:43 -07:00
|
|
|
block_rep->compressed_data.get(),
|
2020-05-12 09:25:21 -07:00
|
|
|
&block_rep->compressed_contents,
|
|
|
|
&(block_rep->compression_type), &block_rep->status);
|
2020-04-01 16:37:54 -07:00
|
|
|
block_rep->slot->Fill(block_rep);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTableBuilder::CompressAndVerifyBlock(
|
|
|
|
const Slice& raw_block_contents, bool is_data_block,
|
2020-10-22 11:03:10 -07:00
|
|
|
const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
|
2020-04-30 15:34:43 -07:00
|
|
|
std::string* compressed_output, Slice* block_contents,
|
|
|
|
CompressionType* type, Status* out_status) {
|
2011-03-18 22:37:00 +00:00
|
|
|
// File format contains a sequence of blocks where each block has:
|
|
|
|
// block_data: uint8[n]
|
|
|
|
// type: uint8
|
|
|
|
// crc: uint32
|
|
|
|
Rep* r = rep_;
|
2020-10-22 11:03:10 -07:00
|
|
|
bool is_status_ok = ok();
|
|
|
|
if (!r->IsParallelCompressionEnabled()) {
|
|
|
|
assert(is_status_ok);
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-04-30 15:34:43 -07:00
|
|
|
*type = r->compression_type;
|
2019-03-18 12:07:35 -07:00
|
|
|
uint64_t sample_for_compression = r->sample_for_compression;
|
2016-06-10 18:20:54 -07:00
|
|
|
bool abort_compression = false;
|
2016-08-19 15:10:31 -07:00
|
|
|
|
2019-03-27 16:13:08 -07:00
|
|
|
StopWatchNano timer(
|
2021-03-15 04:32:24 -07:00
|
|
|
r->ioptions.clock,
|
2021-04-26 12:43:02 -07:00
|
|
|
ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
|
2016-07-19 09:44:03 -07:00
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
if (is_status_ok && raw_block_contents.size() < kCompressionSizeLimit) {
|
2021-03-31 18:20:44 -07:00
|
|
|
if (is_data_block) {
|
|
|
|
r->compressible_input_data_bytes.fetch_add(raw_block_contents.size(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
const CompressionDict* compression_dict;
|
|
|
|
if (!is_data_block || r->compression_dict == nullptr) {
|
|
|
|
compression_dict = &CompressionDict::GetEmptyDict();
|
|
|
|
} else {
|
|
|
|
compression_dict = r->compression_dict.get();
|
|
|
|
}
|
|
|
|
assert(compression_dict != nullptr);
|
2020-04-01 16:37:54 -07:00
|
|
|
CompressionInfo compression_info(r->compression_opts, compression_ctx,
|
2020-04-30 15:34:43 -07:00
|
|
|
*compression_dict, *type,
|
2019-03-18 12:07:35 -07:00
|
|
|
sample_for_compression);
|
|
|
|
|
|
|
|
std::string sampled_output_fast;
|
|
|
|
std::string sampled_output_slow;
|
2020-04-30 15:34:43 -07:00
|
|
|
*block_contents = CompressBlock(
|
|
|
|
raw_block_contents, compression_info, type,
|
2019-03-18 12:07:35 -07:00
|
|
|
r->table_options.format_version, is_data_block /* do_sample */,
|
2020-04-30 15:34:43 -07:00
|
|
|
compressed_output, &sampled_output_fast, &sampled_output_slow);
|
2019-03-18 12:07:35 -07:00
|
|
|
|
2021-03-31 18:20:44 -07:00
|
|
|
if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
|
|
|
|
// Currently compression sampling is only enabled for data block.
|
|
|
|
assert(is_data_block);
|
|
|
|
r->sampled_input_data_bytes.fetch_add(raw_block_contents.size(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2019-03-18 12:07:35 -07:00
|
|
|
// notify collectors on block add
|
|
|
|
NotifyCollectTableCollectorsOnBlockAdd(
|
|
|
|
r->table_properties_collectors, raw_block_contents.size(),
|
|
|
|
sampled_output_fast.size(), sampled_output_slow.size());
|
2016-06-10 18:20:54 -07:00
|
|
|
|
|
|
|
// Some of the compression algorithms are known to be unreliable. If
|
|
|
|
// the verify_compression flag is set then try to de-compress the
|
|
|
|
// compressed data and compare to the input.
|
2020-04-30 15:34:43 -07:00
|
|
|
if (*type != kNoCompression && r->table_options.verify_compression) {
|
2016-06-10 18:20:54 -07:00
|
|
|
// Retrieve the uncompressed contents into a new buffer
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
const UncompressionDict* verify_dict;
|
|
|
|
if (!is_data_block || r->verify_dict == nullptr) {
|
|
|
|
verify_dict = &UncompressionDict::GetEmptyDict();
|
|
|
|
} else {
|
|
|
|
verify_dict = r->verify_dict.get();
|
|
|
|
}
|
|
|
|
assert(verify_dict != nullptr);
|
2016-06-10 18:20:54 -07:00
|
|
|
BlockContents contents;
|
2020-10-22 11:03:10 -07:00
|
|
|
UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
r->compression_type);
|
2018-06-05 12:51:05 -07:00
|
|
|
Status stat = UncompressBlockContentsForCompressionType(
|
2020-04-30 15:34:43 -07:00
|
|
|
uncompression_info, block_contents->data(), block_contents->size(),
|
2018-06-05 12:51:05 -07:00
|
|
|
&contents, r->table_options.format_version, r->ioptions);
|
2016-06-10 18:20:54 -07:00
|
|
|
|
|
|
|
if (stat.ok()) {
|
|
|
|
bool compressed_ok = contents.data.compare(raw_block_contents) == 0;
|
|
|
|
if (!compressed_ok) {
|
|
|
|
// The result of the compression was invalid. abort.
|
|
|
|
abort_compression = true;
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_ERROR(r->ioptions.logger,
|
2017-03-15 19:22:52 -07:00
|
|
|
"Decompressed block did not match raw block");
|
2020-04-30 15:34:43 -07:00
|
|
|
*out_status =
|
2016-06-10 18:20:54 -07:00
|
|
|
Status::Corruption("Decompressed block did not match raw block");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Decompression reported an error. abort.
|
2020-05-12 09:25:21 -07:00
|
|
|
*out_status = Status::Corruption(std::string("Could not decompress: ") +
|
|
|
|
stat.getState());
|
2016-06-10 18:20:54 -07:00
|
|
|
abort_compression = true;
|
|
|
|
}
|
|
|
|
}
|
2014-06-09 12:26:09 -07:00
|
|
|
} else {
|
2016-06-10 18:20:54 -07:00
|
|
|
// Block is too big to be compressed.
|
2021-03-31 18:20:44 -07:00
|
|
|
if (is_data_block) {
|
|
|
|
r->uncompressible_input_data_bytes.fetch_add(raw_block_contents.size(),
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2016-06-10 18:20:54 -07:00
|
|
|
abort_compression = true;
|
|
|
|
}
|
2021-03-31 18:20:44 -07:00
|
|
|
if (is_data_block) {
|
|
|
|
r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2016-06-10 18:20:54 -07:00
|
|
|
|
|
|
|
// Abort compression if the block is too big, or did not pass
|
|
|
|
// verification.
|
|
|
|
if (abort_compression) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
|
2020-04-30 15:34:43 -07:00
|
|
|
*type = kNoCompression;
|
|
|
|
*block_contents = raw_block_contents;
|
|
|
|
} else if (*type != kNoCompression) {
|
2021-04-26 12:43:02 -07:00
|
|
|
if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) {
|
|
|
|
RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
|
2019-02-28 10:14:19 -08:00
|
|
|
timer.ElapsedNanos());
|
2017-12-14 10:17:22 -08:00
|
|
|
}
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED,
|
2019-02-28 10:14:19 -08:00
|
|
|
raw_block_contents.size());
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED);
|
2020-04-30 15:34:43 -07:00
|
|
|
} else if (*type != r->compression_type) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED);
|
2016-07-19 09:44:03 -07:00
|
|
|
}
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents,
|
|
|
|
CompressionType type,
|
2018-03-26 20:14:24 -07:00
|
|
|
BlockHandle* handle,
|
2021-08-03 12:42:22 -07:00
|
|
|
BlockType block_type,
|
2021-12-08 12:43:09 -08:00
|
|
|
const Slice* raw_block_contents,
|
|
|
|
bool is_top_level_filter_block) {
|
2012-04-17 08:36:46 -07:00
|
|
|
Rep* r = rep_;
|
2021-08-03 12:42:22 -07:00
|
|
|
bool is_data_block = block_type == BlockType::kData;
|
2021-04-26 12:43:02 -07:00
|
|
|
StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
|
2020-04-02 16:13:44 -07:00
|
|
|
handle->set_offset(r->get_offset());
|
2011-03-18 22:37:00 +00:00
|
|
|
handle->set_size(block_contents.size());
|
2020-04-01 16:37:54 -07:00
|
|
|
assert(status().ok());
|
|
|
|
assert(io_status().ok());
|
2022-01-26 10:14:56 -08:00
|
|
|
|
|
|
|
{
|
|
|
|
IOStatus io_s = r->file->Append(block_contents);
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
r->SetIOStatus(io_s);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::array<char, kBlockTrailerSize> trailer;
|
|
|
|
trailer[0] = type;
|
|
|
|
uint32_t checksum = ComputeBuiltinChecksumWithLastByte(
|
|
|
|
r->table_options.checksum, block_contents.data(), block_contents.size(),
|
|
|
|
/*last_byte*/ type);
|
Detect (new) Bloom/Ribbon Filter construction corruption (#9342)
Summary:
Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393
**Context:**
(Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following:
a) set of keys to add to filter
b) set of hashes to add to filter (64-bit hash applied to each key)
c) set of Bloom indices to set in filter, with duplicates
d) set of Bloom indices to set in filter, deduplicated
e) final filter and its checksum
This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level.
- b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`)
- c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO.
Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default.
**Summary:**
- Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()`
- Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption
- See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design
- Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries`
- Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()`
- When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342
Test Plan:
- Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption`
- Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl
- For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break.
- Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()`
- FastLocalBloom
- Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s**
- After change:
- `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)**
- Standard128Ribbon
- Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s**
- After change:
- `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)**
- Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true`
- Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status.
Reviewed By: pdillinger
Differential Revision: D33746928
Pulled By: hx235
fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
2022-02-01 17:41:20 -08:00
|
|
|
|
|
|
|
if (block_type == BlockType::kFilter) {
|
|
|
|
Status s = r->filter_builder->MaybePostVerifyFilter(block_contents);
|
|
|
|
if (!s.ok()) {
|
|
|
|
r->SetStatus(s);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-26 10:14:56 -08:00
|
|
|
EncodeFixed32(trailer.data() + 1, checksum);
|
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
|
|
|
|
trailer.data());
|
|
|
|
{
|
|
|
|
IOStatus io_s = r->file->Append(Slice(trailer.data(), trailer.size()));
|
|
|
|
if (!io_s.ok()) {
|
|
|
|
r->SetIOStatus(io_s);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
Status s = Status::OK();
|
|
|
|
bool warm_cache;
|
|
|
|
switch (r->table_options.prepopulate_block_cache) {
|
|
|
|
case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
|
|
|
|
warm_cache = (r->reason == TableFileCreationReason::kFlush);
|
|
|
|
break;
|
|
|
|
case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
|
|
|
|
warm_cache = false;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// missing case
|
|
|
|
assert(false);
|
|
|
|
warm_cache = false;
|
|
|
|
}
|
|
|
|
if (warm_cache) {
|
|
|
|
if (type == kNoCompression) {
|
|
|
|
s = InsertBlockInCacheHelper(block_contents, handle, block_type,
|
|
|
|
is_top_level_filter_block);
|
|
|
|
} else if (raw_block_contents != nullptr) {
|
|
|
|
s = InsertBlockInCacheHelper(*raw_block_contents, handle, block_type,
|
|
|
|
is_top_level_filter_block);
|
2021-06-17 21:55:42 -07:00
|
|
|
}
|
2020-04-01 16:37:54 -07:00
|
|
|
if (!s.ok()) {
|
2020-04-30 15:34:43 -07:00
|
|
|
r->SetStatus(s);
|
2022-01-26 10:14:56 -08:00
|
|
|
return;
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2022-01-26 10:14:56 -08:00
|
|
|
}
|
|
|
|
s = InsertBlockInCompressedCache(block_contents, type, handle);
|
|
|
|
if (!s.ok()) {
|
|
|
|
r->SetStatus(s);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize);
|
|
|
|
if (r->table_options.block_align && is_data_block) {
|
|
|
|
size_t pad_bytes =
|
|
|
|
(r->alignment -
|
|
|
|
((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) &
|
|
|
|
(r->alignment - 1);
|
|
|
|
IOStatus io_s = r->file->Pad(pad_bytes);
|
|
|
|
if (io_s.ok()) {
|
|
|
|
r->set_offset(r->get_offset() + pad_bytes);
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
2020-04-30 15:34:43 -07:00
|
|
|
r->SetIOStatus(io_s);
|
2022-01-26 10:14:56 -08:00
|
|
|
return;
|
2013-09-01 23:23:40 -07:00
|
|
|
}
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2022-01-26 10:14:56 -08:00
|
|
|
|
|
|
|
if (r->IsParallelCompressionEnabled()) {
|
|
|
|
if (is_data_block) {
|
|
|
|
r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(),
|
|
|
|
r->get_offset());
|
|
|
|
} else {
|
|
|
|
r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset());
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
void BlockBasedTableBuilder::BGWorkWriteRawBlock() {
|
|
|
|
Rep* r = rep_;
|
2020-10-22 11:03:10 -07:00
|
|
|
ParallelCompressionRep::BlockRepSlot* slot = nullptr;
|
|
|
|
ParallelCompressionRep::BlockRep* block_rep = nullptr;
|
2020-04-01 16:37:54 -07:00
|
|
|
while (r->pc_rep->write_queue.pop(slot)) {
|
2020-10-22 11:03:10 -07:00
|
|
|
assert(slot != nullptr);
|
2020-04-01 16:37:54 -07:00
|
|
|
slot->Take(block_rep);
|
2020-10-22 11:03:10 -07:00
|
|
|
assert(block_rep != nullptr);
|
2020-04-01 16:37:54 -07:00
|
|
|
if (!block_rep->status.ok()) {
|
2020-04-30 15:34:43 -07:00
|
|
|
r->SetStatus(block_rep->status);
|
2020-10-22 11:03:10 -07:00
|
|
|
// Reap block so that blocked Flush() can finish
|
2020-05-12 09:25:21 -07:00
|
|
|
// if there is one, and Flush() will notice !ok() next time.
|
|
|
|
block_rep->status = Status::OK();
|
2020-10-22 11:03:10 -07:00
|
|
|
r->pc_rep->ReapBlock(block_rep);
|
|
|
|
continue;
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < block_rep->keys->Size(); i++) {
|
|
|
|
auto& key = (*block_rep->keys)[i];
|
|
|
|
if (r->filter_builder != nullptr) {
|
|
|
|
size_t ts_sz =
|
|
|
|
r->internal_comparator.user_comparator()->timestamp_size();
|
|
|
|
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
|
|
|
|
}
|
|
|
|
r->index_builder->OnKeyAdded(key);
|
|
|
|
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size());
|
2020-05-12 09:25:21 -07:00
|
|
|
WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type,
|
2021-08-03 12:42:22 -07:00
|
|
|
&r->pending_handle, BlockType::kData, &block_rep->contents);
|
2020-04-30 15:34:43 -07:00
|
|
|
if (!ok()) {
|
2020-04-01 16:37:54 -07:00
|
|
|
break;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
if (r->filter_builder != nullptr) {
|
2020-04-02 16:13:44 -07:00
|
|
|
r->filter_builder->StartBlock(r->get_offset());
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
2020-04-02 16:13:44 -07:00
|
|
|
r->props.data_size = r->get_offset();
|
2020-04-01 16:37:54 -07:00
|
|
|
++r->props.num_data_blocks;
|
|
|
|
|
|
|
|
if (block_rep->first_key_in_next_block == nullptr) {
|
|
|
|
r->index_builder->AddIndexEntry(&(block_rep->keys->Back()), nullptr,
|
|
|
|
r->pending_handle);
|
|
|
|
} else {
|
|
|
|
Slice first_key_in_next_block =
|
|
|
|
Slice(*block_rep->first_key_in_next_block);
|
|
|
|
r->index_builder->AddIndexEntry(&(block_rep->keys->Back()),
|
|
|
|
&first_key_in_next_block,
|
|
|
|
r->pending_handle);
|
|
|
|
}
|
2020-10-22 11:03:10 -07:00
|
|
|
|
|
|
|
r->pc_rep->ReapBlock(block_rep);
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
void BlockBasedTableBuilder::StartParallelCompression() {
|
|
|
|
rep_->pc_rep.reset(
|
|
|
|
new ParallelCompressionRep(rep_->compression_opts.parallel_threads));
|
|
|
|
rep_->pc_rep->compress_thread_pool.reserve(
|
|
|
|
rep_->compression_opts.parallel_threads);
|
|
|
|
for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) {
|
|
|
|
rep_->pc_rep->compress_thread_pool.emplace_back([this, i] {
|
|
|
|
BGWorkCompression(*(rep_->compression_ctxs[i]),
|
|
|
|
rep_->verify_ctxs[i].get());
|
|
|
|
});
|
|
|
|
}
|
|
|
|
rep_->pc_rep->write_thread.reset(
|
|
|
|
new port::Thread([this] { BGWorkWriteRawBlock(); }));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Shuts down the parallel-compression threads in order: finish the compress
// queue and join every compression worker, then finish the write queue and
// join the writer thread.
void BlockBasedTableBuilder::StopParallelCompression() {
  auto* pc = rep_->pc_rep.get();

  pc->compress_queue.finish();
  for (auto& worker : pc->compress_thread_pool) {
    worker.join();
  }

  pc->write_queue.finish();
  pc->write_thread->join();
}
|
|
|
|
|
2020-04-30 15:34:43 -07:00
|
|
|
// Returns the builder's current Status as reported by the Rep.
Status BlockBasedTableBuilder::status() const {
  return rep_->GetStatus();
}
|
2020-04-01 16:37:54 -07:00
|
|
|
|
|
|
|
// Returns the builder's current IOStatus as reported by the Rep.
IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); }
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. The user is informed of the Status and needs to take action to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
2020-03-27 16:03:05 -07:00
|
|
|
|
2021-06-17 21:55:42 -07:00
|
|
|
namespace {
|
|
|
|
// Delete the entry resided in the cache.
|
|
|
|
template <class Entry>
|
|
|
|
void DeleteEntryCached(const Slice& /*key*/, void* value) {
|
|
|
|
auto entry = reinterpret_cast<Entry*>(value);
|
|
|
|
delete entry;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
2013-09-01 23:23:40 -07:00
|
|
|
//
|
|
|
|
// Make a copy of the block contents and insert into compressed block cache
|
|
|
|
//
|
2021-06-17 21:55:42 -07:00
|
|
|
Status BlockBasedTableBuilder::InsertBlockInCompressedCache(
|
|
|
|
const Slice& block_contents, const CompressionType type,
|
|
|
|
const BlockHandle* handle) {
|
2013-09-01 23:23:40 -07:00
|
|
|
Rep* r = rep_;
|
2014-08-25 14:22:05 -07:00
|
|
|
Cache* block_cache_compressed = r->table_options.block_cache_compressed.get();
|
2021-06-17 21:55:42 -07:00
|
|
|
Status s;
|
2013-09-01 23:23:40 -07:00
|
|
|
if (type != kNoCompression && block_cache_compressed != nullptr) {
|
|
|
|
size_t size = block_contents.size();
|
|
|
|
|
2018-10-02 17:21:54 -07:00
|
|
|
auto ubuf =
|
2018-11-21 11:28:02 -08:00
|
|
|
AllocateBlock(size + 1, block_cache_compressed->memory_allocator());
|
2014-08-15 15:05:09 -07:00
|
|
|
memcpy(ubuf.get(), block_contents.data(), size);
|
2014-07-16 06:45:49 -07:00
|
|
|
ubuf[size] = type;
|
2013-09-01 23:23:40 -07:00
|
|
|
|
2018-11-13 17:00:49 -08:00
|
|
|
BlockContents* block_contents_to_cache =
|
|
|
|
new BlockContents(std::move(ubuf), size);
|
|
|
|
#ifndef NDEBUG
|
|
|
|
block_contents_to_cache->is_raw_block = true;
|
|
|
|
#endif // NDEBUG
|
2013-09-01 23:23:40 -07:00
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
|
2013-09-01 23:23:40 -07:00
|
|
|
|
2021-06-17 21:55:42 -07:00
|
|
|
s = block_cache_compressed->Insert(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
key.AsSlice(), block_contents_to_cache,
|
2021-06-17 21:55:42 -07:00
|
|
|
block_contents_to_cache->ApproximateMemoryUsage(),
|
|
|
|
&DeleteEntryCached<BlockContents>);
|
|
|
|
if (s.ok()) {
|
|
|
|
RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD);
|
|
|
|
} else {
|
|
|
|
RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
|
|
|
|
}
|
2013-09-01 23:23:40 -07:00
|
|
|
// Invalidate OS cache.
|
2020-09-29 09:47:33 -07:00
|
|
|
r->file->InvalidateCache(static_cast<size_t>(r->get_offset()), size)
|
|
|
|
.PermitUncheckedError();
|
2013-09-01 23:23:40 -07:00
|
|
|
}
|
2021-06-17 21:55:42 -07:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2021-08-03 12:42:22 -07:00
|
|
|
// Chooses the in-memory representation (TBlocklike) that corresponds to
// `block_type` and forwards to the matching InsertBlockInCache<>
// instantiation.
//
// `is_top_level_filter_block` is only consulted for BlockType::kFilter when
// the filter builder is not block-based.
//
// Returns the Status of the insertion, or a default-constructed Status for
// block types that are not handled here.
Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
    const Slice& block_contents, const BlockHandle* handle,
    BlockType block_type, bool is_top_level_filter_block) {
  switch (block_type) {
    case BlockType::kData:
    case BlockType::kIndex:
      return InsertBlockInCache<Block>(block_contents, handle, block_type);

    case BlockType::kFilter:
      if (rep_->filter_builder->IsBlockBased()) {
        // Deprecated block-based filter.
        return InsertBlockInCache<BlockContents>(block_contents, handle,
                                                 block_type);
      }
      if (is_top_level_filter_block) {
        // Top level filter block of a partitioned filter.
        return InsertBlockInCache<Block>(block_contents, handle, block_type);
      }
      // Second level partitioned filters and full filters.
      return InsertBlockInCache<ParsedFullFilterBlock>(block_contents, handle,
                                                       block_type);

    case BlockType::kCompressionDictionary:
      return InsertBlockInCache<UncompressionDict>(block_contents, handle,
                                                   block_type);

    default:
      // Every other block type is left out of the cache.
      return Status();
  }
}
|
|
|
|
|
|
|
|
template <typename TBlocklike>
|
2021-06-17 21:55:42 -07:00
|
|
|
Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
|
2021-08-03 12:42:22 -07:00
|
|
|
const BlockHandle* handle,
|
|
|
|
BlockType block_type) {
|
2021-06-17 21:55:42 -07:00
|
|
|
// Uncompressed regular block cache
|
|
|
|
Cache* block_cache = rep_->table_options.block_cache.get();
|
|
|
|
Status s;
|
|
|
|
if (block_cache != nullptr) {
|
|
|
|
size_t size = block_contents.size();
|
|
|
|
auto buf = AllocateBlock(size, block_cache->memory_allocator());
|
|
|
|
memcpy(buf.get(), block_contents.data(), size);
|
|
|
|
BlockContents results(std::move(buf), size);
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
|
2021-06-17 21:55:42 -07:00
|
|
|
|
|
|
|
const size_t read_amp_bytes_per_bit =
|
|
|
|
rep_->table_options.read_amp_bytes_per_bit;
|
2021-08-03 12:42:22 -07:00
|
|
|
|
2021-08-04 17:11:47 -07:00
|
|
|
// TODO akanksha:: Dedup below code by calling
|
|
|
|
// BlockBasedTable::PutDataBlockToCache.
|
|
|
|
std::unique_ptr<TBlocklike> block_holder(
|
|
|
|
BlocklikeTraits<TBlocklike>::Create(
|
|
|
|
std::move(results), read_amp_bytes_per_bit,
|
|
|
|
rep_->ioptions.statistics.get(),
|
|
|
|
false /*rep_->blocks_definitely_zstd_compressed*/,
|
|
|
|
rep_->table_options.filter_policy.get()));
|
|
|
|
|
|
|
|
assert(block_holder->own_bytes());
|
|
|
|
size_t charge = block_holder->ApproximateMemoryUsage();
|
|
|
|
s = block_cache->Insert(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
key.AsSlice(), block_holder.get(),
|
2021-08-04 17:11:47 -07:00
|
|
|
BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), charge,
|
|
|
|
nullptr, Cache::Priority::LOW);
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// Release ownership of block_holder.
|
|
|
|
block_holder.release();
|
|
|
|
BlockBasedTable::UpdateCacheInsertionMetrics(
|
|
|
|
block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
|
|
|
|
rep_->ioptions.stats);
|
|
|
|
} else {
|
|
|
|
RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES);
|
2021-06-17 21:55:42 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
2013-09-01 23:23:40 -07:00
|
|
|
}
|
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
void BlockBasedTableBuilder::WriteFilterBlock(
|
|
|
|
MetaIndexBuilder* meta_index_builder) {
|
|
|
|
BlockHandle filter_block_handle;
|
2021-05-21 17:10:29 -07:00
|
|
|
bool empty_filter_block =
|
|
|
|
(rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty());
|
2018-03-21 22:56:48 -07:00
|
|
|
if (ok() && !empty_filter_block) {
|
2021-05-21 17:10:29 -07:00
|
|
|
rep_->props.num_filter_entries +=
|
|
|
|
rep_->filter_builder->EstimateEntriesAdded();
|
2017-03-07 13:48:02 -08:00
|
|
|
Status s = Status::Incomplete();
|
2018-07-20 09:00:33 -07:00
|
|
|
while (ok() && s.IsIncomplete()) {
|
2021-11-04 13:29:09 -07:00
|
|
|
// filter_data is used to store the transferred filter data payload from
|
|
|
|
// FilterBlockBuilder and deallocate the payload by going out of scope.
|
|
|
|
// Otherwise, the payload will unnecessarily remain until
|
|
|
|
// BlockBasedTableBuilder is deallocated.
|
|
|
|
//
|
|
|
|
// See FilterBlockBuilder::Finish() for more on the difference in
|
|
|
|
// transferred filter data payload among different FilterBlockBuilder
|
|
|
|
// subtypes.
|
|
|
|
std::unique_ptr<const char[]> filter_data;
|
2018-07-20 14:34:07 -07:00
|
|
|
Slice filter_content =
|
2021-11-04 13:29:09 -07:00
|
|
|
rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
|
Detect (new) Bloom/Ribbon Filter construction corruption (#9342)
Summary:
Note: rebase on and merge after https://github.com/facebook/rocksdb/pull/9349, https://github.com/facebook/rocksdb/pull/9345, (optional) https://github.com/facebook/rocksdb/pull/9393
**Context:**
(Quoted from pdillinger) Layers of information during new Bloom/Ribbon Filter construction in building block-based tables includes the following:
a) set of keys to add to filter
b) set of hashes to add to filter (64-bit hash applied to each key)
c) set of Bloom indices to set in filter, with duplicates
d) set of Bloom indices to set in filter, deduplicated
e) final filter and its checksum
This PR aims to detect corruption (e.g, unexpected hardware/software corruption on data structures residing in the memory for a long time) from b) to e) and leave a) as future works for application level.
- b)'s corruption is detected by verifying the xor checksum of the hash entries calculated as the entries accumulate before being added to the filter. (i.e, `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()`)
- c) - e)'s corruption is detected by verifying the hash entries indeed exists in the constructed filter by re-querying these hash entries in the filter (i.e, `FilterBitsBuilder::MaybePostVerify()`) after computing the block checksum (except for PartitionFilter, which is done right after each `FilterBitsBuilder::Finish` for impl simplicity - see code comment for more). For this stage of detection, we assume hash entries are not corrupted after checking on b) since the time interval from b) to c) is relatively short IMO.
Option to enable this feature of detection is `BlockBasedTableOptions::detect_filter_construct_corruption` which is false by default.
**Summary:**
- Implemented new functions `XXPH3FilterBitsBuilder::MaybeVerifyHashEntriesChecksum()` and `FilterBitsBuilder::MaybePostVerify()`
- Ensured hash entries, final filter and banding and their [cache reservation ](https://github.com/facebook/rocksdb/issues/9073) are released properly despite corruption
- See [Filter.construction.artifacts.release.point.pdf ](https://github.com/facebook/rocksdb/files/7923487/Design.Filter.construction.artifacts.release.point.pdf) for high-level design
- Bundled and refactored hash entries's related artifact in XXPH3FilterBitsBuilder into `HashEntriesInfo` for better control on lifetime of these artifact during `SwapEntires`, `ResetEntries`
- Ensured RocksDB block-based table builder calls `FilterBitsBuilder::MaybePostVerify()` after constructing the filter by `FilterBitsBuilder::Finish()`
- When encountering such filter construction corruption, stop writing the filter content to files and mark such a block-based table building non-ok by storing the corruption status in the builder.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9342
Test Plan:
- Added new unit test `DBFilterConstructionCorruptionTestWithParam.DetectCorruption`
- Included this new feature in `DBFilterConstructionReserveMemoryTestWithParam.ReserveMemory` as this feature heavily touch ReserveMemory's impl
- For fallback case, I run `./filter_bench -impl=3 -detect_filter_construct_corruption=true -reserve_table_builder_memory=true -strict_capacity_limit=true -quick -runs 10 | grep 'Build avg'` to make sure nothing break.
- Added to `filter_bench`: increased filter construction time by **30%**, mostly by `MaybePostVerify()`
- FastLocalBloom
- Before change: `./filter_bench -impl=2 -quick -runs 10 | grep 'Build avg'`: **28.86643s**
- After change:
- `./filter_bench -impl=2 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless): **27.6644s (-4% perf improvement might be due to now we don't drop bloom hash entry in `AddAllEntries` along iteration but in bulk later, same with the bypassing-MaybePostVerify case below)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (expect acceptable increase): **34.41159s (+20%)**
- `./filter_bench -impl=2 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'` (by-passing MaybePostVerify, expect minor increase): **27.13431s (-6%)**
- Standard128Ribbon
- Before change: `./filter_bench -impl=3 -quick -runs 10 | grep 'Build avg'`: **122.5384s**
- After change:
- `./filter_bench -impl=3 -detect_filter_construct_corruption=false -quick -runs 10 | grep 'Build avg'` (expect a tiny increase due to MaybePostVerify is always called regardless - verified by removing MaybePostVerify under this case and found only +-1ns difference): **124.3588s (+2%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(expect acceptable increase): **159.4946s (+30%)**
- `./filter_bench -impl=3 -detect_filter_construct_corruption=true -quick -runs 10 | grep 'Build avg'`(by-passing MaybePostVerify, expect minor increase) : **125.258s (+2%)**
- Added to `db_stress`: `make crash_test`, `./db_stress --detect_filter_construct_corruption=true`
- Manually smoke-tested: manually corrupted the filter construction in some db level tests with basic PUT and background flush. As expected, the error did get returned to users in subsequent PUT and Flush status.
Reviewed By: pdillinger
Differential Revision: D33746928
Pulled By: hx235
fbshipit-source-id: cb056426be5a7debc1cd16f23bc250f36a08ca57
2022-02-01 17:41:20 -08:00
|
|
|
|
|
|
|
assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
|
|
|
|
if (s.IsCorruption()) {
|
|
|
|
rep_->SetStatus(s);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
rep_->props.filter_size += filter_content.size();
|
2021-12-15 13:19:34 -08:00
|
|
|
|
|
|
|
// TODO: Refactor code so that BlockType can determine both the C++ type
|
|
|
|
// of a block cache entry (TBlocklike) and the CacheEntryRole while
|
|
|
|
// inserting blocks in cache.
|
2021-12-08 12:43:09 -08:00
|
|
|
bool top_level_filter_block = false;
|
|
|
|
if (s.ok() && rep_->table_options.partition_filters &&
|
|
|
|
!rep_->filter_builder->IsBlockBased()) {
|
|
|
|
top_level_filter_block = true;
|
|
|
|
}
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteRawBlock(filter_content, kNoCompression, &filter_block_handle,
|
2021-12-08 12:43:09 -08:00
|
|
|
BlockType::kFilter, nullptr /*raw_contents*/,
|
|
|
|
top_level_filter_block);
|
2017-03-07 13:48:02 -08:00
|
|
|
}
|
Account Bloom/Ribbon filter construction memory in global memory limit (#9073)
Summary:
Note: This PR is the 4th part of a bigger PR stack (https://github.com/facebook/rocksdb/pull/9073) and will rebase/merge only after the first three PRs (https://github.com/facebook/rocksdb/pull/9070, https://github.com/facebook/rocksdb/pull/9071, https://github.com/facebook/rocksdb/pull/9130) merge.
**Context:**
Similar to https://github.com/facebook/rocksdb/pull/8428, this PR is to track memory usage during (new) Bloom Filter (i.e,FastLocalBloom) and Ribbon Filter (i.e, Ribbon128) construction, moving toward the goal of [single global memory limit using block cache capacity](https://github.com/facebook/rocksdb/wiki/Projects-Being-Developed#improving-memory-efficiency). It also constrains the size of the banding portion of Ribbon Filter during construction by falling back to Bloom Filter if that banding is, at some point, larger than the available space in the cache under `LRUCacheOptions::strict_capacity_limit=true`.
The option to turn on this feature is `BlockBasedTableOptions::reserve_table_builder_memory = true` which by default is set to `false`. We [decided](https://github.com/facebook/rocksdb/pull/9073#discussion_r741548409) not to have separate option for separate memory user in table building therefore their memory accounting are all bundled under one general option.
**Summary:**
- Reserved/released cache for creation/destruction of three main memory users with the passed-in `FilterBuildingContext::cache_res_mgr` during filter construction:
- hash entries (i.e`hash_entries`.size(), we bucket-charge hash entries during insertion for performance),
- banding (Ribbon Filter only, `bytes_coeff_rows` +`bytes_result_rows` + `bytes_backtrack`),
- final filter (i.e, `mutable_buf`'s size).
- Implementation details: in order to use `CacheReservationManager::CacheReservationHandle` to account final filter's memory, we have to store the `CacheReservationManager` object and `CacheReservationHandle` for final filter in `XXPH3BitsFilterBuilder` as well as explicitly delete the filter bits builder when done with the final filter in block based table.
- Added option fo run `filter_bench` with this memory reservation feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9073
Test Plan:
- Added new tests in `db_bloom_filter_test` to verify filter construction peak cache reservation under combination of `BlockBasedTable::Rep::FilterType` (e.g, `kFullFilter`, `kPartitionedFilter`), `BloomFilterPolicy::Mode`(e.g, `kFastLocalBloom`, `kStandard128Ribbon`, `kDeprecatedBlock`) and `BlockBasedTableOptions::reserve_table_builder_memory`
- To address the concern for slow test: tests with memory reservation under `kFullFilter` + `kStandard128Ribbon` and `kPartitionedFilter` take around **3000 - 6000 ms** and others take around **1500 - 2000 ms**, in total adding **20000 - 25000 ms** to the test suit running locally
- Added new test in `bloom_test` to verify Ribbon Filter fallback on large banding in FullFilter
- Added test in `filter_bench` to verify that this feature does not significantly slow down Bloom/Ribbon Filter construction speed. Local result averaged over **20** run as below:
- FastLocalBloom
- baseline `./filter_bench -impl=2 -quick -runs 20 | grep 'Build avg'`:
- **Build avg ns/key: 29.56295** (DEBUG_LEVEL=1), **29.98153** (DEBUG_LEVEL=0)
- new feature (expected to be similar as above)`./filter_bench -impl=2 -quick -runs 20 -reserve_table_builder_memory=true | grep 'Build avg'`:
- **Build avg ns/key: 30.99046** (DEBUG_LEVEL=1), **30.48867** (DEBUG_LEVEL=0)
- new feature of RibbonFilter with fallback (expected to be similar as above) `./filter_bench -impl=2 -quick -runs 20 -reserve_table_builder_memory=true -strict_capacity_limit=true | grep 'Build avg'` :
- **Build avg ns/key: 31.146975** (DEBUG_LEVEL=1), **30.08165** (DEBUG_LEVEL=0)
- Ribbon128
- baseline `./filter_bench -impl=3 -quick -runs 20 | grep 'Build avg'`:
- **Build avg ns/key: 129.17585** (DEBUG_LEVEL=1), **130.5225** (DEBUG_LEVEL=0)
- new feature (expected to be similar as above) `./filter_bench -impl=3 -quick -runs 20 -reserve_table_builder_memory=true | grep 'Build avg' `:
- **Build avg ns/key: 131.61645** (DEBUG_LEVEL=1), **132.98075** (DEBUG_LEVEL=0)
- new feature of RibbonFilter with fallback (expected to be a lot faster than above due to fallback) `./filter_bench -impl=3 -quick -runs 20 -reserve_table_builder_memory=true -strict_capacity_limit=true | grep 'Build avg'` :
- **Build avg ns/key: 52.032965** (DEBUG_LEVEL=1), **52.597825** (DEBUG_LEVEL=0)
- And the warning message of `"Cache reservation for Ribbon filter banding failed due to cache full"` is indeed logged to console.
Reviewed By: pdillinger
Differential Revision: D31991348
Pulled By: hx235
fbshipit-source-id: 9336b2c60f44d530063da518ceaf56dac5f9df8e
2021-11-18 09:41:10 -08:00
|
|
|
rep_->filter_builder->ResetFilterBitsBuilder();
|
2017-03-07 13:48:02 -08:00
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
if (ok() && !empty_filter_block) {
|
|
|
|
// Add mapping from "<filter_block_prefix>.Name" to location
|
|
|
|
// of filter data.
|
|
|
|
std::string key;
|
|
|
|
if (rep_->filter_builder->IsBlockBased()) {
|
|
|
|
key = BlockBasedTable::kFilterBlockPrefix;
|
|
|
|
} else {
|
|
|
|
key = rep_->table_options.partition_filters
|
|
|
|
? BlockBasedTable::kPartitionedFilterBlockPrefix
|
|
|
|
: BlockBasedTable::kFullFilterBlockPrefix;
|
|
|
|
}
|
|
|
|
key.append(rep_->table_options.filter_policy->Name());
|
|
|
|
meta_index_builder->Add(key, filter_block_handle);
|
|
|
|
}
|
|
|
|
}
|
2017-03-07 13:48:02 -08:00
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
void BlockBasedTableBuilder::WriteIndexBlock(
|
|
|
|
MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
|
2022-01-25 09:32:29 -08:00
|
|
|
if (!ok()) {
|
|
|
|
return;
|
|
|
|
}
|
2014-05-15 14:09:03 -07:00
|
|
|
IndexBuilder::IndexBlocks index_blocks;
|
2018-07-20 09:00:33 -07:00
|
|
|
auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
|
2017-02-06 16:29:29 -08:00
|
|
|
if (index_builder_status.IsIncomplete()) {
|
|
|
|
// If we have more than one index partition then meta_blocks are not
|
|
|
|
// supported for the index. Currently meta_blocks are used only by
|
|
|
|
// HashIndexBuilder which is not multi-partition.
|
|
|
|
assert(index_blocks.meta_blocks.empty());
|
2018-07-20 09:00:33 -07:00
|
|
|
} else if (ok() && !index_builder_status.ok()) {
|
2020-04-30 15:34:43 -07:00
|
|
|
rep_->SetStatus(index_builder_status);
|
2013-10-10 11:43:24 -07:00
|
|
|
}
|
2014-05-15 14:09:03 -07:00
|
|
|
if (ok()) {
|
2018-07-20 09:00:33 -07:00
|
|
|
for (const auto& item : index_blocks.meta_blocks) {
|
|
|
|
BlockHandle block_handle;
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteBlock(item.second, &block_handle, BlockType::kIndex);
|
2018-07-20 09:00:33 -07:00
|
|
|
if (!ok()) {
|
|
|
|
break;
|
2014-09-08 10:37:05 -07:00
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
meta_index_builder->Add(item.first, block_handle);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ok()) {
|
|
|
|
if (rep_->table_options.enable_index_compression) {
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteBlock(index_blocks.index_block_contents, index_block_handle,
|
|
|
|
BlockType::kIndex);
|
2018-07-20 09:00:33 -07:00
|
|
|
} else {
|
|
|
|
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
|
2021-08-03 12:42:22 -07:00
|
|
|
index_block_handle, BlockType::kIndex);
|
2013-10-10 11:43:24 -07:00
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
}
|
|
|
|
// If there are more index partitions, finish them and write them out
|
2021-01-06 14:14:01 -08:00
|
|
|
if (index_builder_status.IsIncomplete()) {
|
2022-01-06 10:09:13 -08:00
|
|
|
bool index_building_finished = false;
|
|
|
|
while (ok() && !index_building_finished) {
|
|
|
|
Status s =
|
|
|
|
rep_->index_builder->Finish(&index_blocks, *index_block_handle);
|
|
|
|
if (s.ok()) {
|
|
|
|
index_building_finished = true;
|
|
|
|
} else if (s.IsIncomplete()) {
|
|
|
|
// More partitioned index after this one
|
|
|
|
assert(!index_building_finished);
|
|
|
|
} else {
|
|
|
|
// Error
|
2021-01-06 14:14:01 -08:00
|
|
|
rep_->SetStatus(s);
|
|
|
|
return;
|
|
|
|
}
|
2022-01-06 10:09:13 -08:00
|
|
|
|
2021-01-06 14:14:01 -08:00
|
|
|
if (rep_->table_options.enable_index_compression) {
|
|
|
|
WriteBlock(index_blocks.index_block_contents, index_block_handle,
|
2021-08-03 12:42:22 -07:00
|
|
|
BlockType::kIndex);
|
2021-01-06 14:14:01 -08:00
|
|
|
} else {
|
|
|
|
WriteRawBlock(index_blocks.index_block_contents, kNoCompression,
|
2021-08-03 12:42:22 -07:00
|
|
|
index_block_handle, BlockType::kIndex);
|
2021-01-06 14:14:01 -08:00
|
|
|
}
|
|
|
|
// The last index_block_handle will be for the partition index block
|
2018-07-20 09:00:33 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-10-10 11:43:24 -07:00
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
// Fills in the remaining fields of `rep_->props`, serializes them together
// with the properties reported by the table properties collectors, writes the
// result as an uncompressed properties block and registers the block's handle
// in `meta_index_builder`.
//
// Does nothing if the builder is not ok() on entry. The metaindex entry is
// skipped if the builder is no longer ok() after the block has been written.
void BlockBasedTableBuilder::WritePropertiesBlock(
    MetaIndexBuilder* meta_index_builder) {
  if (!ok()) {
    return;
  }

  Rep* const r = rep_;
  auto& props = r->props;
  BlockHandle properties_block_handle;
  {
    PropertyBlockBuilder property_block_builder;

    // Names of the pluggable components used to build this table. A missing
    // filter policy is recorded as an empty string; the other missing
    // components are recorded as the literal string "nullptr".
    props.filter_policy_name = r->table_options.filter_policy != nullptr
                                   ? r->table_options.filter_policy->Name()
                                   : "";
    props.index_size = r->index_builder->IndexSize() + kBlockTrailerSize;
    props.comparator_name = r->ioptions.user_comparator != nullptr
                                ? r->ioptions.user_comparator->Name()
                                : "nullptr";
    props.merge_operator_name = r->ioptions.merge_operator != nullptr
                                    ? r->ioptions.merge_operator->Name()
                                    : "nullptr";
    props.compression_name = CompressionTypeToString(r->compression_type);
    props.compression_options =
        CompressionOptionsToString(r->compression_opts);
    props.prefix_extractor_name =
        r->moptions.prefix_extractor != nullptr
            ? r->moptions.prefix_extractor->AsString()
            : "nullptr";

    // Comma-separated collector factory names wrapped in brackets,
    // e.g. "[A,B]", or "[]" when there are none.
    std::string collector_names = "[";
    const char* separator = "";
    for (const auto& factory :
         r->ioptions.table_properties_collector_factories) {
      collector_names += separator;
      collector_names += factory->Name();
      separator = ",";
    }
    collector_names += "]";
    props.property_collectors_names = collector_names;

    if (r->table_options.index_type ==
        BlockBasedTableOptions::kTwoLevelIndexSearch) {
      assert(r->p_index_builder_ != nullptr);
      props.index_partitions = r->p_index_builder_->NumPartitions();
      props.top_level_index_size =
          r->p_index_builder_->TopLevelIndexSize(r->offset);
    }
    props.index_key_is_user_key =
        !r->index_builder->seperator_is_key_plus_seq();
    props.index_value_is_delta_encoded =
        r->use_delta_encoding_for_index_values;

    if (r->sampled_input_data_bytes > 0) {
      // Scale the compression ratio observed on the sampled blocks to all
      // compressible input, add the uncompressible input unchanged, and
      // round to the nearest integer (+ 0.5 before truncation).
      const auto estimate_data_size = [r](uint64_t sampled_output_bytes) {
        return static_cast<uint64_t>(
            static_cast<double>(sampled_output_bytes) /
                r->sampled_input_data_bytes *
                r->compressible_input_data_bytes +
            r->uncompressible_input_data_bytes + 0.5);
      };
      props.slow_compression_estimated_data_size =
          estimate_data_size(r->sampled_output_slow_data_bytes);
      props.fast_compression_estimated_data_size =
          estimate_data_size(r->sampled_output_fast_data_bytes);
    } else if (r->sample_for_compression > 0) {
      // We tried to sample but none were found. Assume worst-case
      // (compression ratio 1.0) so data is complete and aggregatable.
      const uint64_t uncompressed_total = r->compressible_input_data_bytes +
                                          r->uncompressible_input_data_bytes;
      props.slow_compression_estimated_data_size = uncompressed_total;
      props.fast_compression_estimated_data_size = uncompressed_total;
    }

    // Add basic properties
    property_block_builder.AddTableProperty(props);

    // Add user collected properties
    NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors,
                                         r->ioptions.logger,
                                         &property_block_builder);

    Slice block_data = property_block_builder.Finish();
    TEST_SYNC_POINT_CALLBACK(
        "BlockBasedTableBuilder::WritePropertiesBlock:BlockData", &block_data);
    WriteRawBlock(block_data, kNoCompression, &properties_block_handle,
                  BlockType::kProperties);
  }

  if (!ok()) {
    // The write failed; do not register the block in the metaindex.
    return;
  }

#ifndef NDEBUG
  {
    // Expose the location of the properties block to tests.
    uint64_t props_block_offset = properties_block_handle.offset();
    uint64_t props_block_size = properties_block_handle.size();
    TEST_SYNC_POINT_CALLBACK(
        "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
        &props_block_offset);
    TEST_SYNC_POINT_CALLBACK(
        "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
        &props_block_size);
  }
#endif  // !NDEBUG

  // The sync point below receives a pointer to this name pointer, so a test
  // callback can redirect it to a different metaindex name.
  const std::string* properties_block_meta = &kPropertiesBlockName;
  TEST_SYNC_POINT_CALLBACK(
      "BlockBasedTableBuilder::WritePropertiesBlock:Meta",
      &properties_block_meta);
  meta_index_builder->Add(*properties_block_meta, properties_block_handle);
}
|
2017-02-06 16:29:29 -08:00
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
void BlockBasedTableBuilder::WriteCompressionDictBlock(
|
|
|
|
MetaIndexBuilder* meta_index_builder) {
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
if (rep_->compression_dict != nullptr &&
|
|
|
|
rep_->compression_dict->GetRawDict().size()) {
|
2018-07-20 09:00:33 -07:00
|
|
|
BlockHandle compression_dict_block_handle;
|
|
|
|
if (ok()) {
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression,
|
2021-08-03 12:42:22 -07:00
|
|
|
&compression_dict_block_handle,
|
|
|
|
BlockType::kCompressionDictionary);
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
#ifndef NDEBUG
|
|
|
|
Slice compression_dict = rep_->compression_dict->GetRawDict();
|
|
|
|
TEST_SYNC_POINT_CALLBACK(
|
|
|
|
"BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
|
|
|
|
&compression_dict);
|
|
|
|
#endif // NDEBUG
|
2018-01-10 15:06:29 -08:00
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
if (ok()) {
|
2021-12-10 08:12:09 -08:00
|
|
|
meta_index_builder->Add(kCompressionDictBlockName,
|
2018-07-20 09:00:33 -07:00
|
|
|
compression_dict_block_handle);
|
2017-02-06 16:29:29 -08:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// Writes the accumulated range-deletion block, uncompressed, and registers
// its handle in `meta_index_builder` under kRangeDelBlockName.
//
// Nothing is written when the builder is not ok() or when no range deletions
// were added. The metaindex entry is only added if the builder is still ok()
// after the block write, consistent with WritePropertiesBlock() and
// WriteCompressionDictBlock(), so a block whose write failed is never
// registered.
void BlockBasedTableBuilder::WriteRangeDelBlock(
    MetaIndexBuilder* meta_index_builder) {
  if (!ok() || rep_->range_del_block.empty()) {
    return;
  }

  BlockHandle range_del_block_handle;
  WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression,
                &range_del_block_handle, BlockType::kRangeDeletion);
  if (ok()) {
    meta_index_builder->Add(kRangeDelBlockName, range_del_block_handle);
  }
}
|
|
|
|
|
2018-12-07 13:15:09 -08:00
|
|
|
// Builds the table footer, which records the magic number, format version,
// checksum type and the handles of the metaindex and index blocks, and
// appends it to the output file.
//
// On success the builder's file offset is advanced by the footer size; on
// failure the I/O error is recorded with SetIOStatus(). The builder must be
// ok() when this is called.
void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
                                         BlockHandle& index_block_handle) {
  Rep* r = rep_;
  // BlockBasedTableBuilder's constructor guarantees that format_version 0 is
  // only ever combined with the kCRC32c checksum.
  assert(!(r->table_options.checksum != kCRC32c &&
           r->table_options.format_version == 0));
  assert(ok());

  FooterBuilder footer;
  footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version,
               r->get_offset(), r->table_options.checksum,
               metaindex_block_handle, index_block_handle);

  IOStatus ios = r->file->Append(footer.GetSlice());
  if (!ios.ok()) {
    r->SetIOStatus(ios);
    return;
  }
  // Account for the bytes just appended.
  r->set_offset(r->get_offset() + footer.GetSlice().size());
}
|
|
|
|
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
void BlockBasedTableBuilder::EnterUnbuffered() {
|
|
|
|
Rep* r = rep_;
|
|
|
|
assert(r->state == Rep::State::kBuffered);
|
|
|
|
r->state = Rep::State::kUnbuffered;
|
|
|
|
const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
|
|
|
|
? r->compression_opts.zstd_max_train_bytes
|
|
|
|
: r->compression_opts.max_dict_bytes;
|
2021-04-23 12:44:11 -07:00
|
|
|
const size_t kNumBlocksBuffered = r->data_block_buffers.size();
|
2021-04-01 05:07:19 -07:00
|
|
|
if (kNumBlocksBuffered == 0) {
|
|
|
|
// The below code is neither safe nor necessary for handling zero data
|
|
|
|
// blocks.
|
|
|
|
return;
|
|
|
|
}
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how many bytes of data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). The limit is not strict, as we currently buffer more than just data blocks -- keys are buffered as well. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
|
2021-02-22 17:41:11 -08:00
|
|
|
// Abstract algebra teaches us that a finite cyclic group (such as the
|
|
|
|
// additive group of integers modulo N) can be generated by a number that is
|
|
|
|
// coprime with N. Since N is variable (number of buffered data blocks), we
|
|
|
|
// must then pick a prime number in order to guarantee coprimeness with any N.
|
|
|
|
//
|
|
|
|
// One downside of this approach is the spread will be poor when
|
|
|
|
// `kPrimeGeneratorRemainder` is close to zero or close to
|
|
|
|
// `kNumBlocksBuffered`.
|
|
|
|
//
|
|
|
|
// Picked a random number between one and one trillion and then chose the
|
|
|
|
// next prime number greater than or equal to it.
|
|
|
|
const uint64_t kPrimeGenerator = 545055921143ull;
|
|
|
|
// Can avoid repeated division by just adding the remainder repeatedly.
|
|
|
|
const size_t kPrimeGeneratorRemainder = static_cast<size_t>(
|
|
|
|
kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
|
|
|
|
const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how many bytes of data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). The limit is not strict, as we currently buffer more than just data blocks -- keys are buffered too. But it is a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 14:06:59 -08:00
|
|
|
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
std::string compression_dict_samples;
|
|
|
|
std::vector<size_t> compression_dict_sample_lens;
|
2021-02-22 17:41:11 -08:00
|
|
|
size_t buffer_idx = kInitSampleIdx;
|
|
|
|
for (size_t i = 0;
|
|
|
|
i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
|
|
|
|
++i) {
|
2021-04-23 12:44:11 -07:00
|
|
|
size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(),
|
|
|
|
r->data_block_buffers[buffer_idx].size());
|
|
|
|
compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0,
|
|
|
|
copy_len);
|
2021-02-22 17:41:11 -08:00
|
|
|
compression_dict_sample_lens.emplace_back(copy_len);
|
|
|
|
|
|
|
|
buffer_idx += kPrimeGeneratorRemainder;
|
|
|
|
if (buffer_idx >= kNumBlocksBuffered) {
|
|
|
|
buffer_idx -= kNumBlocksBuffered;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// final data block flushed, now we can generate dictionary from the samples.
|
|
|
|
// OK if compression_dict_samples is empty, we'll just get empty dictionary.
|
|
|
|
std::string dict;
|
|
|
|
if (r->compression_opts.zstd_max_train_bytes > 0) {
|
|
|
|
dict = ZSTD_TrainDictionary(compression_dict_samples,
|
|
|
|
compression_dict_sample_lens,
|
|
|
|
r->compression_opts.max_dict_bytes);
|
|
|
|
} else {
|
|
|
|
dict = std::move(compression_dict_samples);
|
|
|
|
}
|
|
|
|
r->compression_dict.reset(new CompressionDict(dict, r->compression_type,
|
|
|
|
r->compression_opts.level));
|
|
|
|
r->verify_dict.reset(new UncompressionDict(
|
|
|
|
dict, r->compression_type == kZSTD ||
|
|
|
|
r->compression_type == kZSTDNotFinalCompression));
|
|
|
|
|
2021-04-23 12:44:11 -07:00
|
|
|
auto get_iterator_for_block = [&r](size_t i) {
|
|
|
|
auto& data_block = r->data_block_buffers[i];
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
assert(!data_block.empty());
|
2021-04-23 12:44:11 -07:00
|
|
|
|
|
|
|
Block reader{BlockContents{data_block}};
|
|
|
|
DataBlockIter* iter = reader.NewDataIterator(
|
|
|
|
r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber);
|
|
|
|
|
|
|
|
iter->SeekToFirst();
|
|
|
|
assert(iter->Valid());
|
|
|
|
return std::unique_ptr<DataBlockIter>(iter);
|
|
|
|
};
|
|
|
|
|
|
|
|
std::unique_ptr<DataBlockIter> iter = nullptr, next_block_iter = nullptr;
|
|
|
|
|
|
|
|
for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) {
|
|
|
|
if (iter == nullptr) {
|
|
|
|
iter = get_iterator_for_block(i);
|
|
|
|
assert(iter != nullptr);
|
|
|
|
};
|
|
|
|
|
|
|
|
if (i + 1 < r->data_block_buffers.size()) {
|
|
|
|
next_block_iter = get_iterator_for_block(i + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto& data_block = r->data_block_buffers[i];
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
if (r->IsParallelCompressionEnabled()) {
|
|
|
|
Slice first_key_in_next_block;
|
|
|
|
const Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
|
2021-04-23 12:44:11 -07:00
|
|
|
if (i + 1 < r->data_block_buffers.size()) {
|
|
|
|
assert(next_block_iter != nullptr);
|
|
|
|
first_key_in_next_block = next_block_iter->key();
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
2020-10-22 11:03:10 -07:00
|
|
|
first_key_in_next_block_ptr = r->first_key_in_next_block;
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
|
|
|
|
2021-04-23 12:44:11 -07:00
|
|
|
std::vector<std::string> keys;
|
|
|
|
for (; iter->Valid(); iter->Next()) {
|
|
|
|
keys.emplace_back(iter->key().ToString());
|
|
|
|
}
|
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
|
|
|
|
r->compression_type, first_key_in_next_block_ptr, &data_block, &keys);
|
2021-04-23 12:44:11 -07:00
|
|
|
|
2020-10-22 11:03:10 -07:00
|
|
|
assert(block_rep != nullptr);
|
|
|
|
r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
|
|
|
|
r->get_offset());
|
|
|
|
r->pc_rep->EmitBlock(block_rep);
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
2021-04-23 12:44:11 -07:00
|
|
|
for (; iter->Valid(); iter->Next()) {
|
|
|
|
Slice key = iter->key();
|
2020-04-01 16:37:54 -07:00
|
|
|
if (r->filter_builder != nullptr) {
|
|
|
|
size_t ts_sz =
|
|
|
|
r->internal_comparator.user_comparator()->timestamp_size();
|
|
|
|
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz));
|
|
|
|
}
|
|
|
|
r->index_builder->OnKeyAdded(key);
|
|
|
|
}
|
2021-08-03 12:42:22 -07:00
|
|
|
WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
|
2021-04-23 12:44:11 -07:00
|
|
|
if (ok() && i + 1 < r->data_block_buffers.size()) {
|
|
|
|
assert(next_block_iter != nullptr);
|
|
|
|
Slice first_key_in_next_block = next_block_iter->key();
|
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
|
2021-04-23 12:44:11 -07:00
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
std::string last_key = iter->key().ToString();
|
|
|
|
r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr,
|
|
|
|
r->pending_handle);
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
|
|
|
}
|
2021-04-23 12:44:11 -07:00
|
|
|
std::swap(iter, next_block_iter);
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
2021-04-23 12:44:11 -07:00
|
|
|
r->data_block_buffers.clear();
|
2021-09-08 12:34:35 -07:00
|
|
|
r->data_begin_offset = 0;
|
2021-11-05 16:12:11 -07:00
|
|
|
// Release all reserved cache for data block buffers
|
2021-11-01 14:26:50 -07:00
|
|
|
if (r->compression_dict_buffer_cache_res_mgr != nullptr) {
|
|
|
|
Status s = r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation<
|
2021-09-08 12:34:35 -07:00
|
|
|
CacheEntryRole::kCompressionDictionaryBuildingBuffer>(
|
|
|
|
r->data_begin_offset);
|
|
|
|
s.PermitUncheckedError();
|
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
}
|
|
|
|
|
2018-07-20 09:00:33 -07:00
|
|
|
Status BlockBasedTableBuilder::Finish() {
|
|
|
|
Rep* r = rep_;
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
assert(r->state != Rep::State::kClosed);
|
2018-07-20 09:00:33 -07:00
|
|
|
bool empty_data_block = r->data_block.empty();
|
2020-04-01 16:37:54 -07:00
|
|
|
r->first_key_in_next_block = nullptr;
|
2018-07-20 09:00:33 -07:00
|
|
|
Flush();
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
if (r->state == Rep::State::kBuffered) {
|
|
|
|
EnterUnbuffered();
|
|
|
|
}
|
2020-10-22 11:03:10 -07:00
|
|
|
if (r->IsParallelCompressionEnabled()) {
|
|
|
|
StopParallelCompression();
|
2020-09-29 18:21:49 -07:00
|
|
|
#ifndef NDEBUG
|
|
|
|
for (const auto& br : r->pc_rep->block_rep_buf) {
|
|
|
|
assert(br.status.ok());
|
|
|
|
}
|
|
|
|
#endif // !NDEBUG
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
|
|
|
// To make sure properties block is able to keep the accurate size of index
|
|
|
|
// block, we will finish writing all index entries first.
|
|
|
|
if (ok() && !empty_data_block) {
|
|
|
|
r->index_builder->AddIndexEntry(
|
|
|
|
&r->last_key, nullptr /* no next data block */, r->pending_handle);
|
|
|
|
}
|
2018-07-20 09:00:33 -07:00
|
|
|
}
|
|
|
|
|
2018-12-07 13:15:09 -08:00
|
|
|
// Write meta blocks, metaindex block and footer in the following order.
|
2018-07-20 09:00:33 -07:00
|
|
|
// 1. [meta block: filter]
|
|
|
|
// 2. [meta block: index]
|
|
|
|
// 3. [meta block: compression dictionary]
|
|
|
|
// 4. [meta block: range deletion tombstone]
|
|
|
|
// 5. [meta block: properties]
|
|
|
|
// 6. [metaindex block]
|
2018-12-07 13:15:09 -08:00
|
|
|
// 7. Footer
|
2018-07-20 09:00:33 -07:00
|
|
|
BlockHandle metaindex_block_handle, index_block_handle;
|
|
|
|
MetaIndexBuilder meta_index_builder;
|
|
|
|
WriteFilterBlock(&meta_index_builder);
|
|
|
|
WriteIndexBlock(&meta_index_builder, &index_block_handle);
|
|
|
|
WriteCompressionDictBlock(&meta_index_builder);
|
|
|
|
WriteRangeDelBlock(&meta_index_builder);
|
|
|
|
WritePropertiesBlock(&meta_index_builder);
|
|
|
|
if (ok()) {
|
|
|
|
// flush the meta index block
|
|
|
|
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
|
2021-08-03 12:42:22 -07:00
|
|
|
&metaindex_block_handle, BlockType::kMetaIndex);
|
2018-07-20 09:00:33 -07:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
if (ok()) {
|
2018-12-07 13:15:09 -08:00
|
|
|
WriteFooter(metaindex_block_handle, index_block_handle);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
r->state = Rep::State::kClosed;
|
2020-10-22 11:03:10 -07:00
|
|
|
r->SetStatus(r->CopyIOStatus());
|
|
|
|
Status ret_status = r->CopyStatus();
|
2020-09-29 09:47:33 -07:00
|
|
|
assert(!ret_status.ok() || io_status().ok());
|
|
|
|
return ret_status;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTableBuilder::Abandon() {
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
assert(rep_->state != Rep::State::kClosed);
|
2020-10-22 11:03:10 -07:00
|
|
|
if (rep_->IsParallelCompressionEnabled()) {
|
|
|
|
StopParallelCompression();
|
2020-04-01 16:37:54 -07:00
|
|
|
}
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
rep_->state = Rep::State::kClosed;
|
2020-10-22 11:03:10 -07:00
|
|
|
rep_->CopyStatus().PermitUncheckedError();
|
|
|
|
rep_->CopyIOStatus().PermitUncheckedError();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
uint64_t BlockBasedTableBuilder::NumEntries() const {
|
2013-11-19 16:29:42 -08:00
|
|
|
return rep_->props.num_entries;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2020-04-02 16:13:44 -07:00
|
|
|
bool BlockBasedTableBuilder::IsEmpty() const {
|
|
|
|
return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0;
|
|
|
|
}
|
|
|
|
|
Reduce scope of compression dictionary to single SST (#4952)
Summary:
Our previous approach was to train one compression dictionary per compaction, using the first output SST to train a dictionary, and then applying it on subsequent SSTs in the same compaction. While this was great for minimizing CPU/memory/I/O overhead, it did not achieve good compression ratios in practice. In our most promising potential use case, moderate reductions in a dictionary's scope make a major difference on compression ratio.
So, this PR changes compression dictionary to be scoped per-SST. It accepts the tradeoff during table building to use more memory and CPU. Important changes include:
- The `BlockBasedTableBuilder` has a new state when dictionary compression is in-use: `kBuffered`. In that state it accumulates uncompressed data in-memory whenever `Add` is called.
- After accumulating target file size bytes or calling `BlockBasedTableBuilder::Finish`, a `BlockBasedTableBuilder` moves to the `kUnbuffered` state. The transition (`EnterUnbuffered()`) involves sampling the buffered data, training a dictionary, and compressing/writing out all buffered data. In the `kUnbuffered` state, a `BlockBasedTableBuilder` behaves the same as before -- blocks are compressed/written out as soon as they fill up.
- Samples are now whole uncompressed data blocks, except the final sample may be a partial data block so we don't breach the user's configured `max_dict_bytes` or `zstd_max_train_bytes`. The dictionary trainer is supposed to work better when we pass it real units of compression. Previously we were passing 64-byte KV samples which was not realistic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4952
Differential Revision: D13967980
Pulled By: ajkr
fbshipit-source-id: 82bea6f7537e1529c7a1a4cdee84585f5949300f
2019-02-11 19:42:25 -08:00
|
|
|
// Returns the builder's current offset, which is reported as the file size.
uint64_t BlockBasedTableBuilder::FileSize() const {
  const uint64_t current_offset = rep_->offset;
  return current_offset;
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-04-01 16:37:54 -07:00
|
|
|
uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
|
2020-10-22 11:03:10 -07:00
|
|
|
if (rep_->IsParallelCompressionEnabled()) {
|
2020-04-01 16:37:54 -07:00
|
|
|
// Use compression ratio so far and inflight raw bytes to estimate
|
|
|
|
// final SST size.
|
2020-10-22 11:03:10 -07:00
|
|
|
return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize();
|
2020-04-01 16:37:54 -07:00
|
|
|
} else {
|
|
|
|
return FileSize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-06-04 12:03:40 -07:00
|
|
|
bool BlockBasedTableBuilder::NeedCompact() const {
|
|
|
|
for (const auto& collector : rep_->table_properties_collectors) {
|
|
|
|
if (collector->NeedCompact()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Add more table properties to EventLogger
Summary:
Example output:
{"time_micros": 1431463794310521, "job": 353, "event": "table_file_creation", "file_number": 387, "file_size": 86937, "table_info": {"data_size": "81801", "index_size": "9751", "filter_size": "0", "raw_key_size": "23448", "raw_average_key_size": "24.000000", "raw_value_size": "990571", "raw_average_value_size": "1013.890481", "num_data_blocks": "245", "num_entries": "977", "filter_policy_name": "", "kDeletedKeys": "0"}}
Also fixed a bug where BuildTable() in recovery was passing Env::IOHigh argument into paranoid_checks_file parameter.
Test Plan: make check + check out the output in the log
Reviewers: sdong, rven, yhchiang
Reviewed By: yhchiang
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D38343
2015-05-12 15:53:55 -07:00
|
|
|
// Returns a copy of the builder's table properties, augmented with what each
// registered properties collector reports: its readable properties and the
// user-collected properties produced by its Finish() call.
TableProperties BlockBasedTableBuilder::GetTableProperties() const {
  TableProperties result = rep_->props;
  for (const auto& props_collector : rep_->table_properties_collectors) {
    // Merge the readable properties first; keys that are already present are
    // left untouched, as with element-wise insert().
    const auto readable = props_collector->GetReadableProperties();
    result.readable_properties.insert(readable.begin(), readable.end());

    // The status returned by Finish() is intentionally not inspected here.
    props_collector->Finish(&result.user_collected_properties)
        .PermitUncheckedError();
  }
  return result;
}
|
|
|
|
|
2020-03-29 15:57:02 -07:00
|
|
|
std::string BlockBasedTableBuilder::GetFileChecksum() const {
|
|
|
|
if (rep_->file != nullptr) {
|
|
|
|
return rep_->file->GetFileChecksum();
|
|
|
|
} else {
|
|
|
|
return kUnknownFileChecksum;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-02-10 15:42:46 -08:00
|
|
|
const char* BlockBasedTableBuilder::GetFileChecksumFuncName() const {
|
|
|
|
if (rep_->file != nullptr) {
|
|
|
|
return rep_->file->GetFileChecksumFuncName();
|
|
|
|
} else {
|
2020-06-07 21:54:54 -07:00
|
|
|
return kUnknownFileChecksumFuncName;
|
2020-02-10 15:42:46 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-15 14:09:03 -07:00
|
|
|
// Definitions of the BlockBasedTable string constants that carry the name
// prefixes for the three filter block flavors.
// NOTE(review): presumably these prefixes are combined with a filter policy
// name to form meta block names -- confirm against the readers of these
// constants.
const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
    "partitionedfilter.";
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|