2016-02-09 15:12:00 -08:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 16:03:42 -07:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 14:59:46 -07:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
2020-04-21 17:35:28 -07:00
|
|
|
|
2016-08-12 16:34:11 -07:00
|
|
|
#include <algorithm>
|
2018-07-27 16:00:26 -07:00
|
|
|
#include <array>
|
2022-05-17 15:01:51 -07:00
|
|
|
#include <cstdint>
|
2016-08-12 16:34:11 -07:00
|
|
|
#include <limits>
|
2022-04-06 10:33:00 -07:00
|
|
|
#include <memory>
|
2014-02-28 18:19:07 -08:00
|
|
|
#include <string>
|
2022-03-23 10:00:54 -07:00
|
|
|
#include <unordered_set>
|
2014-02-28 18:19:07 -08:00
|
|
|
#include <utility>
|
2016-08-01 14:50:19 -07:00
|
|
|
#include <vector>
|
2014-02-28 18:19:07 -08:00
|
|
|
|
Use deleters to label cache entries and collect stats (#8297)
Summary:
This change gathers and publishes statistics about the
kinds of items in block cache. This is especially important for
profiling relative usage of cache by index vs. filter vs. data blocks.
It works by iterating over the cache during periodic stats dump
(InternalStats, stats_dump_period_sec) or on demand when
DB::Get(Map)Property(kBlockCacheEntryStats), except that for
efficiency and sharing among column families, saved data from
the last scan is used when the data is not considered too old.
The new information can be seen in info LOG, for example:
Block cache LRUCache@0x7fca62229330 capacity: 95.37 MB collections: 8 last_copies: 0 last_secs: 0.00178 secs_since: 0
Block cache entry stats(count,size,portion): DataBlock(7092,28.24 MB,29.6136%) FilterBlock(215,867.90 KB,0.888728%) FilterMetaBlock(2,5.31 KB,0.00544%) IndexBlock(217,180.11 KB,0.184432%) WriteBuffer(1,256.00 KB,0.262144%) Misc(1,0.00 KB,0%)
And also through DB::GetProperty and GetMapProperty (here using
ldb just for demonstration):
$ ./ldb --db=/dev/shm/dbbench/ get_property rocksdb.block-cache-entry-stats
rocksdb.block-cache-entry-stats.bytes.data-block: 0
rocksdb.block-cache-entry-stats.bytes.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-meta-block: 0
rocksdb.block-cache-entry-stats.bytes.index-block: 178992
rocksdb.block-cache-entry-stats.bytes.misc: 0
rocksdb.block-cache-entry-stats.bytes.other-block: 0
rocksdb.block-cache-entry-stats.bytes.write-buffer: 0
rocksdb.block-cache-entry-stats.capacity: 8388608
rocksdb.block-cache-entry-stats.count.data-block: 0
rocksdb.block-cache-entry-stats.count.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-meta-block: 0
rocksdb.block-cache-entry-stats.count.index-block: 215
rocksdb.block-cache-entry-stats.count.misc: 1
rocksdb.block-cache-entry-stats.count.other-block: 0
rocksdb.block-cache-entry-stats.count.write-buffer: 0
rocksdb.block-cache-entry-stats.id: LRUCache@0x7f3636661290
rocksdb.block-cache-entry-stats.percent.data-block: 0.000000
rocksdb.block-cache-entry-stats.percent.deprecated-filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-meta-block: 0.000000
rocksdb.block-cache-entry-stats.percent.index-block: 2.133751
rocksdb.block-cache-entry-stats.percent.misc: 0.000000
rocksdb.block-cache-entry-stats.percent.other-block: 0.000000
rocksdb.block-cache-entry-stats.percent.write-buffer: 0.000000
rocksdb.block-cache-entry-stats.secs_for_last_collection: 0.000052
rocksdb.block-cache-entry-stats.secs_since_last_collection: 0
Solution detail - We need some way to flag what kind of blocks each
entry belongs to, preferably without changing the Cache API.
One of the complications is that Cache is a general interface that could
have other users that don't adhere to whichever convention we decide
on for keys and values. Or we would pay for an extra field in the Handle
that would only be used for this purpose.
This change uses a back-door approach, the deleter, to indicate the
"role" of a Cache entry (in addition to the value type, implicitly).
This has the added benefit of ensuring proper code origin whenever we
recognize a particular role for a cache entry; if the entry came from
some other part of the code, it will use an unrecognized deleter, which
we simply attribute to the "Misc" role.
An internal API makes for simple instantiation and automatic
registration of Cache deleters for a given value type and "role".
Another internal API, CacheEntryStatsCollector, solves the problem of
caching the results of a scan and sharing them, to ensure scans are
neither excessive nor redundant so as not to harm Cache performance.
Because code is added to BlocklikeTraits, it is pulled out of
block_based_table_reader.cc into its own file.
This is a reformulation of https://github.com/facebook/rocksdb/issues/8276, without the type checking option
(could still be added), and with actual stat gathering.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8297
Test Plan: manual testing with db_bench, and a couple of basic unit tests
Reviewed By: ltamasi
Differential Revision: D28488721
Pulled By: pdillinger
fbshipit-source-id: 472f524a9691b5afb107934be2d41d84f2b129fb
2021-05-19 16:45:51 -07:00
|
|
|
#include "cache/cache_entry_roles.h"
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
#include "cache/cache_key.h"
|
2020-04-27 13:18:18 -07:00
|
|
|
#include "cache/sharded_cache.h"
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
#include "db/compaction/compaction_picker.h"
|
2013-08-13 14:04:56 -07:00
|
|
|
#include "db/dbformat.h"
|
Introduce FullMergeV2 (eliminate memcpy from merge operators)
Summary:
This diff update the code to pin the merge operator operands while the merge operation is done, so that we can eliminate the memcpy cost, to do that we need a new public API for FullMerge that replace the std::deque<std::string> with std::vector<Slice>
This diff is stacked on top of D56493 and D56511
In this diff we
- Update FullMergeV2 arguments to be encapsulated in MergeOperationInput and MergeOperationOutput which will make it easier to add new arguments in the future
- Replace std::deque<std::string> with std::vector<Slice> to pass operands
- Replace MergeContext std::deque with std::vector (based on a simple benchmark I ran https://gist.github.com/IslamAbdelRahman/78fc86c9ab9f52b1df791e58943fb187)
- Allow FullMergeV2 output to be an existing operand
```
[Everything in Memtable | 10K operands | 10 KB each | 1 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=10000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 0.607 micros/op 1648235 ops/sec; 16121.2 MB/s
readseq : 0.478 micros/op 2091546 ops/sec; 20457.2 MB/s
readseq : 0.252 micros/op 3972081 ops/sec; 38850.5 MB/s
readseq : 0.237 micros/op 4218328 ops/sec; 41259.0 MB/s
readseq : 0.247 micros/op 4043927 ops/sec; 39553.2 MB/s
[master]
readseq : 3.935 micros/op 254140 ops/sec; 2485.7 MB/s
readseq : 3.722 micros/op 268657 ops/sec; 2627.7 MB/s
readseq : 3.149 micros/op 317605 ops/sec; 3106.5 MB/s
readseq : 3.125 micros/op 320024 ops/sec; 3130.1 MB/s
readseq : 4.075 micros/op 245374 ops/sec; 2400.0 MB/s
```
```
[Everything in Memtable | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="mergerandom,readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --merge_keys=1000 --num=10000 --disable_auto_compactions --value_size=10240 --write_buffer_size=1000000000
[FullMergeV2]
readseq : 3.472 micros/op 288018 ops/sec; 2817.1 MB/s
readseq : 2.304 micros/op 434027 ops/sec; 4245.2 MB/s
readseq : 1.163 micros/op 859845 ops/sec; 8410.0 MB/s
readseq : 1.192 micros/op 838926 ops/sec; 8205.4 MB/s
readseq : 1.250 micros/op 800000 ops/sec; 7824.7 MB/s
[master]
readseq : 24.025 micros/op 41623 ops/sec; 407.1 MB/s
readseq : 18.489 micros/op 54086 ops/sec; 529.0 MB/s
readseq : 18.693 micros/op 53495 ops/sec; 523.2 MB/s
readseq : 23.621 micros/op 42335 ops/sec; 414.1 MB/s
readseq : 18.775 micros/op 53262 ops/sec; 521.0 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 1 operand per key]
[FullMergeV2]
$ DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
readseq : 14.741 micros/op 67837 ops/sec; 663.5 MB/s
readseq : 1.029 micros/op 971446 ops/sec; 9501.6 MB/s
readseq : 0.974 micros/op 1026229 ops/sec; 10037.4 MB/s
readseq : 0.965 micros/op 1036080 ops/sec; 10133.8 MB/s
readseq : 0.943 micros/op 1060657 ops/sec; 10374.2 MB/s
[master]
readseq : 16.735 micros/op 59755 ops/sec; 584.5 MB/s
readseq : 3.029 micros/op 330151 ops/sec; 3229.2 MB/s
readseq : 3.136 micros/op 318883 ops/sec; 3119.0 MB/s
readseq : 3.065 micros/op 326245 ops/sec; 3191.0 MB/s
readseq : 3.014 micros/op 331813 ops/sec; 3245.4 MB/s
```
```
[Everything in Block cache | 10K operands | 10 KB each | 10 operand per key]
DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readseq,readseq,readseq,readseq,readseq" --merge_operator="max" --num=100000 --db="/dev/shm/merge-random-10-operands-10K-10KB" --cache_size=1000000000 --use_existing_db --disable_auto_compactions
[FullMergeV2]
readseq : 24.325 micros/op 41109 ops/sec; 402.1 MB/s
readseq : 1.470 micros/op 680272 ops/sec; 6653.7 MB/s
readseq : 1.231 micros/op 812347 ops/sec; 7945.5 MB/s
readseq : 1.091 micros/op 916590 ops/sec; 8965.1 MB/s
readseq : 1.109 micros/op 901713 ops/sec; 8819.6 MB/s
[master]
readseq : 27.257 micros/op 36687 ops/sec; 358.8 MB/s
readseq : 4.443 micros/op 225073 ops/sec; 2201.4 MB/s
readseq : 5.830 micros/op 171526 ops/sec; 1677.7 MB/s
readseq : 4.173 micros/op 239635 ops/sec; 2343.8 MB/s
readseq : 4.150 micros/op 240963 ops/sec; 2356.8 MB/s
```
Test Plan: COMPILE_WITH_ASAN=1 make check -j64
Reviewers: yhchiang, andrewkr, sdong
Reviewed By: sdong
Subscribers: lovro, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D57075
2016-07-20 09:49:03 -07:00
|
|
|
#include "db/pinned_iterators_manager.h"
|
2019-09-16 10:31:27 -07:00
|
|
|
#include "file/file_prefetch_buffer.h"
|
2020-04-30 14:48:51 -07:00
|
|
|
#include "file/file_util.h"
|
2019-09-16 10:31:27 -07:00
|
|
|
#include "file/random_access_file_reader.h"
|
2021-09-29 04:01:57 -07:00
|
|
|
#include "logging/logging.h"
|
2020-04-21 17:35:28 -07:00
|
|
|
#include "monitoring/perf_context_imp.h"
|
2021-01-25 22:07:26 -08:00
|
|
|
#include "port/lang.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "rocksdb/cache.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/comparator.h"
|
2021-09-27 07:42:36 -07:00
|
|
|
#include "rocksdb/convenience.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/env.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
#include "rocksdb/file_system.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/filter_policy.h"
|
2014-02-28 18:19:07 -08:00
|
|
|
#include "rocksdb/iterator.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/options.h"
|
2021-09-29 04:01:57 -07:00
|
|
|
#include "rocksdb/snapshot.h"
|
2013-08-23 08:38:13 -07:00
|
|
|
#include "rocksdb/statistics.h"
|
2021-01-25 22:07:26 -08:00
|
|
|
#include "rocksdb/system_clock.h"
|
2013-10-28 17:54:09 -07:00
|
|
|
#include "rocksdb/table.h"
|
2014-04-21 17:49:47 -07:00
|
|
|
#include "rocksdb/table_properties.h"
|
2021-08-11 19:31:44 -07:00
|
|
|
#include "rocksdb/trace_record.h"
|
2020-03-12 21:39:36 -07:00
|
|
|
#include "table/block_based/binary_search_index_reader.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include "table/block_based/block_based_filter_block.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
2020-03-12 21:39:36 -07:00
|
|
|
#include "table/block_based/block_based_table_iterator.h"
|
Use deleters to label cache entries and collect stats (#8297)
Summary:
This change gathers and publishes statistics about the
kinds of items in block cache. This is especially important for
profiling relative usage of cache by index vs. filter vs. data blocks.
It works by iterating over the cache during periodic stats dump
(InternalStats, stats_dump_period_sec) or on demand when
DB::Get(Map)Property(kBlockCacheEntryStats), except that for
efficiency and sharing among column families, saved data from
the last scan is used when the data is not considered too old.
The new information can be seen in info LOG, for example:
Block cache LRUCache@0x7fca62229330 capacity: 95.37 MB collections: 8 last_copies: 0 last_secs: 0.00178 secs_since: 0
Block cache entry stats(count,size,portion): DataBlock(7092,28.24 MB,29.6136%) FilterBlock(215,867.90 KB,0.888728%) FilterMetaBlock(2,5.31 KB,0.00544%) IndexBlock(217,180.11 KB,0.184432%) WriteBuffer(1,256.00 KB,0.262144%) Misc(1,0.00 KB,0%)
And also through DB::GetProperty and GetMapProperty (here using
ldb just for demonstration):
$ ./ldb --db=/dev/shm/dbbench/ get_property rocksdb.block-cache-entry-stats
rocksdb.block-cache-entry-stats.bytes.data-block: 0
rocksdb.block-cache-entry-stats.bytes.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-meta-block: 0
rocksdb.block-cache-entry-stats.bytes.index-block: 178992
rocksdb.block-cache-entry-stats.bytes.misc: 0
rocksdb.block-cache-entry-stats.bytes.other-block: 0
rocksdb.block-cache-entry-stats.bytes.write-buffer: 0
rocksdb.block-cache-entry-stats.capacity: 8388608
rocksdb.block-cache-entry-stats.count.data-block: 0
rocksdb.block-cache-entry-stats.count.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-meta-block: 0
rocksdb.block-cache-entry-stats.count.index-block: 215
rocksdb.block-cache-entry-stats.count.misc: 1
rocksdb.block-cache-entry-stats.count.other-block: 0
rocksdb.block-cache-entry-stats.count.write-buffer: 0
rocksdb.block-cache-entry-stats.id: LRUCache@0x7f3636661290
rocksdb.block-cache-entry-stats.percent.data-block: 0.000000
rocksdb.block-cache-entry-stats.percent.deprecated-filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-meta-block: 0.000000
rocksdb.block-cache-entry-stats.percent.index-block: 2.133751
rocksdb.block-cache-entry-stats.percent.misc: 0.000000
rocksdb.block-cache-entry-stats.percent.other-block: 0.000000
rocksdb.block-cache-entry-stats.percent.write-buffer: 0.000000
rocksdb.block-cache-entry-stats.secs_for_last_collection: 0.000052
rocksdb.block-cache-entry-stats.secs_since_last_collection: 0
Solution detail - We need some way to flag what kind of blocks each
entry belongs to, preferably without changing the Cache API.
One of the complications is that Cache is a general interface that could
have other users that don't adhere to whichever convention we decide
on for keys and values. Or we would pay for an extra field in the Handle
that would only be used for this purpose.
This change uses a back-door approach, the deleter, to indicate the
"role" of a Cache entry (in addition to the value type, implicitly).
This has the added benefit of ensuring proper code origin whenever we
recognize a particular role for a cache entry; if the entry came from
some other part of the code, it will use an unrecognized deleter, which
we simply attribute to the "Misc" role.
An internal API makes for simple instantiation and automatic
registration of Cache deleters for a given value type and "role".
Another internal API, CacheEntryStatsCollector, solves the problem of
caching the results of a scan and sharing them, to ensure scans are
neither excessive nor redundant so as not to harm Cache performance.
Because code is added to BlocklikeTraits, it is pulled out of
block_based_table_reader.cc into its own file.
This is a reformulation of https://github.com/facebook/rocksdb/issues/8276, without the type checking option
(could still be added), and with actual stat gathering.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8297
Test Plan: manual testing with db_bench, and a couple of basic unit tests
Reviewed By: ltamasi
Differential Revision: D28488721
Pulled By: pdillinger
fbshipit-source-id: 472f524a9691b5afb107934be2d41d84f2b129fb
2021-05-19 16:45:51 -07:00
|
|
|
#include "table/block_based/block_like_traits.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/block_prefix_index.h"
|
Use deleters to label cache entries and collect stats (#8297)
Summary:
This change gathers and publishes statistics about the
kinds of items in block cache. This is especially important for
profiling relative usage of cache by index vs. filter vs. data blocks.
It works by iterating over the cache during periodic stats dump
(InternalStats, stats_dump_period_sec) or on demand when
DB::Get(Map)Property(kBlockCacheEntryStats), except that for
efficiency and sharing among column families, saved data from
the last scan is used when the data is not considered too old.
The new information can be seen in info LOG, for example:
Block cache LRUCache@0x7fca62229330 capacity: 95.37 MB collections: 8 last_copies: 0 last_secs: 0.00178 secs_since: 0
Block cache entry stats(count,size,portion): DataBlock(7092,28.24 MB,29.6136%) FilterBlock(215,867.90 KB,0.888728%) FilterMetaBlock(2,5.31 KB,0.00544%) IndexBlock(217,180.11 KB,0.184432%) WriteBuffer(1,256.00 KB,0.262144%) Misc(1,0.00 KB,0%)
And also through DB::GetProperty and GetMapProperty (here using
ldb just for demonstration):
$ ./ldb --db=/dev/shm/dbbench/ get_property rocksdb.block-cache-entry-stats
rocksdb.block-cache-entry-stats.bytes.data-block: 0
rocksdb.block-cache-entry-stats.bytes.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-block: 0
rocksdb.block-cache-entry-stats.bytes.filter-meta-block: 0
rocksdb.block-cache-entry-stats.bytes.index-block: 178992
rocksdb.block-cache-entry-stats.bytes.misc: 0
rocksdb.block-cache-entry-stats.bytes.other-block: 0
rocksdb.block-cache-entry-stats.bytes.write-buffer: 0
rocksdb.block-cache-entry-stats.capacity: 8388608
rocksdb.block-cache-entry-stats.count.data-block: 0
rocksdb.block-cache-entry-stats.count.deprecated-filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-block: 0
rocksdb.block-cache-entry-stats.count.filter-meta-block: 0
rocksdb.block-cache-entry-stats.count.index-block: 215
rocksdb.block-cache-entry-stats.count.misc: 1
rocksdb.block-cache-entry-stats.count.other-block: 0
rocksdb.block-cache-entry-stats.count.write-buffer: 0
rocksdb.block-cache-entry-stats.id: LRUCache@0x7f3636661290
rocksdb.block-cache-entry-stats.percent.data-block: 0.000000
rocksdb.block-cache-entry-stats.percent.deprecated-filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-block: 0.000000
rocksdb.block-cache-entry-stats.percent.filter-meta-block: 0.000000
rocksdb.block-cache-entry-stats.percent.index-block: 2.133751
rocksdb.block-cache-entry-stats.percent.misc: 0.000000
rocksdb.block-cache-entry-stats.percent.other-block: 0.000000
rocksdb.block-cache-entry-stats.percent.write-buffer: 0.000000
rocksdb.block-cache-entry-stats.secs_for_last_collection: 0.000052
rocksdb.block-cache-entry-stats.secs_since_last_collection: 0
Solution detail - We need some way to flag what kind of blocks each
entry belongs to, preferably without changing the Cache API.
One of the complications is that Cache is a general interface that could
have other users that don't adhere to whichever convention we decide
on for keys and values. Or we would pay for an extra field in the Handle
that would only be used for this purpose.
This change uses a back-door approach, the deleter, to indicate the
"role" of a Cache entry (in addition to the value type, implicitly).
This has the added benefit of ensuring proper code origin whenever we
recognize a particular role for a cache entry; if the entry came from
some other part of the code, it will use an unrecognized deleter, which
we simply attribute to the "Misc" role.
An internal API makes for simple instantiation and automatic
registration of Cache deleters for a given value type and "role".
Another internal API, CacheEntryStatsCollector, solves the problem of
caching the results of a scan and sharing them, to ensure scans are
neither excessive nor redundant so as not to harm Cache performance.
Because code is added to BlocklikeTraits, it is pulled out of
block_based_table_reader.cc into its own file.
This is a reformulation of https://github.com/facebook/rocksdb/issues/8276, without the type checking option
(could still be added), and with actual stat gathering.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8297
Test Plan: manual testing with db_bench, and a couple of basic unit tests
Reviewed By: ltamasi
Differential Revision: D28488721
Pulled By: pdillinger
fbshipit-source-id: 472f524a9691b5afb107934be2d41d84f2b129fb
2021-05-19 16:45:51 -07:00
|
|
|
#include "table/block_based/block_type.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/filter_block.h"
|
2022-03-23 10:00:54 -07:00
|
|
|
#include "table/block_based/filter_policy_internal.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/full_filter_block.h"
|
2020-03-12 21:39:36 -07:00
|
|
|
#include "table/block_based/hash_index_reader.h"
|
2019-05-30 14:47:29 -07:00
|
|
|
#include "table/block_based/partitioned_filter_block.h"
|
2020-03-12 21:39:36 -07:00
|
|
|
#include "table/block_based/partitioned_index_reader.h"
|
2019-06-03 12:31:45 -07:00
|
|
|
#include "table/block_fetcher.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "table/format.h"
|
2015-12-15 18:20:10 -08:00
|
|
|
#include "table/get_context.h"
|
2015-10-12 15:06:38 -07:00
|
|
|
#include "table/internal_iterator.h"
|
2019-06-03 12:31:45 -07:00
|
|
|
#include "table/meta_blocks.h"
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 14:24:09 -07:00
|
|
|
#include "table/multiget_context.h"
|
2015-12-15 18:20:10 -08:00
|
|
|
#include "table/persistent_cache_helper.h"
|
2021-08-16 20:36:19 -07:00
|
|
|
#include "table/persistent_cache_options.h"
|
2016-10-18 16:59:37 -07:00
|
|
|
#include "table/sst_file_writer_collectors.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "table/two_level_iterator.h"
|
2019-05-30 17:39:43 -07:00
|
|
|
#include "test_util/sync_point.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "util/coding.h"
|
2019-02-11 11:37:07 -08:00
|
|
|
#include "util/crc32c.h"
|
2013-06-13 17:25:09 -07:00
|
|
|
#include "util/stop_watch.h"
|
2014-11-24 20:44:49 -08:00
|
|
|
#include "util/string_util.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 15:36:27 -07:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
|
|
|
|
CacheAllocationPtr heap_buf;
|
|
|
|
heap_buf = AllocateBlock(buf.size(), allocator);
|
|
|
|
memcpy(heap_buf.get(), buf.data(), buf.size());
|
|
|
|
return heap_buf;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
// Generate the regular and coroutine versions of some methods by
|
|
|
|
// including block_based_table_reader_sync_and_async.h twice
|
|
|
|
// Macros in the header will expand differently based on whether
|
|
|
|
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
|
|
|
|
// clang-format off
|
|
|
|
#define WITHOUT_COROUTINES
|
|
|
|
#include "table/block_based/block_based_table_reader_sync_and_async.h"
|
|
|
|
#undef WITHOUT_COROUTINES
|
|
|
|
#define WITH_COROUTINES
|
|
|
|
#include "table/block_based/block_based_table_reader_sync_and_async.h"
|
|
|
|
// clang-format on
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-05-01 14:09:32 -04:00
|
|
|
extern const uint64_t kBlockBasedTableMagicNumber;
|
2014-05-15 14:09:03 -07:00
|
|
|
extern const std::string kHashIndexPrefixesBlock;
|
|
|
|
extern const std::string kHashIndexPrefixesMetadataBlock;
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2017-03-03 18:09:43 -08:00
|
|
|
BlockBasedTable::~BlockBasedTable() {
|
|
|
|
delete rep_;
|
|
|
|
}
|
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
namespace {
|
|
|
|
// Read the block identified by "handle" from "file".
|
|
|
|
// The only relevant option is options.verify_checksums for now.
|
|
|
|
// On failure return non-OK.
|
|
|
|
// On success fill *result and return OK - caller owns *result
|
2019-01-23 18:11:08 -08:00
|
|
|
// @param uncompression_dict Data for presetting the compression library's
|
Shared dictionary compression using reference block
Summary:
This adds a new metablock containing a shared dictionary that is used
to compress all data blocks in the SST file. The size of the shared dictionary
is configurable in CompressionOptions and defaults to 0. It's currently only
used for zlib/lz4/lz4hc, but the block will be stored in the SST regardless of
the compression type if the user chooses a nonzero dictionary size.
During compaction, computes the dictionary by randomly sampling the first
output file in each subcompaction. It pre-computes the intervals to sample
by assuming the output file will have the maximum allowable length. In case
the file is smaller, some of the pre-computed sampling intervals can be beyond
end-of-file, in which case we skip over those samples and the dictionary will
be a bit smaller. After the dictionary is generated using the first file in a
subcompaction, it is loaded into the compression library before writing each
block in each subsequent file of that subcompaction.
On the read path, gets the dictionary from the metablock, if it exists. Then,
loads that dictionary into the compression library before reading each block.
Test Plan: new unit test
Reviewers: yhchiang, IslamAbdelRahman, cyan, sdong
Reviewed By: sdong
Subscribers: andrewkr, yoshinorim, kradhakrishnan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D52287
2016-04-27 17:36:03 -07:00
|
|
|
// dictionary.
|
2019-08-23 08:25:52 -07:00
|
|
|
template <typename TBlocklike>
|
2017-08-11 11:59:13 -07:00
|
|
|
Status ReadBlockFromFile(
|
|
|
|
RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
|
|
|
|
const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
|
2021-05-05 13:59:21 -07:00
|
|
|
std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions,
|
2019-06-18 19:00:03 -07:00
|
|
|
bool do_uncompress, bool maybe_compressed, BlockType block_type,
|
2019-01-23 18:11:08 -08:00
|
|
|
const UncompressionDict& uncompression_dict,
|
2020-02-25 15:29:17 -08:00
|
|
|
const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit,
|
|
|
|
MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd,
|
|
|
|
const FilterPolicy* filter_policy) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
assert(result);
|
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
BlockContents contents;
|
2019-06-19 14:07:36 -07:00
|
|
|
BlockFetcher block_fetcher(
|
|
|
|
file, prefetch_buffer, footer, options, handle, &contents, ioptions,
|
|
|
|
do_uncompress, maybe_compressed, block_type, uncompression_dict,
|
|
|
|
cache_options, memory_allocator, nullptr, for_compaction);
|
2017-12-11 15:16:37 -08:00
|
|
|
Status s = block_fetcher.ReadBlockContents();
|
2014-02-28 18:19:07 -08:00
|
|
|
if (s.ok()) {
|
2019-08-23 08:25:52 -07:00
|
|
|
result->reset(BlocklikeTraits<TBlocklike>::Create(
|
2021-04-26 12:43:02 -07:00
|
|
|
std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd,
|
|
|
|
filter_policy));
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-01-31 19:45:17 -08:00
|
|
|
// For hash based index, return false if table_properties->prefix_extractor_name
|
|
|
|
// and prefix_extractor both exist and match, otherwise true.
|
2022-01-21 11:36:36 -08:00
|
|
|
inline bool PrefixExtractorChangedHelper(
|
|
|
|
const TableProperties* table_properties,
|
|
|
|
const SliceTransform* prefix_extractor) {
|
2018-05-21 14:33:55 -07:00
|
|
|
// BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
|
|
|
|
// Turn off hash index in prefix_extractor is not set; if prefix_extractor
|
|
|
|
// is set but prefix_extractor_block is not set, also disable hash index
|
2018-05-22 13:51:21 -07:00
|
|
|
if (prefix_extractor == nullptr || table_properties == nullptr ||
|
|
|
|
table_properties->prefix_extractor_name.empty()) {
|
2018-05-21 14:33:55 -07:00
|
|
|
return true;
|
|
|
|
}
|
2018-05-22 13:51:21 -07:00
|
|
|
|
2018-05-21 14:33:55 -07:00
|
|
|
// prefix_extractor and prefix_extractor_block are both non-empty
|
2021-09-27 07:42:36 -07:00
|
|
|
if (table_properties->prefix_extractor_name != prefix_extractor->AsString()) {
|
2018-05-21 14:33:55 -07:00
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
} // namespace
|
|
|
|
|
2019-06-06 11:28:54 -07:00
|
|
|
void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type,
|
|
|
|
GetContext* get_context,
|
|
|
|
size_t usage) const {
|
2021-04-26 12:43:02 -07:00
|
|
|
Statistics* const statistics = rep_->ioptions.stats;
|
2019-06-06 11:28:54 -07:00
|
|
|
|
|
|
|
PERF_COUNTER_ADD(block_cache_hit_count, 1);
|
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
|
|
|
|
static_cast<uint32_t>(rep_->level));
|
|
|
|
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_hit;
|
|
|
|
get_context->get_context_stats_.num_cache_bytes_read += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_HIT);
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage);
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kFilter:
|
|
|
|
PERF_COUNTER_ADD(block_cache_filter_hit_count, 1);
|
|
|
|
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_filter_hit;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_FILTER_HIT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kCompressionDictionary:
|
|
|
|
// TODO: introduce perf counter for compression dictionary hit count
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_compression_dict_hit;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kIndex:
|
|
|
|
PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
|
|
|
|
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_index_hit;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_INDEX_HIT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
// TODO: introduce dedicated tickers/statistics/counters
|
|
|
|
// for range tombstones
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_data_hit;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_DATA_HIT);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type,
|
|
|
|
GetContext* get_context) const {
|
2021-04-26 12:43:02 -07:00
|
|
|
Statistics* const statistics = rep_->ioptions.stats;
|
2019-06-06 11:28:54 -07:00
|
|
|
|
|
|
|
// TODO: introduce aggregate (not per-level) block cache miss count
|
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
|
|
|
|
static_cast<uint32_t>(rep_->level));
|
|
|
|
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_miss;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_MISS);
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: introduce perf counters for misses per block type
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kFilter:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_filter_miss;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_FILTER_MISS);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kCompressionDictionary:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_compression_dict_miss;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kIndex:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_index_miss;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_INDEX_MISS);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
// TODO: introduce dedicated tickers/statistics/counters
|
|
|
|
// for range tombstones
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_data_miss;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_DATA_MISS);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-17 21:55:42 -07:00
|
|
|
void BlockBasedTable::UpdateCacheInsertionMetrics(
|
|
|
|
BlockType block_type, GetContext* get_context, size_t usage, bool redundant,
|
|
|
|
Statistics* const statistics) {
|
2019-06-06 11:28:54 -07:00
|
|
|
// TODO: introduce perf counters for block cache insertions
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_add;
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
++get_context->get_context_stats_.num_cache_add_redundant;
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
get_context->get_context_stats_.num_cache_bytes_write += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_ADD);
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_ADD_REDUNDANT);
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage);
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kFilter:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_filter_add;
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
++get_context->get_context_stats_.num_cache_filter_add_redundant;
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
get_context->get_context_stats_.num_cache_filter_bytes_insert += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_FILTER_ADD);
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_FILTER_ADD_REDUNDANT);
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kCompressionDictionary:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_compression_dict_add;
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
++get_context->get_context_stats_
|
|
|
|
.num_cache_compression_dict_add_redundant;
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
get_context->get_context_stats_
|
|
|
|
.num_cache_compression_dict_bytes_insert += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT);
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT,
|
|
|
|
usage);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case BlockType::kIndex:
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_index_add;
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
++get_context->get_context_stats_.num_cache_index_add_redundant;
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
get_context->get_context_stats_.num_cache_index_bytes_insert += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD_REDUNDANT);
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
// TODO: introduce dedicated tickers/statistics/counters
|
|
|
|
// for range tombstones
|
|
|
|
if (get_context) {
|
|
|
|
++get_context->get_context_stats_.num_cache_data_add;
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
++get_context->get_context_stats_.num_cache_data_add_redundant;
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
get_context->get_context_stats_.num_cache_data_bytes_insert += usage;
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_DATA_ADD);
|
2020-04-27 13:18:18 -07:00
|
|
|
if (redundant) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_DATA_ADD_REDUNDANT);
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-03 12:31:45 -07:00
|
|
|
Cache::Handle* BlockBasedTable::GetEntryFromCache(
|
2021-10-19 15:53:16 -07:00
|
|
|
const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
|
|
|
|
BlockType block_type, const bool wait, GetContext* get_context,
|
|
|
|
const Cache::CacheItemHelper* cache_helper,
|
2021-05-21 18:28:28 -07:00
|
|
|
const Cache::CreateCallback& create_cb, Cache::Priority priority) const {
|
2021-10-19 15:53:16 -07:00
|
|
|
Cache::Handle* cache_handle = nullptr;
|
|
|
|
if (cache_tier == CacheTier::kNonVolatileBlockTier) {
|
|
|
|
cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority,
|
|
|
|
wait, rep_->ioptions.statistics.get());
|
|
|
|
} else {
|
|
|
|
cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get());
|
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
|
2019-06-03 12:31:45 -07:00
|
|
|
if (cache_handle != nullptr) {
|
2019-06-06 11:28:54 -07:00
|
|
|
UpdateCacheHitMetrics(block_type, get_context,
|
|
|
|
block_cache->GetUsage(cache_handle));
|
2019-06-03 12:31:45 -07:00
|
|
|
} else {
|
2019-06-06 11:28:54 -07:00
|
|
|
UpdateCacheMissMetrics(block_type, get_context);
|
2019-06-03 12:31:45 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return cache_handle;
|
|
|
|
}
|
|
|
|
|
2021-10-19 15:53:16 -07:00
|
|
|
template <typename TBlocklike>
|
|
|
|
Status BlockBasedTable::InsertEntryToCache(
|
|
|
|
const CacheTier& cache_tier, Cache* block_cache, const Slice& key,
|
|
|
|
const Cache::CacheItemHelper* cache_helper,
|
|
|
|
std::unique_ptr<TBlocklike>& block_holder, size_t charge,
|
|
|
|
Cache::Handle** cache_handle, Cache::Priority priority) const {
|
|
|
|
Status s = Status::OK();
|
|
|
|
if (cache_tier == CacheTier::kNonVolatileBlockTier) {
|
|
|
|
s = block_cache->Insert(key, block_holder.get(), cache_helper, charge,
|
|
|
|
cache_handle, priority);
|
|
|
|
} else {
|
|
|
|
s = block_cache->Insert(key, block_holder.get(), charge,
|
|
|
|
cache_helper->del_cb, cache_handle, priority);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-02-04 17:03:57 -08:00
|
|
|
namespace {
|
|
|
|
// Return True if table_properties has `user_prop_name` has a `true` value
|
|
|
|
// or it doesn't contain this property (for backward compatible).
|
|
|
|
bool IsFeatureSupported(const TableProperties& table_properties,
|
|
|
|
const std::string& user_prop_name, Logger* info_log) {
|
|
|
|
auto& props = table_properties.user_collected_properties;
|
|
|
|
auto pos = props.find(user_prop_name);
|
|
|
|
// Older version doesn't have this value set. Skip this check.
|
|
|
|
if (pos != props.end()) {
|
|
|
|
if (pos->second == kPropFalse) {
|
|
|
|
return false;
|
|
|
|
} else if (pos->second != kPropTrue) {
|
2017-03-15 19:22:52 -07:00
|
|
|
ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s",
|
|
|
|
user_prop_name.c_str(), pos->second.c_str());
|
2015-02-04 17:03:57 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2016-10-18 16:59:37 -07:00
|
|
|
|
2018-07-27 16:00:26 -07:00
|
|
|
// Caller has to ensure seqno is not nullptr.
|
|
|
|
Status GetGlobalSequenceNumber(const TableProperties& table_properties,
|
|
|
|
SequenceNumber largest_seqno,
|
|
|
|
SequenceNumber* seqno) {
|
|
|
|
const auto& props = table_properties.user_collected_properties;
|
|
|
|
const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion);
|
|
|
|
const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno);
|
2016-10-18 16:59:37 -07:00
|
|
|
|
2018-07-27 16:00:26 -07:00
|
|
|
*seqno = kDisableGlobalSequenceNumber;
|
2016-10-18 16:59:37 -07:00
|
|
|
if (version_pos == props.end()) {
|
|
|
|
if (seqno_pos != props.end()) {
|
2018-07-27 16:00:26 -07:00
|
|
|
std::array<char, 200> msg_buf;
|
2016-10-18 16:59:37 -07:00
|
|
|
// This is not an external sst file, global_seqno is not supported.
|
2018-07-27 16:00:26 -07:00
|
|
|
snprintf(
|
|
|
|
msg_buf.data(), msg_buf.max_size(),
|
2016-10-18 16:59:37 -07:00
|
|
|
"A non-external sst file have global seqno property with value %s",
|
|
|
|
seqno_pos->second.c_str());
|
2018-07-27 16:00:26 -07:00
|
|
|
return Status::Corruption(msg_buf.data());
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
2018-07-27 16:00:26 -07:00
|
|
|
return Status::OK();
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t version = DecodeFixed32(version_pos->second.c_str());
|
|
|
|
if (version < 2) {
|
|
|
|
if (seqno_pos != props.end() || version != 1) {
|
2018-07-27 16:00:26 -07:00
|
|
|
std::array<char, 200> msg_buf;
|
2016-10-18 16:59:37 -07:00
|
|
|
// This is a v1 external sst file, global_seqno is not supported.
|
2018-07-27 16:00:26 -07:00
|
|
|
snprintf(msg_buf.data(), msg_buf.max_size(),
|
|
|
|
"An external sst file with version %u have global seqno "
|
|
|
|
"property with value %s",
|
|
|
|
version, seqno_pos->second.c_str());
|
|
|
|
return Status::Corruption(msg_buf.data());
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
2018-07-27 16:00:26 -07:00
|
|
|
return Status::OK();
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
|
|
|
|
2018-07-27 16:00:26 -07:00
|
|
|
// Since we have a plan to deprecate global_seqno, we do not return failure
|
|
|
|
// if seqno_pos == props.end(). We rely on version_pos to detect whether the
|
|
|
|
// SST is external.
|
|
|
|
SequenceNumber global_seqno(0);
|
|
|
|
if (seqno_pos != props.end()) {
|
|
|
|
global_seqno = DecodeFixed64(seqno_pos->second.c_str());
|
|
|
|
}
|
2019-03-26 10:15:43 -07:00
|
|
|
// SstTableReader open table reader with kMaxSequenceNumber as largest_seqno
|
|
|
|
// to denote it is unknown.
|
|
|
|
if (largest_seqno < kMaxSequenceNumber) {
|
|
|
|
if (global_seqno == 0) {
|
|
|
|
global_seqno = largest_seqno;
|
|
|
|
}
|
|
|
|
if (global_seqno != largest_seqno) {
|
|
|
|
std::array<char, 200> msg_buf;
|
|
|
|
snprintf(
|
|
|
|
msg_buf.data(), msg_buf.max_size(),
|
|
|
|
"An external sst file with version %u have global seqno property "
|
|
|
|
"with value %s, while largest seqno in the file is %llu",
|
|
|
|
version, seqno_pos->second.c_str(),
|
|
|
|
static_cast<unsigned long long>(largest_seqno));
|
|
|
|
return Status::Corruption(msg_buf.data());
|
|
|
|
}
|
2018-07-27 16:00:26 -07:00
|
|
|
}
|
2019-03-26 10:15:43 -07:00
|
|
|
*seqno = global_seqno;
|
2016-10-18 16:59:37 -07:00
|
|
|
|
|
|
|
if (global_seqno > kMaxSequenceNumber) {
|
2018-07-27 16:00:26 -07:00
|
|
|
std::array<char, 200> msg_buf;
|
|
|
|
snprintf(msg_buf.data(), msg_buf.max_size(),
|
|
|
|
"An external sst file with version %u have global seqno property "
|
|
|
|
"with value %llu, which is greater than kMaxSequenceNumber",
|
|
|
|
version, static_cast<unsigned long long>(global_seqno));
|
|
|
|
return Status::Corruption(msg_buf.data());
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
|
|
|
|
2018-07-27 16:00:26 -07:00
|
|
|
return Status::OK();
|
2016-10-18 16:59:37 -07:00
|
|
|
}
|
2015-02-04 17:03:57 -08:00
|
|
|
} // namespace
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
|
|
|
|
const std::string& cur_db_session_id,
|
|
|
|
uint64_t cur_file_number,
|
|
|
|
uint64_t file_size,
|
|
|
|
OffsetableCacheKey* out_base_cache_key,
|
|
|
|
bool* out_is_stable) {
|
|
|
|
// Use a stable cache key if sufficient data is in table properties
|
|
|
|
std::string db_session_id;
|
|
|
|
uint64_t file_num;
|
|
|
|
std::string db_id;
|
|
|
|
if (properties && !properties->db_session_id.empty() &&
|
|
|
|
properties->orig_file_number > 0) {
|
|
|
|
// (Newer SST file case)
|
|
|
|
// We must have both properties to get a stable unique id because
|
|
|
|
// CreateColumnFamilyWithImport or IngestExternalFiles can change the
|
|
|
|
// file numbers on a file.
|
|
|
|
db_session_id = properties->db_session_id;
|
|
|
|
file_num = properties->orig_file_number;
|
|
|
|
// Less critical, populated in earlier release than above
|
|
|
|
db_id = properties->db_id;
|
|
|
|
if (out_is_stable) {
|
|
|
|
*out_is_stable = true;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// (Old SST file case)
|
|
|
|
// We use (unique) cache keys based on current identifiers. These are at
|
|
|
|
// least stable across table file close and re-open, but not across
|
|
|
|
// different DBs nor DB close and re-open.
|
|
|
|
db_session_id = cur_db_session_id;
|
|
|
|
file_num = cur_file_number;
|
|
|
|
// Plumbing through the DB ID to here would be annoying, and of limited
|
|
|
|
// value because of the case of VersionSet::Recover opening some table
|
|
|
|
// files and later setting the DB ID. So we just rely on uniqueness
|
|
|
|
// level provided by session ID.
|
|
|
|
db_id = "unknown";
|
|
|
|
if (out_is_stable) {
|
|
|
|
*out_is_stable = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Too many tests to update to get these working
|
|
|
|
// assert(file_num > 0);
|
|
|
|
// assert(!db_session_id.empty());
|
|
|
|
// assert(!db_id.empty());
|
|
|
|
|
|
|
|
// Minimum block size is 5 bytes; therefore we can trim off two lower bits
|
|
|
|
// from offets. See GetCacheKey.
|
|
|
|
*out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num,
|
|
|
|
/*max_offset*/ file_size >> 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
CacheKey BlockBasedTable::GetCacheKey(const OffsetableCacheKey& base_cache_key,
|
|
|
|
const BlockHandle& handle) {
|
|
|
|
// Minimum block size is 5 bytes; therefore we can trim off two lower bits
|
|
|
|
// from offet.
|
|
|
|
return base_cache_key.WithOffset(handle.offset() >> 2);
|
2015-12-15 18:20:10 -08:00
|
|
|
}
|
|
|
|
|
2019-06-13 15:39:52 -07:00
|
|
|
Status BlockBasedTable::Open(
|
2021-05-05 13:59:21 -07:00
|
|
|
const ReadOptions& read_options, const ImmutableOptions& ioptions,
|
2020-06-29 14:51:57 -07:00
|
|
|
const EnvOptions& env_options, const BlockBasedTableOptions& table_options,
|
2019-06-13 15:39:52 -07:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
|
|
|
|
std::unique_ptr<TableReader>* table_reader,
|
2022-04-06 10:33:00 -07:00
|
|
|
std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
|
2022-01-21 11:36:36 -08:00
|
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
2019-06-13 15:39:52 -07:00
|
|
|
const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
|
|
|
|
const int level, const bool immortal_table,
|
2020-05-12 18:21:32 -07:00
|
|
|
const SequenceNumber largest_seqno, const bool force_direct_prefetch,
|
|
|
|
TailPrefetchStats* tail_prefetch_stats,
|
2020-06-09 16:49:07 -07:00
|
|
|
BlockCacheTracer* const block_cache_tracer,
|
2021-08-16 20:36:19 -07:00
|
|
|
size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
|
2021-06-10 11:01:44 -07:00
|
|
|
uint64_t cur_file_num) {
|
2013-10-30 10:52:33 -07:00
|
|
|
table_reader->reset();
|
2013-01-09 10:44:30 -08:00
|
|
|
|
2018-12-07 13:15:09 -08:00
|
|
|
Status s;
|
2015-01-13 14:33:04 -08:00
|
|
|
Footer footer;
|
2017-08-11 11:59:13 -07:00
|
|
|
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
|
|
|
|
|
Set Read rate limiter priority dynamically and pass it to FS (#9996)
Summary:
### Context:
Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.
### Solution
User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile.
**This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows:
| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
| Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER |
| Compaction | IO_LOW | IO_USER | IO_USER |
| User | User provided | User provided | User provided |
We will respect the read_options that the user provided and will not set it.
The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read".
**Details**
1. Set read_options.rate_limiter_priority dynamically:
- User: Do not update the read_options. Use the read_options that the user provided.
- Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction().
- Flush: Update read_options in BuildTable().
2. Pass the rate limiter priority to FSRandomAccessFile functions:
- After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock().
- In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch().
- Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator().
### Test Plans
Add unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996
Reviewed By: anand1976
Differential Revision: D36452483
Pulled By: gitbw95
fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-18 19:41:44 -07:00
|
|
|
// From read_options, retain deadline, io_timeout, and rate_limiter_priority.
|
2020-08-07 11:59:19 -07:00
|
|
|
// In future, we may retain more
|
2020-06-29 14:51:57 -07:00
|
|
|
// options. Specifically, w ignore verify_checksums and default to
|
|
|
|
// checksum verification anyway when creating the index and filter
|
|
|
|
// readers.
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.deadline = read_options.deadline;
|
2020-08-07 11:59:19 -07:00
|
|
|
ro.io_timeout = read_options.io_timeout;
|
Set Read rate limiter priority dynamically and pass it to FS (#9996)
Summary:
### Context:
Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.
### Solution
User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile.
**This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows:
| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
| Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER |
| Compaction | IO_LOW | IO_USER | IO_USER |
| User | User provided | User provided | User provided |
We will respect the read_options that the user provided and will not set it.
The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read".
**Details**
1. Set read_options.rate_limiter_priority dynamically:
- User: Do not update the read_options. Use the read_options that the user provided.
- Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction().
- Flush: Update read_options in BuildTable().
2. Pass the rate limiter priority to FSRandomAccessFile functions:
- After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock().
- In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch().
- Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator().
### Test Plans
Add unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996
Reviewed By: anand1976
Differential Revision: D36452483
Pulled By: gitbw95
fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-18 19:41:44 -07:00
|
|
|
ro.rate_limiter_priority = read_options.rate_limiter_priority;
|
2020-06-29 14:51:57 -07:00
|
|
|
|
2018-07-19 16:07:53 -07:00
|
|
|
// prefetch both index and filters, down to all partitions
|
|
|
|
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
|
|
|
|
const bool preload_all = !table_options.cache_index_and_filter_blocks;
|
2018-07-20 14:31:27 -07:00
|
|
|
|
2019-12-18 10:59:21 -08:00
|
|
|
if (!ioptions.allow_mmap_reads) {
|
2020-06-29 14:51:57 -07:00
|
|
|
s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
|
2020-05-12 18:21:32 -07:00
|
|
|
tail_prefetch_stats, prefetch_all, preload_all,
|
|
|
|
&prefetch_buffer);
|
2020-10-20 09:11:50 -07:00
|
|
|
// Return error in prefetch path to users.
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2019-12-18 10:59:21 -08:00
|
|
|
} else {
|
|
|
|
// Should not prefetch for mmap mode.
|
|
|
|
prefetch_buffer.reset(new FilePrefetchBuffer(
|
2021-11-19 17:52:42 -08:00
|
|
|
0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */,
|
|
|
|
true /* track_min_offset */));
|
2019-12-18 10:59:21 -08:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
|
|
|
|
// Read in the following order:
|
|
|
|
// 1. Footer
|
|
|
|
// 2. [metaindex block]
|
|
|
|
// 3. [meta block: properties]
|
|
|
|
// 4. [meta block: range deletion tombstone]
|
|
|
|
// 5. [meta block: compression dictionary]
|
|
|
|
// 6. [meta block: index]
|
|
|
|
// 7. [meta block: filter]
|
2020-06-29 14:51:57 -07:00
|
|
|
IOOptions opts;
|
2021-01-25 22:07:26 -08:00
|
|
|
s = file->PrepareIOOptions(ro, opts);
|
2020-06-29 14:51:57 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size,
|
|
|
|
&footer, kBlockBasedTableMagicNumber);
|
|
|
|
}
|
2014-10-31 11:41:15 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2021-12-10 08:12:09 -08:00
|
|
|
if (!IsSupportedFormatVersion(footer.format_version())) {
|
2015-01-13 14:33:04 -08:00
|
|
|
return Status::Corruption(
|
2015-01-14 16:24:24 -08:00
|
|
|
"Unknown Footer version. Maybe this file was created with newer "
|
2015-01-13 14:33:04 -08:00
|
|
|
"version of RocksDB?");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-06-20 14:28:22 -07:00
|
|
|
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
|
Skip bottom-level filter block caching when hit-optimized
Summary:
When Get() or NewIterator() trigger file loads, skip caching the filter block if
(1) optimize_filters_for_hits is set and (2) the file is on the bottommost
level. Also skip checking filters under the same conditions, which means that
for a preloaded file or a file that was trivially-moved to the bottom level, its
filter block will eventually expire from the cache.
- added parameters/instance variables in various places in order to propagate the config ("skip_filters") from version_set to block_based_table_reader
- in BlockBasedTable::Rep, this optimization prevents filter from being loaded when the file is opened simply by setting filter_policy = nullptr
- in BlockBasedTable::Get/BlockBasedTable::NewIterator, this optimization prevents filter from being used (even if it was loaded already) by setting filter = nullptr
Test Plan:
updated unit test:
$ ./db_test --gtest_filter=DBTest.OptimizeFiltersForHits
will also run 'make check'
Reviewers: sdong, igor, paultuckfield, anthony, rven, kradhakrishnan, IslamAbdelRahman, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D51633
2015-12-23 10:15:07 -08:00
|
|
|
Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
internal_comparator, skip_filters,
|
|
|
|
file_size, level, immortal_table);
|
2013-11-12 22:46:51 -08:00
|
|
|
rep->file = std::move(file);
|
2014-05-01 14:09:32 -04:00
|
|
|
rep->footer = footer;
|
2022-01-31 19:45:17 -08:00
|
|
|
// We've successfully read the footer. We are ready to serve requests.
|
|
|
|
// Better not mutate rep_ after the creation. eg. internal_prefix_transform
|
|
|
|
// raw pointer will be used to create HashIndexReader, whose reset may
|
|
|
|
// access a dangling pointer.
|
2016-08-23 18:20:41 -07:00
|
|
|
// We need to wrap data with internal_prefix_transform to make sure it can
|
|
|
|
// handle prefix correctly.
|
2022-01-31 19:45:17 -08:00
|
|
|
// FIXME: is changed prefix_extractor handled anywhere for hash index?
|
2020-01-27 15:41:57 -08:00
|
|
|
if (prefix_extractor != nullptr) {
|
|
|
|
rep->internal_prefix_transform.reset(
|
2022-01-21 11:36:36 -08:00
|
|
|
new InternalKeySliceTransform(prefix_extractor.get()));
|
2020-01-27 15:41:57 -08:00
|
|
|
}
|
2013-11-12 22:46:51 -08:00
|
|
|
|
2021-08-16 20:36:19 -07:00
|
|
|
// For fully portable/stable cache keys, we need to read the properties
|
|
|
|
// block before setting up cache keys. TODO: consider setting up a bootstrap
|
|
|
|
// cache key for PersistentCache to use for metaindex and properties blocks.
|
|
|
|
rep->persistent_cache_options = PersistentCacheOptions();
|
2015-12-15 18:20:10 -08:00
|
|
|
|
2019-04-30 09:46:40 -07:00
|
|
|
// Meta-blocks are not dictionary compressed. Explicitly set the dictionary
|
|
|
|
// handle to null, otherwise it may be seen as uninitialized during the below
|
|
|
|
// meta-block reads.
|
|
|
|
rep->compression_dict_handle = BlockHandle::NullBlockHandle();
|
|
|
|
|
2018-12-07 13:15:09 -08:00
|
|
|
// Read metaindex
|
2021-08-16 20:36:19 -07:00
|
|
|
std::unique_ptr<BlockBasedTable> new_table(
|
|
|
|
new BlockBasedTable(rep, block_cache_tracer));
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block> metaindex;
|
|
|
|
std::unique_ptr<InternalIterator> metaindex_iter;
|
2020-06-29 14:51:57 -07:00
|
|
|
s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex,
|
2019-11-05 17:17:36 -08:00
|
|
|
&metaindex_iter);
|
2014-10-31 11:41:15 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2013-11-12 22:46:51 -08:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
// Populates table_properties and some fields that depend on it,
|
|
|
|
// such as index_type.
|
2020-06-29 14:51:57 -07:00
|
|
|
s = new_table->ReadPropertiesBlock(ro, prefetch_buffer.get(),
|
2019-11-05 17:17:36 -08:00
|
|
|
metaindex_iter.get(), largest_seqno);
|
2018-12-07 13:15:09 -08:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2022-01-21 11:36:36 -08:00
|
|
|
if (!PrefixExtractorChangedHelper(rep->table_properties.get(),
|
|
|
|
prefix_extractor.get())) {
|
|
|
|
// Establish fast path for unchanged prefix_extractor
|
|
|
|
rep->table_prefix_extractor = prefix_extractor;
|
|
|
|
} else {
|
|
|
|
// Current prefix_extractor doesn't match table
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
if (rep->table_properties) {
|
|
|
|
//**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
|
|
|
|
// will need to use it
|
|
|
|
ConfigOptions config_options;
|
|
|
|
Status st = SliceTransform::CreateFromString(
|
|
|
|
config_options, rep->table_properties->prefix_extractor_name,
|
|
|
|
&(rep->table_prefix_extractor));
|
|
|
|
if (!st.ok()) {
|
|
|
|
//**TODO: Should this be error be returned or swallowed?
|
|
|
|
ROCKS_LOG_ERROR(rep->ioptions.logger,
|
|
|
|
"Failed to create prefix extractor[%s]: %s",
|
|
|
|
rep->table_properties->prefix_extractor_name.c_str(),
|
|
|
|
st.ToString().c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
2021-08-16 20:36:19 -07:00
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
// With properties loaded, we can set up portable/stable cache keys
|
|
|
|
SetupBaseCacheKey(rep->table_properties.get(), cur_db_session_id,
|
|
|
|
cur_file_num, file_size, &rep->base_cache_key);
|
|
|
|
|
|
|
|
rep->persistent_cache_options =
|
|
|
|
PersistentCacheOptions(rep->table_options.persistent_cache,
|
|
|
|
rep->base_cache_key, rep->ioptions.stats);
|
2021-08-16 20:36:19 -07:00
|
|
|
|
2020-06-29 14:51:57 -07:00
|
|
|
s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
|
|
|
|
metaindex_iter.get(), internal_comparator,
|
|
|
|
&lookup_context);
|
2018-12-20 21:57:18 -08:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2019-06-03 12:31:45 -07:00
|
|
|
s = new_table->PrefetchIndexAndFilterBlocks(
|
2020-06-29 14:51:57 -07:00
|
|
|
ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(),
|
2020-06-09 16:49:07 -07:00
|
|
|
prefetch_all, table_options, level, file_size,
|
|
|
|
max_file_size_for_l0_meta_pin, &lookup_context);
|
2018-12-07 13:15:09 -08:00
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
// Update tail prefetch stats
|
|
|
|
assert(prefetch_buffer.get() != nullptr);
|
|
|
|
if (tail_prefetch_stats != nullptr) {
|
|
|
|
assert(prefetch_buffer->min_offset_read() < file_size);
|
|
|
|
tail_prefetch_stats->RecordEffectiveSize(
|
|
|
|
static_cast<size_t>(file_size) - prefetch_buffer->min_offset_read());
|
2015-09-02 15:36:47 -07:00
|
|
|
}
|
2022-04-06 10:33:00 -07:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
|
2022-04-06 10:33:00 -07:00
|
|
|
if (s.ok() && table_reader_cache_res_mgr) {
|
|
|
|
std::size_t mem_usage = new_table->ApproximateMemoryUsage();
|
|
|
|
s = table_reader_cache_res_mgr->MakeCacheReservation(
|
|
|
|
mem_usage, &(rep->table_reader_cache_res_handle));
|
|
|
|
if (s.IsIncomplete()) {
|
|
|
|
s = Status::MemoryLimit(
|
2022-05-17 15:01:51 -07:00
|
|
|
"Can't allocate " +
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
|
|
|
|
CacheEntryRole::kBlockBasedTableReader)] +
|
|
|
|
" due to memory limit based on "
|
2022-04-06 10:33:00 -07:00
|
|
|
"cache capacity for memory allocation");
|
|
|
|
}
|
2015-09-02 15:36:47 -07:00
|
|
|
}
|
|
|
|
|
2022-04-06 10:33:00 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
*table_reader = std::move(new_table);
|
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status BlockBasedTable::PrefetchTail(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size,
|
2020-05-12 18:21:32 -07:00
|
|
|
bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
|
|
|
|
const bool prefetch_all, const bool preload_all,
|
2018-12-07 13:15:09 -08:00
|
|
|
std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer) {
|
|
|
|
size_t tail_prefetch_size = 0;
|
|
|
|
if (tail_prefetch_stats != nullptr) {
|
|
|
|
// Multiple threads may get a 0 (no history) when running in parallel,
|
|
|
|
// but it will get cleared after the first of them finishes.
|
|
|
|
tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize();
|
|
|
|
}
|
|
|
|
if (tail_prefetch_size == 0) {
|
|
|
|
// Before read footer, readahead backwards to prefetch data. Do more
|
|
|
|
// readahead if we're going to read index/filter.
|
|
|
|
// TODO: This may incorrectly select small readahead in case partitioned
|
|
|
|
// index/filter is enabled and top-level partition pinning is enabled.
|
|
|
|
// That's because we need to issue readahead before we read the properties,
|
|
|
|
// at which point we don't yet know the index type.
|
|
|
|
tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024;
|
|
|
|
}
|
|
|
|
size_t prefetch_off;
|
|
|
|
size_t prefetch_len;
|
|
|
|
if (file_size < tail_prefetch_size) {
|
|
|
|
prefetch_off = 0;
|
|
|
|
prefetch_len = static_cast<size_t>(file_size);
|
|
|
|
} else {
|
|
|
|
prefetch_off = static_cast<size_t>(file_size - tail_prefetch_size);
|
|
|
|
prefetch_len = tail_prefetch_size;
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen",
|
|
|
|
&tail_prefetch_size);
|
2020-08-27 18:15:11 -07:00
|
|
|
|
|
|
|
// Try file system prefetch
|
2020-05-12 18:21:32 -07:00
|
|
|
if (!file->use_direct_io() && !force_direct_prefetch) {
|
Set Read rate limiter priority dynamically and pass it to FS (#9996)
Summary:
### Context:
Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.
### Solution
User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile.
**This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows:
| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
| Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER |
| Compaction | IO_LOW | IO_USER | IO_USER |
| User | User provided | User provided | User provided |
We will respect the read_options that the user provided and will not set it.
The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read".
**Details**
1. Set read_options.rate_limiter_priority dynamically:
- User: Do not update the read_options. Use the read_options that the user provided.
- Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction().
- Flush: Update read_options in BuildTable().
2. Pass the rate limiter priority to FSRandomAccessFile functions:
- After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock().
- In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch().
- Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator().
### Test Plans
Add unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996
Reviewed By: anand1976
Differential Revision: D36452483
Pulled By: gitbw95
fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-18 19:41:44 -07:00
|
|
|
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
|
|
|
|
.IsNotSupported()) {
|
2021-11-19 17:52:42 -08:00
|
|
|
prefetch_buffer->reset(new FilePrefetchBuffer(
|
|
|
|
0 /* readahead_size */, 0 /* max_readahead_size */,
|
|
|
|
false /* enable */, true /* track_min_offset */));
|
2020-08-27 18:15:11 -07:00
|
|
|
return Status::OK();
|
2020-06-29 14:51:57 -07:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
}
|
2020-04-24 13:03:08 -07:00
|
|
|
|
2020-08-27 18:15:11 -07:00
|
|
|
// Use `FilePrefetchBuffer`
|
2021-11-19 17:52:42 -08:00
|
|
|
prefetch_buffer->reset(
|
|
|
|
new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */,
|
|
|
|
true /* enable */, true /* track_min_offset */));
|
Set Read rate limiter priority dynamically and pass it to FS (#9996)
Summary:
### Context:
Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users.
### Solution
User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile.
**This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows:
| State | Normal | Delayed | Stalled |
| ----- | ------ | ------- | ------- |
| Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER |
| Compaction | IO_LOW | IO_USER | IO_USER |
| User | User provided | User provided | User provided |
We will respect the read_options that the user provided and will not set it.
The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read".
**Details**
1. Set read_options.rate_limiter_priority dynamically:
- User: Do not update the read_options. Use the read_options that the user provided.
- Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction().
- Flush: Update read_options in BuildTable().
2. Pass the rate limiter priority to FSRandomAccessFile functions:
- After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock().
- In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch().
- Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator().
### Test Plans
Add unit tests.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996
Reviewed By: anand1976
Differential Revision: D36452483
Pulled By: gitbw95
fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-18 19:41:44 -07:00
|
|
|
|
2020-08-27 18:15:11 -07:00
|
|
|
IOOptions opts;
|
2021-01-25 22:07:26 -08:00
|
|
|
Status s = file->PrepareIOOptions(ro, opts);
|
2020-08-27 18:15:11 -07:00
|
|
|
if (s.ok()) {
|
2022-02-16 23:17:03 -08:00
|
|
|
s = (*prefetch_buffer)
|
|
|
|
->Prefetch(opts, file, prefetch_off, prefetch_len,
|
|
|
|
ro.rate_limiter_priority);
|
2020-08-27 18:15:11 -07:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status BlockBasedTable::ReadPropertiesBlock(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
|
|
|
|
InternalIterator* meta_iter, const SequenceNumber largest_seqno) {
|
2018-12-07 13:15:09 -08:00
|
|
|
Status s;
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
BlockHandle handle;
|
2021-12-10 08:12:09 -08:00
|
|
|
s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);
|
RocksDB 2.8 to be able to read files generated by 2.6
Summary:
From 2.6 to 2.7, property block name is renamed from rocksdb.stats to rocksdb.properties. Older properties were not able to be loaded. In 2.8, we seem to have added some logic that uses property block without checking null pointers, which create segment faults.
In this patch, we fix it by:
(1) try rocksdb.stats if rocksdb.properties is not found
(2) add some null checking before consuming rep->table_properties
Test Plan: make sure a file generated in 2.7 couldn't be opened now can be opened.
Reviewers: haobo, igor, yhchiang
Reviewed By: igor
CC: ljin, xjin, dhruba, kailiu, leveldb
Differential Revision: https://reviews.facebook.net/D17961
2014-04-16 19:30:33 -07:00
|
|
|
|
2014-10-31 11:41:15 -07:00
|
|
|
if (!s.ok()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_WARN(rep_->ioptions.logger,
|
2017-03-15 19:22:52 -07:00
|
|
|
"Error when seeking to properties block from file: %s",
|
|
|
|
s.ToString().c_str());
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
} else if (!handle.IsNull()) {
|
2013-11-12 22:46:51 -08:00
|
|
|
s = meta_iter->status();
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
std::unique_ptr<TableProperties> table_properties;
|
2013-11-12 22:46:51 -08:00
|
|
|
if (s.ok()) {
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
s = ReadTablePropertiesHelper(
|
|
|
|
ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
|
|
|
|
rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
|
2019-02-11 11:37:07 -08:00
|
|
|
}
|
2020-04-24 13:03:08 -07:00
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
2019-02-11 11:37:07 -08:00
|
|
|
|
2013-11-12 22:46:51 -08:00
|
|
|
if (!s.ok()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_WARN(rep_->ioptions.logger,
|
2017-03-15 19:22:52 -07:00
|
|
|
"Encountered error while reading data from properties "
|
|
|
|
"block %s",
|
|
|
|
s.ToString().c_str());
|
2014-02-07 19:26:49 -08:00
|
|
|
} else {
|
2018-02-07 15:42:35 -08:00
|
|
|
assert(table_properties != nullptr);
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
rep_->table_properties = std::move(table_properties);
|
2019-06-03 12:31:45 -07:00
|
|
|
rep_->blocks_maybe_compressed =
|
|
|
|
rep_->table_properties->compression_name !=
|
|
|
|
CompressionTypeToString(kNoCompression);
|
|
|
|
rep_->blocks_definitely_zstd_compressed =
|
|
|
|
(rep_->table_properties->compression_name ==
|
2019-01-23 18:11:08 -08:00
|
|
|
CompressionTypeToString(kZSTD) ||
|
2019-06-03 12:31:45 -07:00
|
|
|
rep_->table_properties->compression_name ==
|
2019-01-23 18:11:08 -08:00
|
|
|
CompressionTypeToString(kZSTDNotFinalCompression));
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
2014-04-21 17:49:47 -07:00
|
|
|
} else {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_ERROR(rep_->ioptions.logger,
|
2017-03-15 19:22:52 -07:00
|
|
|
"Cannot find Properties block from file.");
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
|
|
|
|
2018-07-13 22:40:23 -07:00
|
|
|
// Read the table properties, if provided.
|
2019-06-03 12:31:45 -07:00
|
|
|
if (rep_->table_properties) {
|
|
|
|
rep_->whole_key_filtering &=
|
|
|
|
IsFeatureSupported(*(rep_->table_properties),
|
2018-07-13 22:40:23 -07:00
|
|
|
BlockBasedTablePropertyNames::kWholeKeyFiltering,
|
2021-04-26 12:43:02 -07:00
|
|
|
rep_->ioptions.logger);
|
|
|
|
rep_->prefix_filtering &= IsFeatureSupported(
|
|
|
|
*(rep_->table_properties),
|
|
|
|
BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger);
|
2019-06-03 12:31:45 -07:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
rep_->index_key_includes_seq =
|
|
|
|
rep_->table_properties->index_key_is_user_key == 0;
|
|
|
|
rep_->index_value_is_full =
|
|
|
|
rep_->table_properties->index_value_is_delta_encoded == 0;
|
|
|
|
|
|
|
|
// Update index_type with the true type.
|
|
|
|
// If table properties don't contain index type, we assume that the table
|
|
|
|
// is in very old format and has kBinarySearch index type.
|
|
|
|
auto& props = rep_->table_properties->user_collected_properties;
|
|
|
|
auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
|
|
|
|
if (pos != props.end()) {
|
|
|
|
rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
|
|
|
|
DecodeFixed32(pos->second.c_str()));
|
|
|
|
}
|
|
|
|
|
|
|
|
rep_->index_has_first_key =
|
|
|
|
rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
|
2019-06-03 12:31:45 -07:00
|
|
|
s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
|
|
|
|
&(rep_->global_seqno));
|
2018-07-27 16:00:26 -07:00
|
|
|
if (!s.ok()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
|
2018-07-27 16:00:26 -07:00
|
|
|
}
|
2018-07-13 22:40:23 -07:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
return s;
|
|
|
|
}
|
2018-07-13 22:40:23 -07:00
|
|
|
|
2018-12-07 13:15:09 -08:00
|
|
|
Status BlockBasedTable::ReadRangeDelBlock(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& read_options, FilePrefetchBuffer* prefetch_buffer,
|
|
|
|
InternalIterator* meta_iter,
|
2019-06-10 15:30:05 -07:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
BlockCacheLookupContext* lookup_context) {
|
2018-12-07 13:15:09 -08:00
|
|
|
Status s;
|
2018-12-20 21:57:18 -08:00
|
|
|
BlockHandle range_del_handle;
|
2021-12-10 08:12:09 -08:00
|
|
|
s = FindOptionalMetaBlock(meta_iter, kRangeDelBlockName, &range_del_handle);
|
2016-08-19 15:10:31 -07:00
|
|
|
if (!s.ok()) {
|
2017-03-15 19:22:52 -07:00
|
|
|
ROCKS_LOG_WARN(
|
2021-04-26 12:43:02 -07:00
|
|
|
rep_->ioptions.logger,
|
2016-08-19 15:10:31 -07:00
|
|
|
"Error when seeking to range delete tombstones block from file: %s",
|
|
|
|
s.ToString().c_str());
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
} else if (!range_del_handle.IsNull()) {
|
2018-12-20 21:57:18 -08:00
|
|
|
std::unique_ptr<InternalIterator> iter(NewDataBlockIterator<DataBlockIter>(
|
2019-06-10 15:30:05 -07:00
|
|
|
read_options, range_del_handle,
|
|
|
|
/*input_iter=*/nullptr, BlockType::kRangeDeletion,
|
|
|
|
/*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer));
|
2018-12-20 21:57:18 -08:00
|
|
|
assert(iter != nullptr);
|
|
|
|
s = iter->status();
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-25 19:25:00 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
ROCKS_LOG_WARN(
|
2021-04-26 12:43:02 -07:00
|
|
|
rep_->ioptions.logger,
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-25 19:25:00 -07:00
|
|
|
"Encountered error while reading data from range del block %s",
|
|
|
|
s.ToString().c_str());
|
2020-04-24 13:03:08 -07:00
|
|
|
IGNORE_STATUS_IF_ERROR(s);
|
2018-12-20 21:57:18 -08:00
|
|
|
} else {
|
2019-06-03 12:31:45 -07:00
|
|
|
rep_->fragmented_range_dels =
|
2018-12-20 21:57:18 -08:00
|
|
|
std::make_shared<FragmentedRangeTombstoneList>(std::move(iter),
|
|
|
|
internal_comparator);
|
2016-08-19 15:10:31 -07:00
|
|
|
}
|
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
|
|
|
|
InternalIterator* meta_iter, BlockBasedTable* new_table, bool prefetch_all,
|
2019-06-10 15:30:05 -07:00
|
|
|
const BlockBasedTableOptions& table_options, const int level,
|
2020-06-09 16:49:07 -07:00
|
|
|
size_t file_size, size_t max_file_size_for_l0_meta_pin,
|
2019-06-10 15:30:05 -07:00
|
|
|
BlockCacheLookupContext* lookup_context) {
|
2018-12-07 13:15:09 -08:00
|
|
|
// Find filter handle and filter type
|
2019-06-03 12:31:45 -07:00
|
|
|
if (rep_->filter_policy) {
|
2022-03-23 10:00:54 -07:00
|
|
|
auto name = rep_->filter_policy->CompatibilityName();
|
|
|
|
bool builtin_compatible =
|
|
|
|
strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0;
|
|
|
|
|
|
|
|
for (const auto& [filter_type, prefix] :
|
|
|
|
{std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix),
|
|
|
|
std::make_pair(Rep::FilterType::kPartitionedFilter,
|
|
|
|
kPartitionedFilterBlockPrefix),
|
|
|
|
std::make_pair(Rep::FilterType::kBlockFilter, kFilterBlockPrefix)}) {
|
|
|
|
if (builtin_compatible) {
|
|
|
|
// This code is only here to deal with a hiccup in early 7.0.x where
|
|
|
|
// there was an unintentional name change in the SST files metadata.
|
|
|
|
// It should be OK to remove this in the future (late 2022) and just
|
|
|
|
// have the 'else' code.
|
|
|
|
// NOTE: the test:: names below are likely not needed but included
|
|
|
|
// out of caution
|
|
|
|
static const std::unordered_set<std::string> kBuiltinNameAndAliases = {
|
|
|
|
BuiltinFilterPolicy::kCompatibilityName(),
|
|
|
|
test::LegacyBloomFilterPolicy::kClassName(),
|
|
|
|
test::FastLocalBloomFilterPolicy::kClassName(),
|
|
|
|
test::Standard128RibbonFilterPolicy::kClassName(),
|
|
|
|
DeprecatedBlockBasedBloomFilterPolicy::kClassName(),
|
|
|
|
BloomFilterPolicy::kClassName(),
|
|
|
|
RibbonFilterPolicy::kClassName(),
|
|
|
|
};
|
|
|
|
|
|
|
|
// For efficiency, do a prefix seek and see if the first match is
|
|
|
|
// good.
|
|
|
|
meta_iter->Seek(prefix);
|
|
|
|
if (meta_iter->status().ok() && meta_iter->Valid()) {
|
|
|
|
Slice key = meta_iter->key();
|
|
|
|
if (key.starts_with(prefix)) {
|
|
|
|
key.remove_prefix(prefix.size());
|
|
|
|
if (kBuiltinNameAndAliases.find(key.ToString()) !=
|
|
|
|
kBuiltinNameAndAliases.end()) {
|
|
|
|
Slice v = meta_iter->value();
|
|
|
|
Status s = rep_->filter_handle.DecodeFrom(&v);
|
|
|
|
if (s.ok()) {
|
|
|
|
rep_->filter_type = filter_type;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
std::string filter_block_key = prefix + name;
|
|
|
|
if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
|
|
|
|
.ok()) {
|
|
|
|
rep_->filter_type = filter_type;
|
2018-12-07 13:15:09 -08:00
|
|
|
break;
|
2022-03-23 10:00:54 -07:00
|
|
|
}
|
2018-12-07 13:15:09 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-10-11 14:52:49 -07:00
|
|
|
// Partition filters cannot be enabled without partition indexes
|
|
|
|
assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter ||
|
|
|
|
rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
|
2016-08-19 15:10:31 -07:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
// Find compression dictionary handle
|
2022-03-23 10:00:54 -07:00
|
|
|
Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
|
|
|
|
&rep_->compression_dict_handle);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
2019-01-23 18:11:08 -08:00
|
|
|
}
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockBasedTableOptions::IndexType index_type = rep_->index_type;
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
|
|
|
|
const bool use_cache = table_options.cache_index_and_filter_blocks;
|
|
|
|
|
2020-10-11 14:52:49 -07:00
|
|
|
const bool maybe_flushed =
|
2020-06-09 16:49:07 -07:00
|
|
|
level == 0 && file_size <= max_file_size_for_l0_meta_pin;
|
2020-10-11 14:52:49 -07:00
|
|
|
std::function<bool(PinningTier, PinningTier)> is_pinned =
|
|
|
|
[maybe_flushed, &is_pinned](PinningTier pinning_tier,
|
|
|
|
PinningTier fallback_pinning_tier) {
|
|
|
|
// Fallback to fallback would lead to infinite recursion. Disallow it.
|
|
|
|
assert(fallback_pinning_tier != PinningTier::kFallback);
|
|
|
|
|
|
|
|
switch (pinning_tier) {
|
|
|
|
case PinningTier::kFallback:
|
|
|
|
return is_pinned(fallback_pinning_tier,
|
|
|
|
PinningTier::kNone /* fallback_pinning_tier */);
|
|
|
|
case PinningTier::kNone:
|
|
|
|
return false;
|
|
|
|
case PinningTier::kFlushedAndSimilar:
|
|
|
|
return maybe_flushed;
|
|
|
|
case PinningTier::kAll:
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
// In GCC, this is needed to suppress `control reaches end of non-void
|
|
|
|
// function [-Werror=return-type]`.
|
|
|
|
assert(false);
|
|
|
|
return false;
|
|
|
|
};
|
|
|
|
const bool pin_top_level_index = is_pinned(
|
|
|
|
table_options.metadata_cache_options.top_level_index_pinning,
|
|
|
|
table_options.pin_top_level_index_and_filter ? PinningTier::kAll
|
|
|
|
: PinningTier::kNone);
|
|
|
|
const bool pin_partition =
|
|
|
|
is_pinned(table_options.metadata_cache_options.partition_pinning,
|
|
|
|
table_options.pin_l0_filter_and_index_blocks_in_cache
|
|
|
|
? PinningTier::kFlushedAndSimilar
|
|
|
|
: PinningTier::kNone);
|
|
|
|
const bool pin_unpartitioned =
|
|
|
|
is_pinned(table_options.metadata_cache_options.unpartitioned_pinning,
|
|
|
|
table_options.pin_l0_filter_and_index_blocks_in_cache
|
|
|
|
? PinningTier::kFlushedAndSimilar
|
|
|
|
: PinningTier::kNone);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
|
|
|
// pin the first level of index
|
|
|
|
const bool pin_index =
|
2020-10-11 14:52:49 -07:00
|
|
|
index_type == BlockBasedTableOptions::kTwoLevelIndexSearch
|
|
|
|
? pin_top_level_index
|
|
|
|
: pin_unpartitioned;
|
|
|
|
// prefetch the first level of index
|
2021-09-15 15:32:07 -07:00
|
|
|
// WART: this might be redundant (unnecessary cache hit) if !pin_index,
|
|
|
|
// depending on prepopulate_block_cache option
|
2020-10-11 14:52:49 -07:00
|
|
|
const bool prefetch_index = prefetch_all || pin_index;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
|
|
|
std::unique_ptr<IndexReader> index_reader;
|
2020-06-29 14:51:57 -07:00
|
|
|
s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
prefetch_index, pin_index, lookup_context,
|
|
|
|
&index_reader);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
rep_->index_reader = std::move(index_reader);
|
|
|
|
|
|
|
|
// The partitions of partitioned index are always stored in cache. They
|
|
|
|
// are hence follow the configuration for pin and prefetch regardless of
|
|
|
|
// the value of cache_index_and_filter_blocks
|
2020-10-11 14:52:49 -07:00
|
|
|
if (prefetch_all || pin_partition) {
|
|
|
|
s = rep_->index_reader->CacheDependencies(ro, pin_partition);
|
2020-08-25 18:59:19 -07:00
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
}
|
|
|
|
|
2018-06-22 15:14:05 -07:00
|
|
|
// pin the first level of filter
|
|
|
|
const bool pin_filter =
|
2020-10-11 14:52:49 -07:00
|
|
|
rep_->filter_type == Rep::FilterType::kPartitionedFilter
|
|
|
|
? pin_top_level_index
|
|
|
|
: pin_unpartitioned;
|
|
|
|
// prefetch the first level of filter
|
2021-09-15 15:32:07 -07:00
|
|
|
// WART: this might be redundant (unnecessary cache hit) if !pin_filter,
|
|
|
|
// depending on prepopulate_block_cache option
|
2020-10-11 14:52:49 -07:00
|
|
|
const bool prefetch_filter = prefetch_all || pin_filter;
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (rep_->filter_policy) {
|
|
|
|
auto filter = new_table->CreateFilterBlockReader(
|
2020-06-29 14:51:57 -07:00
|
|
|
ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
lookup_context);
|
2020-10-14 10:43:37 -07:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (filter) {
|
|
|
|
// Refer to the comment above about paritioned indexes always being cached
|
2020-10-11 14:52:49 -07:00
|
|
|
if (prefetch_all || pin_partition) {
|
2020-10-14 10:43:37 -07:00
|
|
|
s = filter->CacheDependencies(ro, pin_partition);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
}
|
2020-10-03 18:36:16 -07:00
|
|
|
rep_->filter = std::move(filter);
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-23 15:57:43 -07:00
|
|
|
if (!rep_->compression_dict_handle.IsNull()) {
|
|
|
|
std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
|
2020-10-11 14:52:49 -07:00
|
|
|
s = UncompressionDictReader::Create(
|
|
|
|
this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned,
|
|
|
|
pin_unpartitioned, lookup_context, &uncompression_dict_reader);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
2019-07-23 15:57:43 -07:00
|
|
|
rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
|
|
|
assert(s.ok());
|
2011-03-18 22:37:00 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
void BlockBasedTable::SetupForCompaction() {
|
2014-09-23 14:18:57 -07:00
|
|
|
switch (rep_->ioptions.access_hint_on_compaction_start) {
|
2013-05-17 15:53:01 -07:00
|
|
|
case Options::NONE:
|
|
|
|
break;
|
|
|
|
case Options::NORMAL:
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
rep_->file->file()->Hint(FSRandomAccessFile::kNormal);
|
2013-05-17 15:53:01 -07:00
|
|
|
break;
|
|
|
|
case Options::SEQUENTIAL:
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
rep_->file->file()->Hint(FSRandomAccessFile::kSequential);
|
2013-05-17 15:53:01 -07:00
|
|
|
break;
|
|
|
|
case Options::WILLNEED:
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 14:47:08 -08:00
|
|
|
rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed);
|
2013-05-17 15:53:01 -07:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-07 19:26:49 -08:00
|
|
|
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
|
|
|
|
const {
|
2013-11-19 16:29:42 -08:00
|
|
|
return rep_->table_properties;
|
2013-10-10 11:43:24 -07:00
|
|
|
}
|
2012-04-17 08:36:46 -07:00
|
|
|
|
2014-08-05 11:27:34 -07:00
|
|
|
size_t BlockBasedTable::ApproximateMemoryUsage() const {
|
|
|
|
size_t usage = 0;
|
2022-04-06 10:33:00 -07:00
|
|
|
if (rep_) {
|
|
|
|
usage += rep_->ApproximateMemoryUsage();
|
|
|
|
} else {
|
|
|
|
return usage;
|
|
|
|
}
|
2014-08-05 11:27:34 -07:00
|
|
|
if (rep_->filter) {
|
|
|
|
usage += rep_->filter->ApproximateMemoryUsage();
|
|
|
|
}
|
|
|
|
if (rep_->index_reader) {
|
|
|
|
usage += rep_->index_reader->ApproximateMemoryUsage();
|
|
|
|
}
|
2019-07-23 15:57:43 -07:00
|
|
|
if (rep_->uncompression_dict_reader) {
|
|
|
|
usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
|
2019-01-23 18:11:08 -08:00
|
|
|
}
|
2022-04-06 10:33:00 -07:00
|
|
|
if (rep_->table_properties) {
|
|
|
|
usage += rep_->table_properties->ApproximateMemoryUsage();
|
|
|
|
}
|
2014-08-05 11:27:34 -07:00
|
|
|
return usage;
|
|
|
|
}
|
|
|
|
|
2019-11-05 17:17:36 -08:00
|
|
|
// Load the meta-index-block from the file. On success, return the loaded
|
|
|
|
// metaindex
|
|
|
|
// block and its iterator.
|
|
|
|
Status BlockBasedTable::ReadMetaIndexBlock(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block>* metaindex_block,
|
|
|
|
std::unique_ptr<InternalIterator>* iter) {
|
2012-04-17 08:36:46 -07:00
|
|
|
// TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
|
|
|
|
// it is an empty block.
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block> metaindex;
|
2013-11-12 22:46:51 -08:00
|
|
|
Status s = ReadBlockFromFile(
|
2020-06-29 14:51:57 -07:00
|
|
|
rep_->file.get(), prefetch_buffer, rep_->footer, ro,
|
2019-11-05 17:17:36 -08:00
|
|
|
rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
|
2019-06-18 19:00:03 -07:00
|
|
|
true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex,
|
2019-06-03 12:31:45 -07:00
|
|
|
UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
|
2020-02-25 15:29:17 -08:00
|
|
|
0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options),
|
|
|
|
false /* for_compaction */, rep_->blocks_definitely_zstd_compressed,
|
|
|
|
nullptr /* filter_policy */);
|
2013-10-10 11:43:24 -07:00
|
|
|
|
2013-11-12 22:46:51 -08:00
|
|
|
if (!s.ok()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_ERROR(rep_->ioptions.logger,
|
2017-03-15 19:22:52 -07:00
|
|
|
"Encountered error while reading data from properties"
|
|
|
|
" block %s",
|
|
|
|
s.ToString().c_str());
|
2013-11-12 22:46:51 -08:00
|
|
|
return s;
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
2013-10-10 11:43:24 -07:00
|
|
|
|
2019-11-05 17:17:36 -08:00
|
|
|
*metaindex_block = std::move(metaindex);
|
2013-11-12 22:46:51 -08:00
|
|
|
// meta block uses bytewise comparator.
|
2021-12-21 11:29:52 -08:00
|
|
|
iter->reset(metaindex_block->get()->NewMetaIterator());
|
2013-11-12 22:46:51 -08:00
|
|
|
return Status::OK();
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
template <typename TBlocklike>
|
2014-02-28 18:19:07 -08:00
|
|
|
Status BlockBasedTable::GetDataBlockFromCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
|
2019-06-06 11:28:54 -07:00
|
|
|
const UncompressionDict& uncompression_dict, BlockType block_type,
|
2021-06-18 09:35:03 -07:00
|
|
|
const bool wait, GetContext* get_context) const {
|
2019-06-03 12:31:45 -07:00
|
|
|
const size_t read_amp_bytes_per_bit =
|
2019-06-06 11:28:54 -07:00
|
|
|
block_type == BlockType::kData
|
|
|
|
? rep_->table_options.read_amp_bytes_per_bit
|
|
|
|
: 0;
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(block);
|
|
|
|
assert(block->IsEmpty());
|
2021-05-21 18:28:28 -07:00
|
|
|
const Cache::Priority priority =
|
|
|
|
rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
|
|
|
|
(block_type == BlockType::kFilter ||
|
|
|
|
block_type == BlockType::kCompressionDictionary ||
|
|
|
|
block_type == BlockType::kIndex)
|
|
|
|
? Cache::Priority::HIGH
|
|
|
|
: Cache::Priority::LOW;
|
2019-05-10 11:53:33 -07:00
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
Status s;
|
2018-11-13 17:00:49 -08:00
|
|
|
BlockContents* compressed_block = nullptr;
|
2014-02-28 18:19:07 -08:00
|
|
|
Cache::Handle* block_cache_compressed_handle = nullptr;
|
2021-05-21 18:28:28 -07:00
|
|
|
Statistics* statistics = rep_->ioptions.statistics.get();
|
|
|
|
bool using_zstd = rep_->blocks_definitely_zstd_compressed;
|
|
|
|
const FilterPolicy* filter_policy = rep_->filter_policy;
|
2021-05-24 08:35:20 -07:00
|
|
|
Cache::CreateCallback create_cb = GetCreateCallback<TBlocklike>(
|
|
|
|
read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
|
2014-02-28 18:19:07 -08:00
|
|
|
|
|
|
|
// Lookup uncompressed cache first
|
|
|
|
if (block_cache != nullptr) {
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
assert(!cache_key.empty());
|
2021-10-19 15:53:16 -07:00
|
|
|
Cache::Handle* cache_handle = nullptr;
|
|
|
|
cache_handle = GetEntryFromCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
|
2021-10-19 15:53:16 -07:00
|
|
|
block_type, wait, get_context,
|
2021-05-21 18:28:28 -07:00
|
|
|
BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type), create_cb,
|
|
|
|
priority);
|
2019-05-10 11:53:33 -07:00
|
|
|
if (cache_handle != nullptr) {
|
|
|
|
block->SetCachedValue(
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
reinterpret_cast<TBlocklike*>(block_cache->Value(cache_handle)),
|
2019-05-10 11:53:33 -07:00
|
|
|
block_cache, cache_handle);
|
2014-02-28 18:19:07 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If not found, search from the compressed block cache.
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(block->IsEmpty());
|
2014-02-28 18:19:07 -08:00
|
|
|
|
|
|
|
if (block_cache_compressed == nullptr) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
assert(!cache_key.empty());
|
2021-05-21 18:28:28 -07:00
|
|
|
BlockContents contents;
|
2021-10-19 15:53:16 -07:00
|
|
|
if (rep_->ioptions.lowest_used_cache_tier ==
|
|
|
|
CacheTier::kNonVolatileBlockTier) {
|
|
|
|
Cache::CreateCallback create_cb_special = GetCreateCallback<BlockContents>(
|
|
|
|
read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
|
|
|
|
block_cache_compressed_handle = block_cache_compressed->Lookup(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
cache_key,
|
2021-10-19 15:53:16 -07:00
|
|
|
BlocklikeTraits<BlockContents>::GetCacheItemHelper(block_type),
|
|
|
|
create_cb_special, priority, true);
|
|
|
|
} else {
|
|
|
|
block_cache_compressed_handle =
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
block_cache_compressed->Lookup(cache_key, statistics);
|
2021-10-19 15:53:16 -07:00
|
|
|
}
|
2019-06-06 11:28:54 -07:00
|
|
|
|
2014-02-28 18:19:07 -08:00
|
|
|
// if we found in the compressed cache, then uncompress and insert into
|
|
|
|
// uncompressed cache
|
|
|
|
if (block_cache_compressed_handle == nullptr) {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// found compressed block
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
|
2018-11-13 17:00:49 -08:00
|
|
|
compressed_block = reinterpret_cast<BlockContents*>(
|
2014-02-28 18:19:07 -08:00
|
|
|
block_cache_compressed->Value(block_cache_compressed_handle));
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
CompressionType compression_type = GetBlockCompressionType(*compressed_block);
|
2018-11-13 17:00:49 -08:00
|
|
|
assert(compression_type != kNoCompression);
|
2014-02-28 18:19:07 -08:00
|
|
|
|
|
|
|
// Retrieve the uncompressed contents into a new buffer
|
2019-01-18 19:10:17 -08:00
|
|
|
UncompressionContext context(compression_type);
|
2019-01-23 18:11:08 -08:00
|
|
|
UncompressionInfo info(context, uncompression_dict, compression_type);
|
2019-06-03 12:31:45 -07:00
|
|
|
s = UncompressBlockContents(
|
|
|
|
info, compressed_block->data.data(), compressed_block->data.size(),
|
|
|
|
&contents, rep_->table_options.format_version, rep_->ioptions,
|
|
|
|
GetMemoryAllocator(rep_->table_options));
|
2014-02-28 18:19:07 -08:00
|
|
|
|
2021-05-21 18:28:28 -07:00
|
|
|
// Insert uncompressed block into block cache, the priority is based on the
|
|
|
|
// data block type.
|
2014-02-28 18:19:07 -08:00
|
|
|
if (s.ok()) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
std::unique_ptr<TBlocklike> block_holder(
|
|
|
|
BlocklikeTraits<TBlocklike>::Create(
|
2020-02-25 15:29:17 -08:00
|
|
|
std::move(contents), read_amp_bytes_per_bit, statistics,
|
2019-10-18 19:30:47 -07:00
|
|
|
rep_->blocks_definitely_zstd_compressed,
|
|
|
|
rep_->table_options.filter_policy.get())); // uncompressed block
|
2019-05-10 11:53:33 -07:00
|
|
|
|
|
|
|
if (block_cache != nullptr && block_holder->own_bytes() &&
|
2014-02-28 18:19:07 -08:00
|
|
|
read_options.fill_cache) {
|
2019-05-10 11:53:33 -07:00
|
|
|
size_t charge = block_holder->ApproximateMemoryUsage();
|
|
|
|
Cache::Handle* cache_handle = nullptr;
|
2021-10-19 15:53:16 -07:00
|
|
|
s = InsertEntryToCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
|
2021-10-19 15:53:16 -07:00
|
|
|
BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
|
|
|
|
block_holder, charge, &cache_handle, priority);
|
2016-03-10 17:35:19 -08:00
|
|
|
if (s.ok()) {
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(cache_handle != nullptr);
|
|
|
|
block->SetCachedValue(block_holder.release(), block_cache,
|
|
|
|
cache_handle);
|
|
|
|
|
2020-04-27 13:18:18 -07:00
|
|
|
UpdateCacheInsertionMetrics(block_type, get_context, charge,
|
2021-06-17 21:55:42 -07:00
|
|
|
s.IsOkOverwritten(), rep_->ioptions.stats);
|
2016-03-10 17:35:19 -08:00
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
|
|
|
|
}
|
2019-05-10 11:53:33 -07:00
|
|
|
} else {
|
|
|
|
block->SetOwnedValue(block_holder.release());
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Release hold on compressed cache entry
|
|
|
|
block_cache_compressed->Release(block_cache_compressed_handle);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
template <typename TBlocklike>
|
2014-02-28 18:19:07 -08:00
|
|
|
Status BlockBasedTable::PutDataBlockToCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
CachableEntry<TBlocklike>* cached_block, BlockContents* raw_block_contents,
|
2019-06-03 12:31:45 -07:00
|
|
|
CompressionType raw_block_comp_type,
|
2020-02-25 15:29:17 -08:00
|
|
|
const UncompressionDict& uncompression_dict,
|
2019-06-06 11:28:54 -07:00
|
|
|
MemoryAllocator* memory_allocator, BlockType block_type,
|
2019-06-03 12:31:45 -07:00
|
|
|
GetContext* get_context) const {
|
2021-05-05 13:59:21 -07:00
|
|
|
const ImmutableOptions& ioptions = rep_->ioptions;
|
2019-06-03 12:31:45 -07:00
|
|
|
const uint32_t format_version = rep_->table_options.format_version;
|
|
|
|
const size_t read_amp_bytes_per_bit =
|
2019-06-06 11:28:54 -07:00
|
|
|
block_type == BlockType::kData
|
|
|
|
? rep_->table_options.read_amp_bytes_per_bit
|
|
|
|
: 0;
|
2019-06-03 12:31:45 -07:00
|
|
|
const Cache::Priority priority =
|
2019-06-06 11:28:54 -07:00
|
|
|
rep_->table_options.cache_index_and_filter_blocks_with_high_priority &&
|
|
|
|
(block_type == BlockType::kFilter ||
|
|
|
|
block_type == BlockType::kCompressionDictionary ||
|
|
|
|
block_type == BlockType::kIndex)
|
2019-06-03 12:31:45 -07:00
|
|
|
? Cache::Priority::HIGH
|
|
|
|
: Cache::Priority::LOW;
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(cached_block);
|
|
|
|
assert(cached_block->IsEmpty());
|
2014-02-28 18:19:07 -08:00
|
|
|
|
|
|
|
Status s;
|
2021-04-26 12:43:02 -07:00
|
|
|
Statistics* statistics = ioptions.stats;
|
2019-05-10 11:53:33 -07:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
std::unique_ptr<TBlocklike> block_holder;
|
2018-11-13 17:00:49 -08:00
|
|
|
if (raw_block_comp_type != kNoCompression) {
|
2019-05-10 11:53:33 -07:00
|
|
|
// Retrieve the uncompressed contents into a new buffer
|
|
|
|
BlockContents uncompressed_block_contents;
|
2019-01-18 19:10:17 -08:00
|
|
|
UncompressionContext context(raw_block_comp_type);
|
2019-01-23 18:11:08 -08:00
|
|
|
UncompressionInfo info(context, uncompression_dict, raw_block_comp_type);
|
2019-01-18 19:10:17 -08:00
|
|
|
s = UncompressBlockContents(info, raw_block_contents->data.data(),
|
|
|
|
raw_block_contents->data.size(),
|
|
|
|
&uncompressed_block_contents, format_version,
|
|
|
|
ioptions, memory_allocator);
|
2019-05-10 11:53:33 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2014-02-28 18:19:07 -08:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
|
2020-02-25 15:29:17 -08:00
|
|
|
std::move(uncompressed_block_contents), read_amp_bytes_per_bit,
|
2019-10-18 19:30:47 -07:00
|
|
|
statistics, rep_->blocks_definitely_zstd_compressed,
|
|
|
|
rep_->table_options.filter_policy.get()));
|
2014-02-28 18:19:07 -08:00
|
|
|
} else {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
block_holder.reset(BlocklikeTraits<TBlocklike>::Create(
|
2020-02-25 15:29:17 -08:00
|
|
|
std::move(*raw_block_contents), read_amp_bytes_per_bit, statistics,
|
|
|
|
rep_->blocks_definitely_zstd_compressed,
|
2019-10-18 19:30:47 -07:00
|
|
|
rep_->table_options.filter_policy.get()));
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Insert compressed block into compressed block cache.
|
|
|
|
// Release the hold on the compressed cache entry immediately.
|
2018-11-13 17:00:49 -08:00
|
|
|
if (block_cache_compressed != nullptr &&
|
|
|
|
raw_block_comp_type != kNoCompression && raw_block_contents != nullptr &&
|
|
|
|
raw_block_contents->own_bytes()) {
|
|
|
|
assert(raw_block_contents->is_raw_block);
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
assert(!cache_key.empty());
|
2018-11-13 17:00:49 -08:00
|
|
|
|
|
|
|
// We cannot directly put raw_block_contents because this could point to
|
|
|
|
// an object in the stack.
|
2021-10-19 15:53:16 -07:00
|
|
|
std::unique_ptr<BlockContents> block_cont_for_comp_cache(
|
|
|
|
new BlockContents(std::move(*raw_block_contents)));
|
|
|
|
s = InsertEntryToCache(
|
|
|
|
rep_->ioptions.lowest_used_cache_tier, block_cache_compressed,
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
cache_key,
|
2021-05-21 18:28:28 -07:00
|
|
|
BlocklikeTraits<BlockContents>::GetCacheItemHelper(block_type),
|
2021-10-19 15:53:16 -07:00
|
|
|
block_cont_for_comp_cache,
|
|
|
|
block_cont_for_comp_cache->ApproximateMemoryUsage(), nullptr,
|
|
|
|
Cache::Priority::LOW);
|
|
|
|
|
|
|
|
BlockContents* block_cont_raw_ptr = block_cont_for_comp_cache.release();
|
2016-03-10 17:35:19 -08:00
|
|
|
if (s.ok()) {
|
|
|
|
// Avoid the following code to delete this cached block.
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
|
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
|
2021-10-19 15:53:16 -07:00
|
|
|
delete block_cont_raw_ptr;
|
2016-03-10 17:35:19 -08:00
|
|
|
}
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// insert into uncompressed block cache
|
2019-05-10 11:53:33 -07:00
|
|
|
if (block_cache != nullptr && block_holder->own_bytes()) {
|
|
|
|
size_t charge = block_holder->ApproximateMemoryUsage();
|
|
|
|
Cache::Handle* cache_handle = nullptr;
|
2021-10-19 15:53:16 -07:00
|
|
|
s = InsertEntryToCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key,
|
2021-10-19 15:53:16 -07:00
|
|
|
BlocklikeTraits<TBlocklike>::GetCacheItemHelper(block_type),
|
|
|
|
block_holder, charge, &cache_handle, priority);
|
2016-03-10 17:35:19 -08:00
|
|
|
if (s.ok()) {
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(cache_handle != nullptr);
|
|
|
|
cached_block->SetCachedValue(block_holder.release(), block_cache,
|
|
|
|
cache_handle);
|
|
|
|
|
2020-04-27 13:18:18 -07:00
|
|
|
UpdateCacheInsertionMetrics(block_type, get_context, charge,
|
2021-06-17 21:55:42 -07:00
|
|
|
s.IsOkOverwritten(), rep_->ioptions.stats);
|
2016-03-10 17:35:19 -08:00
|
|
|
} else {
|
|
|
|
RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
|
|
|
|
}
|
2019-05-10 11:53:33 -07:00
|
|
|
} else {
|
|
|
|
cached_block->SetOwnedValue(block_holder.release());
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, bool use_cache,
|
|
|
|
bool prefetch, bool pin, BlockCacheLookupContext* lookup_context) {
|
2017-03-22 09:11:23 -07:00
|
|
|
auto& rep = rep_;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
auto filter_type = rep->filter_type;
|
|
|
|
if (filter_type == Rep::FilterType::kNoFilter) {
|
|
|
|
return std::unique_ptr<FilterBlockReader>();
|
2015-09-02 15:36:47 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
assert(rep->filter_policy);
|
|
|
|
|
2017-03-22 09:11:23 -07:00
|
|
|
switch (filter_type) {
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
case Rep::FilterType::kPartitionedFilter:
|
|
|
|
return PartitionedFilterBlockReader::Create(
|
2020-06-29 14:51:57 -07:00
|
|
|
this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
|
2017-03-22 09:11:23 -07:00
|
|
|
|
|
|
|
case Rep::FilterType::kBlockFilter:
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
return BlockBasedFilterBlockReader::Create(
|
2020-06-29 14:51:57 -07:00
|
|
|
this, ro, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
|
|
|
case Rep::FilterType::kFullFilter:
|
2020-06-29 14:51:57 -07:00
|
|
|
return FullFilterBlockReader::Create(this, ro, prefetch_buffer, use_cache,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
prefetch, pin, lookup_context);
|
2015-09-02 15:36:47 -07:00
|
|
|
|
2017-03-22 09:11:23 -07:00
|
|
|
default:
|
|
|
|
// filter_type is either kNoFilter (exited the function at the first if),
|
|
|
|
// or it must be covered in this switch block
|
|
|
|
assert(false);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
return std::unique_ptr<FilterBlockReader>();
|
2019-06-14 17:37:24 -07:00
|
|
|
}
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
|
|
|
|
2018-05-21 14:33:55 -07:00
|
|
|
// disable_prefix_seek should be set to true when prefix_extractor found in SST
|
|
|
|
// differs from the one in mutable_cf_options and index type is HashBasedIndex
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
|
2018-05-21 14:33:55 -07:00
|
|
|
const ReadOptions& read_options, bool disable_prefix_seek,
|
2019-06-10 15:30:05 -07:00
|
|
|
IndexBlockIter* input_iter, GetContext* get_context,
|
|
|
|
BlockCacheLookupContext* lookup_context) const {
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
assert(rep_ != nullptr);
|
|
|
|
assert(rep_->index_reader != nullptr);
|
2014-02-28 18:19:07 -08:00
|
|
|
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
// We don't return pinned data from index blocks, so no need
|
2018-11-13 17:00:49 -08:00
|
|
|
// to set `block_contents_pinned`.
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
|
2019-06-10 15:30:05 -07:00
|
|
|
input_iter, get_context,
|
|
|
|
lookup_context);
|
2013-11-12 22:46:51 -08:00
|
|
|
}
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
template <>
|
|
|
|
DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
|
2020-02-25 15:29:17 -08:00
|
|
|
const Rep* rep, Block* block, BlockType block_type,
|
|
|
|
DataBlockIter* input_iter, bool block_contents_pinned) {
|
2020-07-07 17:25:08 -07:00
|
|
|
return block->NewDataIterator(rep->internal_comparator.user_comparator(),
|
|
|
|
rep->get_global_seqno(block_type), input_iter,
|
2021-04-26 12:43:02 -07:00
|
|
|
rep->ioptions.stats, block_contents_pinned);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
template <>
|
|
|
|
IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
|
2020-02-25 15:29:17 -08:00
|
|
|
const Rep* rep, Block* block, BlockType block_type,
|
|
|
|
IndexBlockIter* input_iter, bool block_contents_pinned) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
return block->NewIndexIterator(
|
2020-07-07 17:25:08 -07:00
|
|
|
rep->internal_comparator.user_comparator(),
|
2021-04-26 12:43:02 -07:00
|
|
|
rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats,
|
2020-02-25 15:29:17 -08:00
|
|
|
/* total_order_seek */ true, rep->index_has_first_key,
|
|
|
|
rep->index_key_includes_seq, rep->index_value_is_full,
|
|
|
|
block_contents_pinned);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
}
|
|
|
|
|
2019-06-30 20:52:34 -07:00
|
|
|
// If contents is nullptr, this function looks up the block caches for the
|
|
|
|
// data block referenced by handle, and read the block from disk if necessary.
|
|
|
|
// If contents is non-null, it skips the cache lookup and disk read, since
|
|
|
|
// the caller has already read it. In both cases, if ro.fill_cache is true,
|
|
|
|
// it inserts the block into the block cache.
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
template <typename TBlocklike>
|
2018-11-13 17:00:49 -08:00
|
|
|
Status BlockBasedTable::MaybeReadBlockAndLoadToCache(
|
2019-06-03 12:31:45 -07:00
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
2019-01-23 18:11:08 -08:00
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
2022-03-24 15:06:24 -07:00
|
|
|
const bool wait, const bool for_compaction,
|
|
|
|
CachableEntry<TBlocklike>* block_entry, BlockType block_type,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
|
|
|
BlockContents* contents) const {
|
2017-08-23 14:55:26 -07:00
|
|
|
assert(block_entry != nullptr);
|
2016-11-05 09:10:51 -07:00
|
|
|
const bool no_io = (ro.read_tier == kBlockCacheTier);
|
2019-06-03 12:31:45 -07:00
|
|
|
Cache* block_cache = rep_->table_options.block_cache.get();
|
2016-11-05 09:10:51 -07:00
|
|
|
Cache* block_cache_compressed =
|
2020-06-17 14:19:35 -07:00
|
|
|
rep_->table_options.block_cache_compressed.get();
|
2014-04-25 12:22:23 -07:00
|
|
|
|
2018-11-13 17:00:49 -08:00
|
|
|
// First, try to get the block from the cache
|
|
|
|
//
|
2014-04-25 12:22:23 -07:00
|
|
|
// If either block cache is enabled, we'll try to read from it.
|
2016-11-05 09:10:51 -07:00
|
|
|
Status s;
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
CacheKey key_data;
|
|
|
|
Slice key;
|
2019-06-14 17:37:24 -07:00
|
|
|
bool is_cache_hit = false;
|
2014-04-25 12:22:23 -07:00
|
|
|
if (block_cache != nullptr || block_cache_compressed != nullptr) {
|
|
|
|
// create key for block cache
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
key_data = GetCacheKey(rep_->base_cache_key, handle);
|
|
|
|
key = key_data.AsSlice();
|
2014-04-25 12:22:23 -07:00
|
|
|
|
2019-06-30 20:52:34 -07:00
|
|
|
if (!contents) {
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro,
|
|
|
|
block_entry, uncompression_dict, block_type,
|
2021-06-18 09:35:03 -07:00
|
|
|
wait, get_context);
|
|
|
|
// Value could still be null at this point, so check the cache handle
|
|
|
|
// and update the read pattern for prefetching
|
|
|
|
if (block_entry->GetValue() || block_entry->GetCacheHandle()) {
|
2019-06-30 20:52:34 -07:00
|
|
|
// TODO(haoyu): Differentiate cache hit on uncompressed block cache and
|
|
|
|
// compressed block cache.
|
|
|
|
is_cache_hit = true;
|
2021-04-28 12:52:53 -07:00
|
|
|
if (prefetch_buffer) {
|
|
|
|
// Update the block details so that PrefetchBuffer can use the read
|
|
|
|
// pattern to determine if reads are sequential or not for
|
|
|
|
// prefetching. It should also take in account blocks read from cache.
|
2022-03-21 07:12:43 -07:00
|
|
|
prefetch_buffer->UpdateReadPattern(
|
|
|
|
handle.offset(), BlockSizeWithTrailer(handle),
|
|
|
|
ro.adaptive_readahead /*decrease_readahead_size*/);
|
2021-04-28 12:52:53 -07:00
|
|
|
}
|
2019-06-30 20:52:34 -07:00
|
|
|
}
|
2019-06-14 17:37:24 -07:00
|
|
|
}
|
2019-06-30 20:52:34 -07:00
|
|
|
|
2018-11-13 17:00:49 -08:00
|
|
|
// Can't find the block from the cache. If I/O is allowed, read from the
|
|
|
|
// file.
|
2021-06-18 09:35:03 -07:00
|
|
|
if (block_entry->GetValue() == nullptr &&
|
|
|
|
block_entry->GetCacheHandle() == nullptr && !no_io && ro.fill_cache) {
|
2021-04-26 12:43:02 -07:00
|
|
|
Statistics* statistics = rep_->ioptions.stats;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const bool maybe_compressed =
|
2019-07-23 15:57:43 -07:00
|
|
|
block_type != BlockType::kFilter &&
|
|
|
|
block_type != BlockType::kCompressionDictionary &&
|
|
|
|
rep_->blocks_maybe_compressed;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const bool do_uncompress = maybe_compressed && !block_cache_compressed;
|
2018-11-13 17:00:49 -08:00
|
|
|
CompressionType raw_block_comp_type;
|
|
|
|
BlockContents raw_block_contents;
|
2019-06-30 20:52:34 -07:00
|
|
|
if (!contents) {
|
2022-03-24 15:06:24 -07:00
|
|
|
Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS
|
|
|
|
: READ_BLOCK_GET_MICROS;
|
|
|
|
StopWatch sw(rep_->ioptions.clock, statistics, histogram);
|
2018-11-13 17:00:49 -08:00
|
|
|
BlockFetcher block_fetcher(
|
2019-06-03 12:31:45 -07:00
|
|
|
rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
&raw_block_contents, rep_->ioptions, do_uncompress,
|
|
|
|
maybe_compressed, block_type, uncompression_dict,
|
|
|
|
rep_->persistent_cache_options,
|
2019-06-03 12:31:45 -07:00
|
|
|
GetMemoryAllocator(rep_->table_options),
|
|
|
|
GetMemoryAllocatorForCompressedBlock(rep_->table_options));
|
2018-11-13 17:00:49 -08:00
|
|
|
s = block_fetcher.ReadBlockContents();
|
|
|
|
raw_block_comp_type = block_fetcher.get_compression_type();
|
2019-06-30 20:52:34 -07:00
|
|
|
contents = &raw_block_contents;
|
2020-10-07 13:27:03 -07:00
|
|
|
if (get_context) {
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kIndex:
|
|
|
|
++get_context->get_context_stats_.num_index_read;
|
|
|
|
break;
|
|
|
|
case BlockType::kFilter:
|
|
|
|
++get_context->get_context_stats_.num_filter_read;
|
|
|
|
break;
|
|
|
|
case BlockType::kData:
|
|
|
|
++get_context->get_context_stats_.num_data_read;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2019-06-30 20:52:34 -07:00
|
|
|
} else {
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
raw_block_comp_type = GetBlockCompressionType(*contents);
|
2014-04-25 12:22:23 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2018-11-13 17:00:49 -08:00
|
|
|
// If filling cache is allowed and a cache is configured, try to put the
|
|
|
|
// block to the cache.
|
2019-09-20 12:00:55 -07:00
|
|
|
s = PutDataBlockToCache(
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
key, block_cache, block_cache_compressed, block_entry, contents,
|
|
|
|
raw_block_comp_type, uncompression_dict,
|
2019-09-20 12:00:55 -07:00
|
|
|
GetMemoryAllocator(rep_->table_options), block_type, get_context);
|
2014-04-25 12:22:23 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-06-14 17:37:24 -07:00
|
|
|
|
|
|
|
// Fill lookup_context.
|
2019-06-27 08:31:03 -07:00
|
|
|
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() &&
|
|
|
|
lookup_context) {
|
2019-06-14 17:37:24 -07:00
|
|
|
size_t usage = 0;
|
|
|
|
uint64_t nkeys = 0;
|
|
|
|
if (block_entry->GetValue()) {
|
|
|
|
// Approximate the number of keys in the block using restarts.
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
nkeys =
|
|
|
|
rep_->table_options.block_restart_interval *
|
|
|
|
BlocklikeTraits<TBlocklike>::GetNumRestarts(*block_entry->GetValue());
|
2019-06-14 17:37:24 -07:00
|
|
|
usage = block_entry->GetValue()->ApproximateMemoryUsage();
|
|
|
|
}
|
|
|
|
TraceType trace_block_type = TraceType::kTraceMax;
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kData:
|
|
|
|
trace_block_type = TraceType::kBlockTraceDataBlock;
|
|
|
|
break;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
case BlockType::kFilter:
|
|
|
|
trace_block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
break;
|
2019-07-23 15:57:43 -07:00
|
|
|
case BlockType::kCompressionDictionary:
|
|
|
|
trace_block_type = TraceType::kBlockTraceUncompressionDictBlock;
|
|
|
|
break;
|
2019-06-14 17:37:24 -07:00
|
|
|
case BlockType::kRangeDeletion:
|
|
|
|
trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
|
|
|
|
break;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
case BlockType::kIndex:
|
|
|
|
trace_block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
break;
|
2019-06-14 17:37:24 -07:00
|
|
|
default:
|
|
|
|
// This cannot happen.
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
}
|
2019-07-17 13:02:00 -07:00
|
|
|
bool no_insert = no_io || !ro.fill_cache;
|
|
|
|
if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(
|
2019-06-14 17:37:24 -07:00
|
|
|
trace_block_type, lookup_context->caller)) {
|
|
|
|
// Defer logging the access to Get() and MultiGet() to trace additional
|
2019-07-17 13:02:00 -07:00
|
|
|
// information, e.g., referenced_key_exist_in_block.
|
2019-06-14 17:37:24 -07:00
|
|
|
|
|
|
|
// Make a copy of the block key here since it will be logged later.
|
|
|
|
lookup_context->FillLookupContext(
|
|
|
|
is_cache_hit, no_insert, trace_block_type,
|
|
|
|
/*block_size=*/usage, /*block_key=*/key.ToString(), nkeys);
|
|
|
|
} else {
|
|
|
|
// Avoid making copy of block_key and cf_name when constructing the access
|
|
|
|
// record.
|
|
|
|
BlockCacheTraceRecord access_record(
|
2021-03-15 04:32:24 -07:00
|
|
|
rep_->ioptions.clock->NowMicros(),
|
2019-06-14 17:37:24 -07:00
|
|
|
/*block_key=*/"", trace_block_type,
|
|
|
|
/*block_size=*/usage, rep_->cf_id_for_tracing(),
|
|
|
|
/*cf_name=*/"", rep_->level_for_tracing(),
|
|
|
|
rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit,
|
2019-07-17 13:02:00 -07:00
|
|
|
no_insert, lookup_context->get_id,
|
|
|
|
lookup_context->get_from_user_specified_snapshot,
|
|
|
|
/*referenced_key=*/"");
|
2020-10-08 11:11:52 -07:00
|
|
|
// TODO: Should handle this error?
|
|
|
|
block_cache_tracer_
|
|
|
|
->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(),
|
|
|
|
lookup_context->referenced_key)
|
|
|
|
.PermitUncheckedError();
|
2019-06-14 17:37:24 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-10 11:53:33 -07:00
|
|
|
assert(s.ok() || block_entry->GetValue() == nullptr);
|
2016-11-05 09:10:51 -07:00
|
|
|
return s;
|
2012-09-27 01:05:38 -07:00
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
template <typename TBlocklike>
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
Status BlockBasedTable::RetrieveBlock(
|
2019-06-03 12:31:45 -07:00
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
CachableEntry<TBlocklike>* block_entry, BlockType block_type,
|
2019-06-19 14:07:36 -07:00
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
bool for_compaction, bool use_cache, bool wait_for_cache) const {
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
assert(block_entry);
|
|
|
|
assert(block_entry->IsEmpty());
|
|
|
|
|
|
|
|
Status s;
|
2019-08-14 18:13:14 -07:00
|
|
|
if (use_cache) {
|
2021-06-18 09:35:03 -07:00
|
|
|
s = MaybeReadBlockAndLoadToCache(
|
|
|
|
prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache,
|
2022-03-24 15:06:24 -07:00
|
|
|
for_compaction, block_entry, block_type, get_context, lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
/*contents=*/nullptr);
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2021-06-18 09:35:03 -07:00
|
|
|
if (block_entry->GetValue() != nullptr ||
|
|
|
|
block_entry->GetCacheHandle() != nullptr) {
|
2019-05-30 17:39:43 -07:00
|
|
|
assert(s.ok());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(block_entry->IsEmpty());
|
|
|
|
|
|
|
|
const bool no_io = ro.read_tier == kBlockCacheTier;
|
|
|
|
if (no_io) {
|
|
|
|
return Status::Incomplete("no blocking io");
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const bool maybe_compressed =
|
2019-07-23 15:57:43 -07:00
|
|
|
block_type != BlockType::kFilter &&
|
|
|
|
block_type != BlockType::kCompressionDictionary &&
|
|
|
|
rep_->blocks_maybe_compressed;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const bool do_uncompress = maybe_compressed;
|
|
|
|
std::unique_ptr<TBlocklike> block;
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
|
|
|
|
{
|
2022-03-24 15:06:24 -07:00
|
|
|
Histograms histogram =
|
|
|
|
for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
|
|
|
|
StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
|
2019-05-30 17:39:43 -07:00
|
|
|
s = ReadBlockFromFile(
|
2019-06-03 12:31:45 -07:00
|
|
|
rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
rep_->ioptions, do_uncompress, maybe_compressed, block_type,
|
|
|
|
uncompression_dict, rep_->persistent_cache_options,
|
2019-06-06 11:28:54 -07:00
|
|
|
block_type == BlockType::kData
|
|
|
|
? rep_->table_options.read_amp_bytes_per_bit
|
|
|
|
: 0,
|
2019-08-23 08:25:52 -07:00
|
|
|
GetMemoryAllocator(rep_->table_options), for_compaction,
|
2019-10-18 19:30:47 -07:00
|
|
|
rep_->blocks_definitely_zstd_compressed,
|
|
|
|
rep_->table_options.filter_policy.get());
|
2020-10-07 13:27:03 -07:00
|
|
|
|
|
|
|
if (get_context) {
|
|
|
|
switch (block_type) {
|
|
|
|
case BlockType::kIndex:
|
|
|
|
++(get_context->get_context_stats_.num_index_read);
|
|
|
|
break;
|
|
|
|
case BlockType::kFilter:
|
|
|
|
++(get_context->get_context_stats_.num_filter_read);
|
|
|
|
break;
|
|
|
|
case BlockType::kData:
|
|
|
|
++(get_context->get_context_stats_.num_data_read);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
block_entry->SetOwnedValue(block.release());
|
|
|
|
|
|
|
|
assert(s.ok());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
// Explicitly instantiate templates for both "blocklike" types we use.
|
|
|
|
// This makes it possible to keep the template definitions in the .cc file.
|
|
|
|
template Status BlockBasedTable::RetrieveBlock<BlockContents>(
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
|
|
|
CachableEntry<BlockContents>* block_entry, BlockType block_type,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
bool for_compaction, bool use_cache, bool wait_for_cache) const;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
2019-10-18 19:30:47 -07:00
|
|
|
template Status BlockBasedTable::RetrieveBlock<ParsedFullFilterBlock>(
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
|
|
|
CachableEntry<ParsedFullFilterBlock>* block_entry, BlockType block_type,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
bool for_compaction, bool use_cache, bool wait_for_cache) const;
|
2019-10-18 19:30:47 -07:00
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
template Status BlockBasedTable::RetrieveBlock<Block>(
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
|
|
|
CachableEntry<Block>* block_entry, BlockType block_type,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
bool for_compaction, bool use_cache, bool wait_for_cache) const;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
2019-08-23 08:25:52 -07:00
|
|
|
template Status BlockBasedTable::RetrieveBlock<UncompressionDict>(
|
|
|
|
FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
|
|
|
|
const BlockHandle& handle, const UncompressionDict& uncompression_dict,
|
|
|
|
CachableEntry<UncompressionDict>* block_entry, BlockType block_type,
|
|
|
|
GetContext* get_context, BlockCacheLookupContext* lookup_context,
|
2021-06-18 09:35:03 -07:00
|
|
|
bool for_compaction, bool use_cache, bool wait_for_cache) const;
|
2019-08-23 08:25:52 -07:00
|
|
|
|
2018-02-12 16:57:56 -08:00
|
|
|
BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
|
2019-05-31 11:37:21 -07:00
|
|
|
const BlockBasedTable* table,
|
Meta-internal folly integration with F14FastMap (#9546)
Summary:
Especially after updating to C++17, I don't see a compelling case for
*requiring* any folly components in RocksDB. I was able to purge the existing
hard dependencies, and it can be quite difficult to strip out non-trivial components
from folly for use in RocksDB. (The prospect of doing that on F14 has changed
my mind on the best approach here.)
But this change creates an optional integration where we can plug in
components from folly at compile time, starting here with F14FastMap to replace
std::unordered_map when possible (probably no public APIs for example). I have
replaced the biggest CPU users of std::unordered_map with compile-time
pluggable UnorderedMap which will use F14FastMap when USE_FOLLY is set.
USE_FOLLY is always set in the Meta-internal buck build, and a simulation of
that is in the Makefile for public CI testing. A full folly build is not needed, but
checking out the full folly repo is much simpler for getting the dependency,
and anything else we might want to optionally integrate in the future.
Some picky details:
* I don't think the distributed mutex stuff is actually used, so it was easy to remove.
* I implemented an alternative to `folly::constexpr_log2` (which is much easier
in C++17 than C++11) so that I could pull out the hard dependencies on
`ConstexprMath.h`
* I had to add noexcept move constructors/operators to some types to make
F14's complainUnlessNothrowMoveAndDestroy check happy, and I added a
macro to make that easier in some common cases.
* Updated Meta-internal buck build to use folly F14Map (always)
No updates to HISTORY.md nor INSTALL.md as this is not (yet?) considered a
production integration for open source users.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9546
Test Plan:
CircleCI tests updated so that a couple of them use folly.
Most internal unit & stress/crash tests updated to use Meta-internal latest folly.
(Note: they should probably use buck but they currently use Makefile.)
Example performance improvement: when filter partitions are pinned in cache,
they are tracked by PartitionedFilterBlockReader::filter_map_ and we can build
a test that exercises that heavily. Build DB with
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -partition_index_and_filters
```
and test with (simultaneous runs with & without folly, ~20 times each to see
convergence)
```
TEST_TMPDIR=/dev/shm/rocksdb ./db_bench_folly -readonly -use_existing_db -benchmarks=readrandom -num=10000000 -bloom_bits=16 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -partition_index_and_filters -duration=40 -pin_l0_filter_and_index_blocks_in_cache
```
Average ops/s no folly: 26229.2
Average ops/s with folly: 26853.3 (+2.4%)
Reviewed By: ajkr
Differential Revision: D34181736
Pulled By: pdillinger
fbshipit-source-id: ffa6ad5104c2880321d8a1aa7187e00ab0d02e94
2022-04-13 07:34:01 -07:00
|
|
|
UnorderedMap<uint64_t, CachableEntry<Block>>* block_map)
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
: table_(table), block_map_(block_map) {}
|
|
|
|
|
|
|
|
InternalIteratorBase<IndexValue>*
|
2018-02-12 16:57:56 -08:00
|
|
|
BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
|
2018-08-09 16:49:45 -07:00
|
|
|
const BlockHandle& handle) {
|
2017-02-06 16:29:29 -08:00
|
|
|
// Return a block iterator on the index partition
|
2018-02-12 16:57:56 -08:00
|
|
|
auto block = block_map_->find(handle.offset());
|
2021-10-27 17:21:48 -07:00
|
|
|
// block_map_ must be exhaustive
|
|
|
|
if (block == block_map_->end()) {
|
|
|
|
assert(false);
|
|
|
|
// Signal problem to caller
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
const Rep* rep = table_->get_rep();
|
|
|
|
assert(rep);
|
|
|
|
|
|
|
|
Statistics* kNullStats = nullptr;
|
|
|
|
// We don't return pinned data from index blocks, so no need
|
|
|
|
// to set `block_contents_pinned`.
|
|
|
|
return block->second.GetValue()->NewIndexIterator(
|
|
|
|
rep->internal_comparator.user_comparator(),
|
|
|
|
rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true,
|
|
|
|
rep->index_has_first_key, rep->index_key_includes_seq,
|
|
|
|
rep->index_value_is_full);
|
2017-05-05 15:01:04 -07:00
|
|
|
}
|
|
|
|
|
2013-08-13 14:04:56 -07:00
|
|
|
// This will be broken if the user specifies an unusual implementation
|
|
|
|
// of Options.comparator, or if the user specifies an unusual
|
2014-08-25 14:22:05 -07:00
|
|
|
// definition of prefixes in BlockBasedTableOptions.filter_policy.
|
|
|
|
// In particular, we require the following three properties:
|
2013-08-13 14:04:56 -07:00
|
|
|
//
|
|
|
|
// 1) key.starts_with(prefix(key))
|
|
|
|
// 2) Compare(prefix(key), key) <= 0.
|
|
|
|
// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0
|
2013-08-23 14:49:57 -07:00
|
|
|
//
|
Fix iterator reading filter block despite read_tier == kBlockCacheTier (#6562)
Summary:
We're seeing iterators with `ReadOptions::read_tier == kBlockCacheTier` sometimes doing file reads. Stack trace:
```
rocksdb::RandomAccessFileReader::Read(unsigned long, unsigned long, rocksdb::Slice*, char*, bool) const
rocksdb::BlockFetcher::ReadBlockContents()
rocksdb::Status rocksdb::BlockBasedTable::MaybeReadBlockAndLoadToCache<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::BlockContents*) const
rocksdb::Status rocksdb::BlockBasedTable::RetrieveBlock<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool, bool) const
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::ReadFilterBlock(rocksdb::BlockBasedTable const*, rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*)
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::GetOrReadFilterBlock(bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*) const
rocksdb::FullFilterBlockReader::MayMatch(rocksdb::Slice const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*) const
rocksdb::FullFilterBlockReader::RangeMayExist(rocksdb::Slice const*, rocksdb::Slice const&, rocksdb::SliceTransform const*, rocksdb::Comparator const*, rocksdb::Slice const*, bool*, bool, rocksdb::BlockCacheLookupContext*)
rocksdb::BlockBasedTable::PrefixMayMatch(rocksdb::Slice const&, rocksdb::ReadOptions const&, rocksdb::SliceTransform const*, bool, rocksdb::BlockCacheLookupContext*) const
rocksdb::BlockBasedTableIterator<rocksdb::DataBlockIter, rocksdb::Slice>::SeekImpl(rocksdb::Slice const*)
rocksdb::ForwardIterator::SeekInternal(rocksdb::Slice const&, bool)
rocksdb::DBIter::Seek(rocksdb::Slice const&)
```
`BlockBasedTableIterator::CheckPrefixMayMatch` was missing a check for `kBlockCacheTier`. This PR adds it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6562
Test Plan: deployed it to a logdevice test cluster and looked at logdevice's IO tracing.
Reviewed By: siying
Differential Revision: D20529368
Pulled By: al13n321
fbshipit-source-id: 65bf33964b1951464415c900336635fb20919611
2020-03-26 15:18:03 -07:00
|
|
|
// If read_options.read_tier == kBlockCacheTier, this method will do no I/O and
|
|
|
|
// will return true if the filter block is not in memory and not found in block
|
|
|
|
// cache.
|
2013-11-12 22:46:51 -08:00
|
|
|
//
|
|
|
|
// REQUIRES: this method shouldn't be called while the DB lock is held.
|
2018-06-26 15:56:26 -07:00
|
|
|
bool BlockBasedTable::PrefixMayMatch(
|
|
|
|
const Slice& internal_key, const ReadOptions& read_options,
|
|
|
|
const SliceTransform* options_prefix_extractor,
|
2019-06-10 15:30:05 -07:00
|
|
|
const bool need_upper_bound_check,
|
|
|
|
BlockCacheLookupContext* lookup_context) const {
|
2014-08-25 14:22:05 -07:00
|
|
|
if (!rep_->filter_policy) {
|
2014-06-10 09:36:59 -07:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-06-26 15:56:26 -07:00
|
|
|
const SliceTransform* prefix_extractor;
|
|
|
|
|
|
|
|
if (rep_->table_prefix_extractor == nullptr) {
|
|
|
|
if (need_upper_bound_check) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
prefix_extractor = options_prefix_extractor;
|
|
|
|
} else {
|
|
|
|
prefix_extractor = rep_->table_prefix_extractor.get();
|
|
|
|
}
|
2020-12-01 14:05:19 -08:00
|
|
|
auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
|
|
|
|
auto user_key_without_ts =
|
|
|
|
ExtractUserKeyAndStripTimestamp(internal_key, ts_sz);
|
|
|
|
if (!prefix_extractor->InDomain(user_key_without_ts)) {
|
2016-01-26 14:47:42 -08:00
|
|
|
return true;
|
|
|
|
}
|
2014-04-25 12:22:23 -07:00
|
|
|
|
2013-08-13 14:04:56 -07:00
|
|
|
bool may_match = true;
|
|
|
|
|
2014-09-08 10:37:05 -07:00
|
|
|
// First, try check with full filter
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
FilterBlockReader* const filter = rep_->filter.get();
|
2018-06-26 15:56:26 -07:00
|
|
|
bool filter_checked = true;
|
2016-04-13 13:02:33 -07:00
|
|
|
if (filter != nullptr) {
|
Fix iterator reading filter block despite read_tier == kBlockCacheTier (#6562)
Summary:
We're seeing iterators with `ReadOptions::read_tier == kBlockCacheTier` sometimes doing file reads. Stack trace:
```
rocksdb::RandomAccessFileReader::Read(unsigned long, unsigned long, rocksdb::Slice*, char*, bool) const
rocksdb::BlockFetcher::ReadBlockContents()
rocksdb::Status rocksdb::BlockBasedTable::MaybeReadBlockAndLoadToCache<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::BlockContents*) const
rocksdb::Status rocksdb::BlockBasedTable::RetrieveBlock<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool, bool) const
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::ReadFilterBlock(rocksdb::BlockBasedTable const*, rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*)
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::GetOrReadFilterBlock(bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*) const
rocksdb::FullFilterBlockReader::MayMatch(rocksdb::Slice const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*) const
rocksdb::FullFilterBlockReader::RangeMayExist(rocksdb::Slice const*, rocksdb::Slice const&, rocksdb::SliceTransform const*, rocksdb::Comparator const*, rocksdb::Slice const*, bool*, bool, rocksdb::BlockCacheLookupContext*)
rocksdb::BlockBasedTable::PrefixMayMatch(rocksdb::Slice const&, rocksdb::ReadOptions const&, rocksdb::SliceTransform const*, bool, rocksdb::BlockCacheLookupContext*) const
rocksdb::BlockBasedTableIterator<rocksdb::DataBlockIter, rocksdb::Slice>::SeekImpl(rocksdb::Slice const*)
rocksdb::ForwardIterator::SeekInternal(rocksdb::Slice const&, bool)
rocksdb::DBIter::Seek(rocksdb::Slice const&)
```
`BlockBasedTableIterator::CheckPrefixMayMatch` was missing a check for `kBlockCacheTier`. This PR adds it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6562
Test Plan: deployed it to a logdevice test cluster and looked at logdevice's IO tracing.
Reviewed By: siying
Differential Revision: D20529368
Pulled By: al13n321
fbshipit-source-id: 65bf33964b1951464415c900336635fb20919611
2020-03-26 15:18:03 -07:00
|
|
|
const bool no_io = read_options.read_tier == kBlockCacheTier;
|
|
|
|
|
2016-04-13 13:02:33 -07:00
|
|
|
if (!filter->IsBlockBased()) {
|
2017-03-22 09:11:23 -07:00
|
|
|
const Slice* const const_ikey_ptr = &internal_key;
|
2018-06-26 15:56:26 -07:00
|
|
|
may_match = filter->RangeMayExist(
|
2020-12-01 14:05:19 -08:00
|
|
|
read_options.iterate_upper_bound, user_key_without_ts,
|
|
|
|
prefix_extractor, rep_->internal_comparator.user_comparator(),
|
|
|
|
const_ikey_ptr, &filter_checked, need_upper_bound_check, no_io,
|
|
|
|
lookup_context);
|
2016-04-13 13:02:33 -07:00
|
|
|
} else {
|
2018-06-26 15:56:26 -07:00
|
|
|
// if prefix_extractor changed for block based filter, skip filter
|
|
|
|
if (need_upper_bound_check) {
|
|
|
|
return true;
|
|
|
|
}
|
2020-12-01 14:05:19 -08:00
|
|
|
auto prefix = prefix_extractor->Transform(user_key_without_ts);
|
2017-03-22 09:11:23 -07:00
|
|
|
InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
|
|
|
|
auto internal_prefix = internal_key_prefix.Encode();
|
|
|
|
|
|
|
|
// To prevent any io operation in this method, we set `read_tier` to make
|
|
|
|
// sure we always read index or filter only when they have already been
|
|
|
|
// loaded to memory.
|
|
|
|
ReadOptions no_io_read_options;
|
|
|
|
no_io_read_options.read_tier = kBlockCacheTier;
|
|
|
|
|
2016-04-13 13:02:33 -07:00
|
|
|
// Then, try find it within each block
|
2018-05-21 14:33:55 -07:00
|
|
|
// we already know prefix_extractor and prefix_extractor_name must match
|
|
|
|
// because `CheckPrefixMayMatch` first checks `check_filter_ == true`
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
|
2019-06-10 15:30:05 -07:00
|
|
|
no_io_read_options,
|
|
|
|
/*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
/*get_context=*/nullptr, lookup_context));
|
2016-04-13 13:02:33 -07:00
|
|
|
iiter->Seek(internal_prefix);
|
|
|
|
|
|
|
|
if (!iiter->Valid()) {
|
|
|
|
// we're past end of file
|
|
|
|
// if it's incomplete, it means that we avoided I/O
|
|
|
|
// and we're not really sure that we're past the end
|
|
|
|
// of the file
|
|
|
|
may_match = iiter->status().IsIncomplete();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
} else if ((rep_->index_key_includes_seq ? ExtractUserKey(iiter->key())
|
|
|
|
: iiter->key())
|
2016-04-13 13:02:33 -07:00
|
|
|
.starts_with(ExtractUserKey(internal_prefix))) {
|
|
|
|
// we need to check for this subtle case because our only
|
|
|
|
// guarantee is that "the key is a string >= last key in that data
|
|
|
|
// block" according to the doc/table_format.txt spec.
|
|
|
|
//
|
|
|
|
// Suppose iiter->key() starts with the desired prefix; it is not
|
|
|
|
// necessarily the case that the corresponding data block will
|
|
|
|
// contain the prefix, since iiter->key() need not be in the
|
|
|
|
// block. However, the next data block may contain the prefix, so
|
|
|
|
// we return true to play it safe.
|
|
|
|
may_match = true;
|
|
|
|
} else if (filter->IsBlockBased()) {
|
|
|
|
// iiter->key() does NOT start with the desired prefix. Because
|
|
|
|
// Seek() finds the first key that is >= the seek target, this
|
|
|
|
// means that iiter->key() > prefix. Thus, any data blocks coming
|
|
|
|
// after the data block corresponding to iiter->key() cannot
|
|
|
|
// possibly contain the key. Thus, the corresponding data block
|
|
|
|
// is the only on could potentially contain the prefix.
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockHandle handle = iiter->value().handle;
|
2019-06-10 15:30:05 -07:00
|
|
|
may_match = filter->PrefixMayMatch(
|
Fix iterator reading filter block despite read_tier == kBlockCacheTier (#6562)
Summary:
We're seeing iterators with `ReadOptions::read_tier == kBlockCacheTier` sometimes doing file reads. Stack trace:
```
rocksdb::RandomAccessFileReader::Read(unsigned long, unsigned long, rocksdb::Slice*, char*, bool) const
rocksdb::BlockFetcher::ReadBlockContents()
rocksdb::Status rocksdb::BlockBasedTable::MaybeReadBlockAndLoadToCache<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::BlockContents*) const
rocksdb::Status rocksdb::BlockBasedTable::RetrieveBlock<rocksdb::ParsedFullFilterBlock>(rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, rocksdb::BlockHandle const&, rocksdb::UncompressionDict const&, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*, rocksdb::BlockType, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, bool, bool) const
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::ReadFilterBlock(rocksdb::BlockBasedTable const*, rocksdb::FilePrefetchBuffer*, rocksdb::ReadOptions const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*)
rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>::GetOrReadFilterBlock(bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*, rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock>*) const
rocksdb::FullFilterBlockReader::MayMatch(rocksdb::Slice const&, bool, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*) const
rocksdb::FullFilterBlockReader::RangeMayExist(rocksdb::Slice const*, rocksdb::Slice const&, rocksdb::SliceTransform const*, rocksdb::Comparator const*, rocksdb::Slice const*, bool*, bool, rocksdb::BlockCacheLookupContext*)
rocksdb::BlockBasedTable::PrefixMayMatch(rocksdb::Slice const&, rocksdb::ReadOptions const&, rocksdb::SliceTransform const*, bool, rocksdb::BlockCacheLookupContext*) const
rocksdb::BlockBasedTableIterator<rocksdb::DataBlockIter, rocksdb::Slice>::SeekImpl(rocksdb::Slice const*)
rocksdb::ForwardIterator::SeekInternal(rocksdb::Slice const&, bool)
rocksdb::DBIter::Seek(rocksdb::Slice const&)
```
`BlockBasedTableIterator::CheckPrefixMayMatch` was missing a check for `kBlockCacheTier`. This PR adds it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6562
Test Plan: deployed it to a logdevice test cluster and looked at logdevice's IO tracing.
Reviewed By: siying
Differential Revision: D20529368
Pulled By: al13n321
fbshipit-source-id: 65bf33964b1951464415c900336635fb20919611
2020-03-26 15:18:03 -07:00
|
|
|
prefix, prefix_extractor, handle.offset(), no_io,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
/*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
|
2016-04-13 13:02:33 -07:00
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
}
|
2013-08-13 14:04:56 -07:00
|
|
|
}
|
2013-08-23 14:49:57 -07:00
|
|
|
|
2018-06-26 15:56:26 -07:00
|
|
|
if (filter_checked) {
|
2021-04-26 12:43:02 -07:00
|
|
|
Statistics* statistics = rep_->ioptions.stats;
|
2018-06-26 15:56:26 -07:00
|
|
|
RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
|
|
|
|
if (!may_match) {
|
|
|
|
RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
|
|
|
|
}
|
2013-08-23 14:49:57 -07:00
|
|
|
}
|
|
|
|
|
2013-08-13 14:04:56 -07:00
|
|
|
return may_match;
|
|
|
|
}
|
|
|
|
|
2022-01-21 11:36:36 -08:00
|
|
|
bool BlockBasedTable::PrefixExtractorChanged(
|
|
|
|
const SliceTransform* prefix_extractor) const {
|
|
|
|
if (prefix_extractor == nullptr) {
|
|
|
|
return true;
|
|
|
|
} else if (prefix_extractor == rep_->table_prefix_extractor.get()) {
|
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
return PrefixExtractorChangedHelper(rep_->table_properties.get(),
|
|
|
|
prefix_extractor);
|
|
|
|
}
|
|
|
|
}
|
2019-07-02 11:45:32 -07:00
|
|
|
|
2018-05-21 14:33:55 -07:00
|
|
|
InternalIterator* BlockBasedTable::NewIterator(
|
|
|
|
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
|
2019-09-20 12:00:55 -07:00
|
|
|
Arena* arena, bool skip_filters, TableReaderCaller caller,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
size_t compaction_readahead_size, bool allow_unprepared_value) {
|
2019-06-20 14:28:22 -07:00
|
|
|
BlockCacheLookupContext lookup_context{caller};
|
2018-06-26 15:56:26 -07:00
|
|
|
bool need_upper_bound_check =
|
2022-01-21 11:36:36 -08:00
|
|
|
read_options.auto_prefix_mode || PrefixExtractorChanged(prefix_extractor);
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 12:17:34 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(NewIndexIterator(
|
|
|
|
read_options,
|
|
|
|
need_upper_bound_check &&
|
|
|
|
rep_->index_type == BlockBasedTableOptions::kHashSearch,
|
|
|
|
/*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context));
|
2018-02-12 16:57:56 -08:00
|
|
|
if (arena == nullptr) {
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 12:17:34 -07:00
|
|
|
return new BlockBasedTableIterator(
|
|
|
|
this, read_options, rep_->internal_comparator, std::move(index_iter),
|
2018-02-12 16:57:56 -08:00
|
|
|
!skip_filters && !read_options.total_order_seek &&
|
2018-06-26 15:56:26 -07:00
|
|
|
prefix_extractor != nullptr,
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 12:17:34 -07:00
|
|
|
need_upper_bound_check, prefix_extractor, caller,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
compaction_readahead_size, allow_unprepared_value);
|
2018-02-12 16:57:56 -08:00
|
|
|
} else {
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 12:17:34 -07:00
|
|
|
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
|
|
|
|
return new (mem) BlockBasedTableIterator(
|
|
|
|
this, read_options, rep_->internal_comparator, std::move(index_iter),
|
2018-02-12 16:57:56 -08:00
|
|
|
!skip_filters && !read_options.total_order_seek &&
|
2018-06-26 15:56:26 -07:00
|
|
|
prefix_extractor != nullptr,
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 12:17:34 -07:00
|
|
|
need_upper_bound_check, prefix_extractor, caller,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-15 17:37:23 -07:00
|
|
|
compaction_readahead_size, allow_unprepared_value);
|
2018-02-12 16:57:56 -08:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2018-11-28 15:26:56 -08:00
|
|
|
FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator(
|
2018-11-14 16:18:16 -08:00
|
|
|
const ReadOptions& read_options) {
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-25 19:25:00 -07:00
|
|
|
if (rep_->fragmented_range_dels == nullptr) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
2018-11-14 16:18:16 -08:00
|
|
|
SequenceNumber snapshot = kMaxSequenceNumber;
|
|
|
|
if (read_options.snapshot != nullptr) {
|
|
|
|
snapshot = read_options.snapshot->GetSequenceNumber();
|
|
|
|
}
|
|
|
|
return new FragmentedRangeTombstoneIterator(
|
2018-12-11 11:44:24 -08:00
|
|
|
rep_->fragmented_range_dels, rep_->internal_comparator, snapshot);
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-25 19:25:00 -07:00
|
|
|
}
|
|
|
|
|
2018-05-21 14:33:55 -07:00
|
|
|
bool BlockBasedTable::FullFilterKeyMayMatch(
|
2022-01-31 19:45:17 -08:00
|
|
|
FilterBlockReader* filter, const Slice& internal_key, const bool no_io,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
const SliceTransform* prefix_extractor, GetContext* get_context,
|
2019-06-10 15:30:05 -07:00
|
|
|
BlockCacheLookupContext* lookup_context) const {
|
2015-02-02 17:42:57 -08:00
|
|
|
if (filter == nullptr || filter->IsBlockBased()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
Slice user_key = ExtractUserKey(internal_key);
|
2017-03-22 09:11:23 -07:00
|
|
|
const Slice* const const_ikey_ptr = &internal_key;
|
2018-04-05 15:54:24 -07:00
|
|
|
bool may_match = true;
|
2020-11-03 09:44:21 -08:00
|
|
|
size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size();
|
|
|
|
Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz);
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (rep_->whole_key_filtering) {
|
2019-06-10 15:30:05 -07:00
|
|
|
may_match =
|
|
|
|
filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
no_io, const_ikey_ptr, get_context, lookup_context);
|
2022-01-31 19:45:17 -08:00
|
|
|
} else if (!PrefixExtractorChanged(prefix_extractor) &&
|
2020-11-03 09:44:21 -08:00
|
|
|
prefix_extractor->InDomain(user_key_without_ts) &&
|
|
|
|
!filter->PrefixMayMatch(
|
|
|
|
prefix_extractor->Transform(user_key_without_ts),
|
|
|
|
prefix_extractor, kNotValid, no_io, const_ikey_ptr,
|
|
|
|
get_context, lookup_context)) {
|
2022-01-31 19:45:17 -08:00
|
|
|
// FIXME ^^^: there should be no reason for Get() to depend on current
|
|
|
|
// prefix_extractor at all. It should always use table_prefix_extractor.
|
2018-04-05 15:54:24 -07:00
|
|
|
may_match = false;
|
|
|
|
}
|
|
|
|
if (may_match) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE);
|
2018-10-24 12:10:59 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level);
|
2015-02-02 17:42:57 -08:00
|
|
|
}
|
2018-04-05 15:54:24 -07:00
|
|
|
return may_match;
|
2015-02-02 17:42:57 -08:00
|
|
|
}
|
|
|
|
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 14:24:09 -07:00
|
|
|
void BlockBasedTable::FullFilterKeysMayMatch(
|
2022-01-31 19:45:17 -08:00
|
|
|
FilterBlockReader* filter, MultiGetRange* range, const bool no_io,
|
2019-06-10 15:30:05 -07:00
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
BlockCacheLookupContext* lookup_context) const {
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 14:24:09 -07:00
|
|
|
if (filter == nullptr || filter->IsBlockBased()) {
|
|
|
|
return;
|
|
|
|
}
|
2020-04-28 14:46:13 -07:00
|
|
|
uint64_t before_keys = range->KeysLeft();
|
|
|
|
assert(before_keys > 0); // Caller should ensure
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
if (rep_->whole_key_filtering) {
|
2019-06-10 15:30:05 -07:00
|
|
|
filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io,
|
|
|
|
lookup_context);
|
2020-04-28 14:46:13 -07:00
|
|
|
uint64_t after_keys = range->KeysLeft();
|
|
|
|
if (after_keys) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys);
|
2020-04-28 14:46:13 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys,
|
|
|
|
rep_->level);
|
|
|
|
}
|
|
|
|
uint64_t filtered_keys = before_keys - after_keys;
|
|
|
|
if (filtered_keys) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys);
|
2020-04-28 14:46:13 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys,
|
|
|
|
rep_->level);
|
|
|
|
}
|
2022-01-31 19:45:17 -08:00
|
|
|
} else if (!PrefixExtractorChanged(prefix_extractor)) {
|
|
|
|
// FIXME ^^^: there should be no reason for MultiGet() to depend on current
|
|
|
|
// prefix_extractor at all. It should always use table_prefix_extractor.
|
2019-06-10 15:30:05 -07:00
|
|
|
filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false,
|
|
|
|
lookup_context);
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys);
|
2020-04-28 14:46:13 -07:00
|
|
|
uint64_t after_keys = range->KeysLeft();
|
|
|
|
uint64_t filtered_keys = before_keys - after_keys;
|
|
|
|
if (filtered_keys) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL,
|
2020-04-28 14:46:13 -07:00
|
|
|
filtered_keys);
|
|
|
|
}
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 14:24:09 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Skip bottom-level filter block caching when hit-optimized
Summary:
When Get() or NewIterator() trigger file loads, skip caching the filter block if
(1) optimize_filters_for_hits is set and (2) the file is on the bottommost
level. Also skip checking filters under the same conditions, which means that
for a preloaded file or a file that was trivially-moved to the bottom level, its
filter block will eventually expire from the cache.
- added parameters/instance variables in various places in order to propagate the config ("skip_filters") from version_set to block_based_table_reader
- in BlockBasedTable::Rep, this optimization prevents filter from being loaded when the file is opened simply by setting filter_policy = nullptr
- in BlockBasedTable::Get/BlockBasedTable::NewIterator, this optimization prevents filter from being used (even if it was loaded already) by setting filter = nullptr
Test Plan:
updated unit test:
$ ./db_test --gtest_filter=DBTest.OptimizeFiltersForHits
will also run 'make check'
Reviewers: sdong, igor, paultuckfield, anthony, rven, kradhakrishnan, IslamAbdelRahman, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D51633
2015-12-23 10:15:07 -08:00
|
|
|
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
|
2018-05-21 14:33:55 -07:00
|
|
|
GetContext* get_context,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
bool skip_filters) {
|
2018-05-25 18:41:31 -07:00
|
|
|
assert(key.size() >= 8); // key must be internal key
|
2019-07-05 12:28:48 -07:00
|
|
|
assert(get_context != nullptr);
|
2012-04-17 08:36:46 -07:00
|
|
|
Status s;
|
2017-03-22 09:11:23 -07:00
|
|
|
const bool no_io = read_options.read_tier == kBlockCacheTier;
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
|
|
|
FilterBlockReader* const filter =
|
|
|
|
!skip_filters ? rep_->filter.get() : nullptr;
|
|
|
|
|
|
|
|
// First check the full filter
|
|
|
|
// If full filter not useful, Then go into each block
|
2019-07-08 00:09:44 -07:00
|
|
|
uint64_t tracing_get_id = get_context->get_tracing_get_id();
|
2019-07-17 13:02:00 -07:00
|
|
|
BlockCacheLookupContext lookup_context{
|
|
|
|
TableReaderCaller::kUserGet, tracing_get_id,
|
|
|
|
/*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
|
|
|
|
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
|
|
|
|
// Trace the key since it contains both user key and sequence number.
|
|
|
|
lookup_context.referenced_key = key.ToString();
|
|
|
|
lookup_context.get_from_user_specified_snapshot =
|
|
|
|
read_options.snapshot != nullptr;
|
|
|
|
}
|
2020-06-29 14:51:57 -07:00
|
|
|
TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
|
2022-01-31 19:45:17 -08:00
|
|
|
const bool may_match = FullFilterKeyMayMatch(
|
|
|
|
filter, key, no_io, prefix_extractor, get_context, &lookup_context);
|
2020-06-29 14:51:57 -07:00
|
|
|
TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 14:24:09 -07:00
|
|
|
if (!may_match) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL);
|
2018-10-24 12:10:59 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
|
2014-09-08 10:37:05 -07:00
|
|
|
} else {
|
2018-07-12 17:19:57 -07:00
|
|
|
IndexBlockIter iiter_on_stack;
|
2018-05-21 14:33:55 -07:00
|
|
|
// if prefix_extractor found in block differs from options, disable
|
|
|
|
// BlockPrefixIndex. Only do this check when index_type is kHashSearch.
|
2018-06-26 15:56:26 -07:00
|
|
|
bool need_upper_bound_check = false;
|
2018-05-21 14:33:55 -07:00
|
|
|
if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
|
2022-01-21 11:36:36 -08:00
|
|
|
need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
|
2018-05-21 14:33:55 -07:00
|
|
|
}
|
2019-06-10 15:30:05 -07:00
|
|
|
auto iiter =
|
|
|
|
NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
|
|
|
|
get_context, &lookup_context);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
2017-02-06 16:29:29 -08:00
|
|
|
if (iiter != &iiter_on_stack) {
|
2017-03-22 09:11:23 -07:00
|
|
|
iiter_unique_ptr.reset(iiter);
|
2017-02-06 16:29:29 -08:00
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
|
2019-06-05 23:07:28 -07:00
|
|
|
size_t ts_sz =
|
|
|
|
rep_->internal_comparator.user_comparator()->timestamp_size();
|
2020-04-23 12:26:56 -07:00
|
|
|
bool matched = false; // if such user key matched a key in SST
|
2014-09-08 10:37:05 -07:00
|
|
|
bool done = false;
|
2017-02-06 16:29:29 -08:00
|
|
|
for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
IndexValue v = iiter->value();
|
2014-01-27 13:53:22 -08:00
|
|
|
|
2014-09-08 10:37:05 -07:00
|
|
|
bool not_exist_in_filter =
|
|
|
|
filter != nullptr && filter->IsBlockBased() == true &&
|
2019-06-05 23:07:28 -07:00
|
|
|
!filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
prefix_extractor, v.handle.offset(), no_io,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
/*const_ikey_ptr=*/nullptr, get_context,
|
|
|
|
&lookup_context);
|
2014-09-08 10:37:05 -07:00
|
|
|
|
|
|
|
if (not_exist_in_filter) {
|
|
|
|
// Not found
|
|
|
|
// TODO: think about interaction with Merge. If a user key cannot
|
|
|
|
// cross one data block, we should be fine.
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL);
|
2018-10-24 12:10:59 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
|
2014-09-08 10:37:05 -07:00
|
|
|
break;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
if (!v.first_internal_key.empty() && !skip_filters &&
|
|
|
|
UserComparatorWrapper(rep_->internal_comparator.user_comparator())
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-15 17:43:02 -07:00
|
|
|
.CompareWithoutTimestamp(
|
|
|
|
ExtractUserKey(key),
|
|
|
|
ExtractUserKey(v.first_internal_key)) < 0) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
// The requested key falls between highest key in previous block and
|
|
|
|
// lowest key in current block.
|
|
|
|
break;
|
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockCacheLookupContext lookup_data_block_context{
|
2019-07-17 13:02:00 -07:00
|
|
|
TableReaderCaller::kUserGet, tracing_get_id,
|
|
|
|
/*get_from_user_specified_snapshot=*/read_options.snapshot !=
|
|
|
|
nullptr};
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
bool does_referenced_key_exist = false;
|
|
|
|
DataBlockIter biter;
|
|
|
|
uint64_t referenced_data_size = 0;
|
|
|
|
NewDataBlockIterator<DataBlockIter>(
|
2019-06-27 10:16:21 -07:00
|
|
|
read_options, v.handle, &biter, BlockType::kData, get_context,
|
|
|
|
&lookup_data_block_context,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
/*s=*/Status(), /*prefetch_buffer*/ nullptr);
|
|
|
|
|
|
|
|
if (no_io && biter.status().IsIncomplete()) {
|
|
|
|
// couldn't get block from block_cache
|
|
|
|
// Update Saver.state to Found because we are only looking for
|
|
|
|
// whether we can guarantee the key is not there when "no_io" is set
|
|
|
|
get_context->MarkKeyMayExist();
|
2021-07-13 18:12:03 -07:00
|
|
|
s = biter.status();
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!biter.status().ok()) {
|
|
|
|
s = biter.status();
|
|
|
|
break;
|
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
bool may_exist = biter.SeekForGet(key);
|
|
|
|
// If user-specified timestamp is supported, we cannot end the search
|
|
|
|
// just because hash index lookup indicates the key+ts does not exist.
|
|
|
|
if (!may_exist && ts_sz == 0) {
|
|
|
|
// HashSeek cannot find the key this block and the the iter is not
|
|
|
|
// the end of the block, i.e. cannot be in the following blocks
|
|
|
|
// either. In this case, the seek_key cannot be found, so we break
|
|
|
|
// from the top level for-loop.
|
|
|
|
done = true;
|
|
|
|
} else {
|
|
|
|
// Call the *saver function on each entry/block until it returns false
|
|
|
|
for (; biter.Valid(); biter.Next()) {
|
|
|
|
ParsedInternalKey parsed_key;
|
2020-10-28 10:11:13 -07:00
|
|
|
Status pik_status = ParseInternalKey(
|
|
|
|
biter.key(), &parsed_key, false /* log_err_key */); // TODO
|
|
|
|
if (!pik_status.ok()) {
|
|
|
|
s = pik_status;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!get_context->SaveValue(
|
|
|
|
parsed_key, biter.value(), &matched,
|
|
|
|
biter.IsValuePinned() ? &biter : nullptr)) {
|
2019-07-03 18:45:36 -07:00
|
|
|
if (get_context->State() == GetContext::GetState::kFound) {
|
|
|
|
does_referenced_key_exist = true;
|
|
|
|
referenced_data_size = biter.key().size() + biter.value().size();
|
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
done = true;
|
|
|
|
break;
|
2014-09-08 10:37:05 -07:00
|
|
|
}
|
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
s = biter.status();
|
|
|
|
}
|
|
|
|
// Write the block cache access record.
|
2019-06-27 08:31:03 -07:00
|
|
|
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
// Avoid making copy of block_key, cf_name, and referenced_key when
|
|
|
|
// constructing the access record.
|
2019-07-03 18:45:36 -07:00
|
|
|
Slice referenced_key;
|
|
|
|
if (does_referenced_key_exist) {
|
|
|
|
referenced_key = biter.key();
|
|
|
|
} else {
|
2019-07-17 13:02:00 -07:00
|
|
|
referenced_key = key;
|
2019-07-03 18:45:36 -07:00
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockCacheTraceRecord access_record(
|
2021-03-15 04:32:24 -07:00
|
|
|
rep_->ioptions.clock->NowMicros(),
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
/*block_key=*/"", lookup_data_block_context.block_type,
|
|
|
|
lookup_data_block_context.block_size, rep_->cf_id_for_tracing(),
|
|
|
|
/*cf_name=*/"", rep_->level_for_tracing(),
|
|
|
|
rep_->sst_number_for_tracing(), lookup_data_block_context.caller,
|
|
|
|
lookup_data_block_context.is_cache_hit,
|
|
|
|
lookup_data_block_context.no_insert,
|
2019-07-03 18:45:36 -07:00
|
|
|
lookup_data_block_context.get_id,
|
2019-07-17 13:02:00 -07:00
|
|
|
lookup_data_block_context.get_from_user_specified_snapshot,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
/*referenced_key=*/"", referenced_data_size,
|
|
|
|
lookup_data_block_context.num_keys_in_block,
|
|
|
|
does_referenced_key_exist);
|
2020-10-08 11:11:52 -07:00
|
|
|
// TODO: Should handle status here?
|
|
|
|
block_cache_tracer_
|
|
|
|
->WriteBlockAccess(access_record,
|
|
|
|
lookup_data_block_context.block_key,
|
|
|
|
rep_->cf_name_for_tracing(), referenced_key)
|
|
|
|
.PermitUncheckedError();
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
2019-06-14 17:37:24 -07:00
|
|
|
|
2017-03-22 09:11:23 -07:00
|
|
|
if (done) {
|
|
|
|
// Avoid the extra Next which is expensive in two-level indexes
|
|
|
|
break;
|
|
|
|
}
|
2014-09-08 10:37:05 -07:00
|
|
|
}
|
2018-04-05 15:54:24 -07:00
|
|
|
if (matched && filter != nullptr && !filter->IsBlockBased()) {
|
2021-04-26 12:43:02 -07:00
|
|
|
RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE);
|
2018-10-24 12:10:59 -07:00
|
|
|
PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1,
|
|
|
|
rep_->level);
|
2018-04-05 15:54:24 -07:00
|
|
|
}
|
2020-01-17 01:39:22 -08:00
|
|
|
if (s.ok() && !iiter->status().IsNotFound()) {
|
2017-02-06 16:29:29 -08:00
|
|
|
s = iiter->status();
|
2012-04-17 08:36:46 -07:00
|
|
|
}
|
|
|
|
}
|
2013-11-12 22:46:51 -08:00
|
|
|
|
2012-04-17 08:36:46 -07:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2015-03-02 17:07:03 -08:00
|
|
|
Status BlockBasedTable::Prefetch(const Slice* const begin,
|
|
|
|
const Slice* const end) {
|
|
|
|
auto& comparator = rep_->internal_comparator;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
UserComparatorWrapper user_comparator(comparator.user_comparator());
|
2015-03-02 17:07:03 -08:00
|
|
|
// pre-condition
|
|
|
|
if (begin && end && comparator.Compare(*begin, *end) > 0) {
|
|
|
|
return Status::InvalidArgument(*begin, *end);
|
|
|
|
}
|
2019-06-20 14:28:22 -07:00
|
|
|
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
|
2018-07-12 17:19:57 -07:00
|
|
|
IndexBlockIter iiter_on_stack;
|
2019-06-10 15:30:05 -07:00
|
|
|
auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
|
|
|
|
&iiter_on_stack, /*get_context=*/nullptr,
|
|
|
|
&lookup_context);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
2017-02-06 16:29:29 -08:00
|
|
|
if (iiter != &iiter_on_stack) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
|
2017-02-06 16:29:29 -08:00
|
|
|
}
|
2015-03-02 17:07:03 -08:00
|
|
|
|
2017-02-06 16:29:29 -08:00
|
|
|
if (!iiter->status().ok()) {
|
2015-03-02 17:07:03 -08:00
|
|
|
// error opening index iterator
|
2017-02-06 16:29:29 -08:00
|
|
|
return iiter->status();
|
2015-03-02 17:07:03 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// indicates if we are on the last page that need to be pre-fetched
|
|
|
|
bool prefetching_boundary_page = false;
|
|
|
|
|
2017-02-06 16:29:29 -08:00
|
|
|
for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid();
|
|
|
|
iiter->Next()) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockHandle block_handle = iiter->value().handle;
|
|
|
|
const bool is_user_key = !rep_->index_key_includes_seq;
|
2018-05-25 18:41:31 -07:00
|
|
|
if (end &&
|
|
|
|
((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) ||
|
|
|
|
(is_user_key &&
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) {
|
2015-03-02 17:07:03 -08:00
|
|
|
if (prefetching_boundary_page) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The index entry represents the last key in the data block.
|
|
|
|
// We should load this page into memory as well, but no more
|
|
|
|
prefetching_boundary_page = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Load the block specified by the block_handle into the block cache
|
2018-07-12 17:19:57 -07:00
|
|
|
DataBlockIter biter;
|
2019-06-10 15:30:05 -07:00
|
|
|
|
|
|
|
NewDataBlockIterator<DataBlockIter>(
|
|
|
|
ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData,
|
|
|
|
/*get_context=*/nullptr, &lookup_context, Status(),
|
|
|
|
/*prefetch_buffer=*/nullptr);
|
2015-03-02 17:07:03 -08:00
|
|
|
|
|
|
|
if (!biter.status().ok()) {
|
|
|
|
// there was an unexpected error while pre-fetching
|
|
|
|
return biter.status();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2019-08-16 16:40:09 -07:00
|
|
|
Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
|
|
|
|
TableReaderCaller caller) {
|
2017-08-09 15:49:40 -07:00
|
|
|
Status s;
|
|
|
|
// Check Meta blocks
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block> metaindex;
|
|
|
|
std::unique_ptr<InternalIterator> metaindex_iter;
|
2020-06-29 14:51:57 -07:00
|
|
|
ReadOptions ro;
|
|
|
|
s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex,
|
2019-11-05 17:17:36 -08:00
|
|
|
&metaindex_iter);
|
2017-08-09 15:49:40 -07:00
|
|
|
if (s.ok()) {
|
2019-11-05 17:17:36 -08:00
|
|
|
s = VerifyChecksumInMetaBlocks(metaindex_iter.get());
|
2017-08-09 15:49:40 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
// Check Data blocks
|
2018-07-12 17:19:57 -07:00
|
|
|
IndexBlockIter iiter_on_stack;
|
2019-06-20 14:28:22 -07:00
|
|
|
BlockCacheLookupContext context{caller};
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
|
2019-08-16 16:40:09 -07:00
|
|
|
read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
|
2019-06-20 14:28:22 -07:00
|
|
|
/*get_context=*/nullptr, &context);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
2017-08-09 15:49:40 -07:00
|
|
|
if (iiter != &iiter_on_stack) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
iiter_unique_ptr = std::unique_ptr<InternalIteratorBase<IndexValue>>(iiter);
|
2017-08-09 15:49:40 -07:00
|
|
|
}
|
|
|
|
if (!iiter->status().ok()) {
|
|
|
|
// error opening index iterator
|
|
|
|
return iiter->status();
|
|
|
|
}
|
2019-08-16 16:40:09 -07:00
|
|
|
s = VerifyChecksumInBlocks(read_options, iiter);
|
2017-08-09 15:49:40 -07:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2018-08-09 16:49:45 -07:00
|
|
|
Status BlockBasedTable::VerifyChecksumInBlocks(
|
2019-08-16 16:40:09 -07:00
|
|
|
const ReadOptions& read_options,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
InternalIteratorBase<IndexValue>* index_iter) {
|
2017-08-09 15:49:40 -07:00
|
|
|
Status s;
|
2019-08-16 16:40:09 -07:00
|
|
|
// We are scanning the whole file, so no need to do exponential
|
|
|
|
// increasing of the buffer size.
|
|
|
|
size_t readahead_size = (read_options.readahead_size != 0)
|
|
|
|
? read_options.readahead_size
|
2021-02-23 16:52:35 -08:00
|
|
|
: rep_->table_options.max_auto_readahead_size;
|
2019-10-21 11:37:09 -07:00
|
|
|
// FilePrefetchBuffer doesn't work in mmap mode and readahead is not
|
|
|
|
// needed there.
|
|
|
|
FilePrefetchBuffer prefetch_buffer(
|
2021-11-19 17:52:42 -08:00
|
|
|
readahead_size /* readahead_size */,
|
2019-10-21 11:37:09 -07:00
|
|
|
readahead_size /* max_readahead_size */,
|
|
|
|
!rep_->ioptions.allow_mmap_reads /* enable */);
|
2019-08-16 16:40:09 -07:00
|
|
|
|
2017-08-09 15:49:40 -07:00
|
|
|
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
|
|
|
|
s = index_iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockHandle handle = index_iter->value().handle;
|
2018-08-09 16:49:45 -07:00
|
|
|
BlockContents contents;
|
2018-11-28 17:58:08 -08:00
|
|
|
BlockFetcher block_fetcher(
|
2022-02-16 23:17:03 -08:00
|
|
|
rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle,
|
2019-08-16 16:40:09 -07:00
|
|
|
&contents, rep_->ioptions, false /* decompress */,
|
|
|
|
false /*maybe_compressed*/, BlockType::kData,
|
2019-01-23 18:11:08 -08:00
|
|
|
UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
|
2018-08-09 16:49:45 -07:00
|
|
|
s = block_fetcher.ReadBlockContents();
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2020-06-05 11:06:26 -07:00
|
|
|
if (s.ok()) {
|
|
|
|
// In the case of two level indexes, we would have exited the above loop
|
|
|
|
// by checking index_iter->Valid(), but Valid() might have returned false
|
|
|
|
// due to an IO error. So check the index_iter status
|
|
|
|
s = index_iter->status();
|
|
|
|
}
|
2018-08-09 16:49:45 -07:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-06-18 19:00:03 -07:00
|
|
|
BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
|
|
|
|
const Slice& meta_block_name) {
|
|
|
|
if (meta_block_name.starts_with(kFilterBlockPrefix) ||
|
|
|
|
meta_block_name.starts_with(kFullFilterBlockPrefix) ||
|
|
|
|
meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) {
|
|
|
|
return BlockType::kFilter;
|
|
|
|
}
|
|
|
|
|
2021-12-10 08:12:09 -08:00
|
|
|
if (meta_block_name == kPropertiesBlockName) {
|
2019-06-18 19:00:03 -07:00
|
|
|
return BlockType::kProperties;
|
|
|
|
}
|
|
|
|
|
2021-12-10 08:12:09 -08:00
|
|
|
if (meta_block_name == kCompressionDictBlockName) {
|
2019-06-18 19:00:03 -07:00
|
|
|
return BlockType::kCompressionDictionary;
|
|
|
|
}
|
|
|
|
|
2021-12-10 08:12:09 -08:00
|
|
|
if (meta_block_name == kRangeDelBlockName) {
|
2019-06-18 19:00:03 -07:00
|
|
|
return BlockType::kRangeDeletion;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (meta_block_name == kHashIndexPrefixesBlock) {
|
|
|
|
return BlockType::kHashIndexPrefixes;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (meta_block_name == kHashIndexPrefixesMetadataBlock) {
|
|
|
|
return BlockType::kHashIndexMetadata;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(false);
|
|
|
|
return BlockType::kInvalid;
|
|
|
|
}
|
|
|
|
|
2019-03-26 10:15:43 -07:00
|
|
|
Status BlockBasedTable::VerifyChecksumInMetaBlocks(
|
2018-08-09 16:49:45 -07:00
|
|
|
InternalIteratorBase<Slice>* index_iter) {
|
|
|
|
Status s;
|
|
|
|
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
|
|
|
|
s = index_iter->status();
|
2017-08-09 15:49:40 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
2018-08-09 16:49:45 -07:00
|
|
|
BlockHandle handle;
|
|
|
|
Slice input = index_iter->value();
|
|
|
|
s = handle.DecodeFrom(&input);
|
2017-08-09 15:49:40 -07:00
|
|
|
BlockContents contents;
|
2019-06-18 19:00:03 -07:00
|
|
|
const Slice meta_block_name = index_iter->key();
|
2021-12-10 08:12:09 -08:00
|
|
|
if (meta_block_name == kPropertiesBlockName) {
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 11:42:12 -08:00
|
|
|
// Unfortunate special handling for properties block checksum w/
|
|
|
|
// global seqno
|
|
|
|
std::unique_ptr<TableProperties> table_properties;
|
|
|
|
s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(),
|
|
|
|
nullptr /* prefetch_buffer */, rep_->footer,
|
|
|
|
rep_->ioptions, &table_properties,
|
|
|
|
nullptr /* memory_allocator */);
|
|
|
|
} else {
|
|
|
|
s = BlockFetcher(
|
|
|
|
rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
|
|
|
|
ReadOptions(), handle, &contents, rep_->ioptions,
|
|
|
|
false /* decompress */, false /*maybe_compressed*/,
|
|
|
|
GetBlockTypeForMetaBlockByName(meta_block_name),
|
|
|
|
UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options)
|
|
|
|
.ReadBlockContents();
|
2019-03-26 10:15:43 -07:00
|
|
|
}
|
2017-08-09 15:49:40 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
|
|
|
|
assert(rep_ != nullptr);
|
|
|
|
|
|
|
|
Cache* const cache = rep_->table_options.block_cache.get();
|
|
|
|
if (cache == nullptr) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
CacheKey key = GetCacheKey(rep_->base_cache_key, handle);
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
Cache::Handle* const cache_handle = cache->Lookup(key.AsSlice());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
if (cache_handle == nullptr) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
cache->Release(cache_handle);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-10-28 17:54:09 -07:00
|
|
|
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
|
|
|
|
const Slice& key) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
|
2019-06-10 15:30:05 -07:00
|
|
|
options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
/*get_context=*/nullptr, /*lookup_context=*/nullptr));
|
2014-06-20 10:23:02 +02:00
|
|
|
iiter->Seek(key);
|
|
|
|
assert(iiter->Valid());
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
return TEST_BlockInCache(iiter->value().handle);
|
2018-06-22 15:14:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: The following fields of rep_ should have already been populated:
|
|
|
|
// 1. file
|
|
|
|
// 2. index_handle,
|
|
|
|
// 3. options
|
|
|
|
// 4. internal_comparator
|
|
|
|
// 5. index_type
|
|
|
|
Status BlockBasedTable::CreateIndexReader(
|
2020-06-29 14:51:57 -07:00
|
|
|
const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
|
2021-12-10 08:12:09 -08:00
|
|
|
InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin,
|
|
|
|
BlockCacheLookupContext* lookup_context,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
std::unique_ptr<IndexReader>* index_reader) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
switch (rep_->index_type) {
|
2017-02-06 16:29:29 -08:00
|
|
|
case BlockBasedTableOptions::kTwoLevelIndexSearch: {
|
2020-06-29 14:51:57 -07:00
|
|
|
return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache,
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
prefetch, pin, lookup_context,
|
|
|
|
index_reader);
|
2017-02-06 16:29:29 -08:00
|
|
|
}
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
case BlockBasedTableOptions::kBinarySearch:
|
2020-02-21 15:07:55 -08:00
|
|
|
FALLTHROUGH_INTENDED;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
case BlockBasedTableOptions::kBinarySearchWithFirstKey: {
|
2020-06-29 14:51:57 -07:00
|
|
|
return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
|
|
|
|
use_cache, prefetch, pin,
|
|
|
|
lookup_context, index_reader);
|
2014-04-10 14:19:43 -07:00
|
|
|
}
|
|
|
|
case BlockBasedTableOptions::kHashSearch: {
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block> metaindex_guard;
|
|
|
|
std::unique_ptr<InternalIterator> metaindex_iter_guard;
|
2020-01-27 15:41:57 -08:00
|
|
|
bool should_fallback = false;
|
2022-01-31 19:45:17 -08:00
|
|
|
// FIXME: is changed prefix_extractor handled anywhere for hash index?
|
2020-01-27 15:41:57 -08:00
|
|
|
if (rep_->internal_prefix_transform.get() == nullptr) {
|
2021-04-26 12:43:02 -07:00
|
|
|
ROCKS_LOG_WARN(rep_->ioptions.logger,
|
2020-01-27 15:41:57 -08:00
|
|
|
"No prefix extractor passed in. Fall back to binary"
|
|
|
|
" search index.");
|
|
|
|
should_fallback = true;
|
2014-05-15 14:09:03 -07:00
|
|
|
}
|
|
|
|
|
2020-01-27 15:41:57 -08:00
|
|
|
if (should_fallback) {
|
2020-06-29 14:51:57 -07:00
|
|
|
return BinarySearchIndexReader::Create(this, ro, prefetch_buffer,
|
|
|
|
use_cache, prefetch, pin,
|
|
|
|
lookup_context, index_reader);
|
2020-01-27 15:41:57 -08:00
|
|
|
} else {
|
2021-12-10 08:12:09 -08:00
|
|
|
return HashIndexReader::Create(this, ro, prefetch_buffer, meta_iter,
|
|
|
|
use_cache, prefetch, pin, lookup_context,
|
|
|
|
index_reader);
|
2020-01-27 15:41:57 -08:00
|
|
|
}
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
default: {
|
|
|
|
std::string error_message =
|
2022-05-06 13:03:58 -07:00
|
|
|
"Unrecognized index type: " + std::to_string(rep_->index_type);
|
2014-03-01 23:40:08 -08:00
|
|
|
return Status::InvalidArgument(error_message.c_str());
|
2014-02-28 18:19:07 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
uint64_t BlockBasedTable::ApproximateDataOffsetOf(
|
|
|
|
const InternalIteratorBase<IndexValue>& index_iter,
|
|
|
|
uint64_t data_size) const {
|
Handle failures in block-based table size/offset approximation (#9615)
Summary:
In crash test with fault injection, we were seeing stack traces like the following:
```
https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245,
function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>)
at db/version_set.cc:5657
https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```
The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`.
The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615
Test Plan:
run the repro command for a while and stopped seeing coredumps -
```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```
Reviewed By: pdillinger
Differential Revision: D34383069
Pulled By: ajkr
fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
2022-02-28 23:45:08 -08:00
|
|
|
assert(index_iter.status().ok());
|
2019-08-16 14:16:49 -07:00
|
|
|
if (index_iter.Valid()) {
|
|
|
|
BlockHandle handle = index_iter.value().handle;
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
return handle.offset();
|
2011-03-18 22:37:00 +00:00
|
|
|
} else {
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
// The iterator is past the last key in the file.
|
|
|
|
return data_size;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
}
|
2019-07-23 15:30:59 -07:00
|
|
|
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
uint64_t BlockBasedTable::GetApproximateDataSize() {
|
|
|
|
// Should be in table properties unless super old version
|
|
|
|
if (rep_->table_properties) {
|
|
|
|
return rep_->table_properties->data_size;
|
|
|
|
}
|
|
|
|
// Fall back to rough estimate from footer
|
|
|
|
return rep_->footer.metaindex_handle().offset();
|
2019-08-16 14:16:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
|
|
|
TableReaderCaller caller) {
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
uint64_t data_size = GetApproximateDataSize();
|
|
|
|
if (UNLIKELY(data_size == 0)) {
|
|
|
|
// Hmm. Let's just split in half to avoid skewing one way or another,
|
|
|
|
// since we don't know whether we're operating on lower bound or
|
|
|
|
// upper bound.
|
|
|
|
return rep_->file_size / 2;
|
|
|
|
}
|
|
|
|
|
2019-08-16 14:16:49 -07:00
|
|
|
BlockCacheLookupContext context(caller);
|
|
|
|
IndexBlockIter iiter_on_stack;
|
2019-12-19 14:54:48 -08:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
2019-08-16 14:16:49 -07:00
|
|
|
auto index_iter =
|
2019-12-19 14:54:48 -08:00
|
|
|
NewIndexIterator(ro, /*disable_prefix_seek=*/true,
|
2019-08-16 14:16:49 -07:00
|
|
|
/*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
|
|
|
|
/*lookup_context=*/&context);
|
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
2019-07-23 15:30:59 -07:00
|
|
|
if (index_iter != &iiter_on_stack) {
|
2019-08-16 14:16:49 -07:00
|
|
|
iiter_unique_ptr.reset(index_iter);
|
2019-07-23 15:30:59 -07:00
|
|
|
}
|
|
|
|
|
2019-08-16 14:16:49 -07:00
|
|
|
index_iter->Seek(key);
|
Handle failures in block-based table size/offset approximation (#9615)
Summary:
In crash test with fault injection, we were seeing stack traces like the following:
```
https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245,
function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>)
at db/version_set.cc:5657
https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```
The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`.
The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615
Test Plan:
run the repro command for a while and stopped seeing coredumps -
```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```
Reviewed By: pdillinger
Differential Revision: D34383069
Pulled By: ajkr
fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
2022-02-28 23:45:08 -08:00
|
|
|
uint64_t offset;
|
|
|
|
if (index_iter->status().ok()) {
|
|
|
|
offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
|
|
|
} else {
|
|
|
|
// Split in half to avoid skewing one way or another,
|
|
|
|
// since we don't know whether we're operating on lower bound or
|
|
|
|
// upper bound.
|
|
|
|
return rep_->file_size / 2;
|
|
|
|
}
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
|
|
|
|
// Pro-rate file metadata (incl filters) size-proportionally across data
|
|
|
|
// blocks.
|
|
|
|
double size_ratio =
|
|
|
|
static_cast<double>(offset) / static_cast<double>(data_size);
|
|
|
|
return static_cast<uint64_t>(size_ratio *
|
|
|
|
static_cast<double>(rep_->file_size));
|
2019-08-16 14:16:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
|
|
|
TableReaderCaller caller) {
|
|
|
|
assert(rep_->internal_comparator.Compare(start, end) <= 0);
|
|
|
|
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
uint64_t data_size = GetApproximateDataSize();
|
|
|
|
if (UNLIKELY(data_size == 0)) {
|
|
|
|
// Hmm. Assume whole file is involved, since we have lower and upper
|
Handle failures in block-based table size/offset approximation (#9615)
Summary:
In crash test with fault injection, we were seeing stack traces like the following:
```
https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245,
function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>)
at db/version_set.cc:5657
https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```
The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`.
The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615
Test Plan:
run the repro command for a while and stopped seeing coredumps -
```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```
Reviewed By: pdillinger
Differential Revision: D34383069
Pulled By: ajkr
fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
2022-02-28 23:45:08 -08:00
|
|
|
// bound. This likely skews the estimate if we consider that this function
|
|
|
|
// is typically called with `[start, end]` fully contained in the file's
|
|
|
|
// key-range.
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
return rep_->file_size;
|
|
|
|
}
|
|
|
|
|
2019-08-16 14:16:49 -07:00
|
|
|
BlockCacheLookupContext context(caller);
|
|
|
|
IndexBlockIter iiter_on_stack;
|
2019-12-19 14:54:48 -08:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
2019-08-16 14:16:49 -07:00
|
|
|
auto index_iter =
|
2019-12-19 14:54:48 -08:00
|
|
|
NewIndexIterator(ro, /*disable_prefix_seek=*/true,
|
2019-08-16 14:16:49 -07:00
|
|
|
/*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr,
|
|
|
|
/*lookup_context=*/&context);
|
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
|
|
|
if (index_iter != &iiter_on_stack) {
|
|
|
|
iiter_unique_ptr.reset(index_iter);
|
|
|
|
}
|
|
|
|
|
|
|
|
index_iter->Seek(start);
|
Handle failures in block-based table size/offset approximation (#9615)
Summary:
In crash test with fault injection, we were seeing stack traces like the following:
```
https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245,
function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>)
at db/version_set.cc:5657
https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```
The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`.
The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615
Test Plan:
run the repro command for a while and stopped seeing coredumps -
```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```
Reviewed By: pdillinger
Differential Revision: D34383069
Pulled By: ajkr
fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
2022-02-28 23:45:08 -08:00
|
|
|
uint64_t start_offset;
|
|
|
|
if (index_iter->status().ok()) {
|
|
|
|
start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
|
|
|
} else {
|
|
|
|
// Assume file is involved from the start. This likely skews the estimate
|
|
|
|
// but is consistent with the above error handling.
|
|
|
|
start_offset = 0;
|
|
|
|
}
|
|
|
|
|
2019-08-16 14:16:49 -07:00
|
|
|
index_iter->Seek(end);
|
Handle failures in block-based table size/offset approximation (#9615)
Summary:
In crash test with fault injection, we were seeing stack traces like the following:
```
https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245,
function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>)
at db/version_set.cc:5657
https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```
The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`.
The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615
Test Plan:
run the repro command for a while and stopped seeing coredumps -
```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```
Reviewed By: pdillinger
Differential Revision: D34383069
Pulled By: ajkr
fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
2022-02-28 23:45:08 -08:00
|
|
|
uint64_t end_offset;
|
|
|
|
if (index_iter->status().ok()) {
|
|
|
|
end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
|
|
|
} else {
|
|
|
|
// Assume file is involved until the end. This likely skews the estimate
|
|
|
|
// but is consistent with the above error handling.
|
|
|
|
end_offset = data_size;
|
|
|
|
}
|
2019-08-16 14:16:49 -07:00
|
|
|
|
|
|
|
assert(end_offset >= start_offset);
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 12:27:59 -07:00
|
|
|
// Pro-rate file metadata (incl filters) size-proportionally across data
|
|
|
|
// blocks.
|
|
|
|
double size_ratio = static_cast<double>(end_offset - start_offset) /
|
|
|
|
static_cast<double>(data_size);
|
|
|
|
return static_cast<uint64_t>(size_ratio *
|
|
|
|
static_cast<double>(rep_->file_size));
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
bool BlockBasedTable::TEST_FilterBlockInCache() const {
|
|
|
|
assert(rep_ != nullptr);
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-16 17:13:55 -08:00
|
|
|
return rep_->filter_type != Rep::FilterType::kNoFilter &&
|
|
|
|
TEST_BlockInCache(rep_->filter_handle);
|
2014-02-19 15:38:57 -08:00
|
|
|
}
|
|
|
|
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 11:49:36 -07:00
|
|
|
bool BlockBasedTable::TEST_IndexBlockInCache() const {
|
|
|
|
assert(rep_ != nullptr);
|
|
|
|
|
|
|
|
return TEST_BlockInCache(rep_->footer.index_handle());
|
2014-02-19 15:38:57 -08:00
|
|
|
}
|
|
|
|
|
2016-08-01 14:50:19 -07:00
|
|
|
Status BlockBasedTable::GetKVPairsFromDataBlocks(
|
|
|
|
std::vector<KVPairBlock>* kv_pair_blocks) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
|
2019-06-10 15:30:05 -07:00
|
|
|
NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
|
|
|
|
/*input_iter=*/nullptr, /*get_context=*/nullptr,
|
|
|
|
/*lookup_contex=*/nullptr));
|
2016-08-01 14:50:19 -07:00
|
|
|
|
|
|
|
Status s = blockhandles_iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Cannot read Index Block
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
|
|
|
|
blockhandles_iter->Next()) {
|
|
|
|
s = blockhandles_iter->status();
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> datablock_iter;
|
2018-07-12 17:19:57 -07:00
|
|
|
datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
ReadOptions(), blockhandles_iter->value().handle,
|
|
|
|
/*input_iter=*/nullptr, /*type=*/BlockType::kData,
|
2019-06-10 15:30:05 -07:00
|
|
|
/*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
|
|
|
|
/*prefetch_buffer=*/nullptr));
|
2016-08-01 14:50:19 -07:00
|
|
|
s = datablock_iter->status();
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Error reading the block - Skipped
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
KVPairBlock kv_pair_block;
|
|
|
|
for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
|
|
|
|
datablock_iter->Next()) {
|
|
|
|
s = datablock_iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Error reading the block - Skipped
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
const Slice& key = datablock_iter->key();
|
|
|
|
const Slice& value = datablock_iter->value();
|
|
|
|
std::string key_copy = std::string(key.data(), key.size());
|
|
|
|
std::string value_copy = std::string(value.data(), value.size());
|
|
|
|
|
|
|
|
kv_pair_block.push_back(
|
|
|
|
std::make_pair(std::move(key_copy), std::move(value_copy)));
|
|
|
|
}
|
|
|
|
kv_pair_blocks->push_back(std::move(kv_pair_block));
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
Status BlockBasedTable::DumpTable(WritableFile* out_file) {
|
2020-09-08 12:08:05 -07:00
|
|
|
WritableFileStringStreamAdapter out_file_wrapper(out_file);
|
2020-09-04 19:25:20 -07:00
|
|
|
std::ostream out_stream(&out_file_wrapper);
|
2014-12-23 13:24:07 -08:00
|
|
|
// Output Footer
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Footer Details:\n"
|
|
|
|
"--------------------------------------\n";
|
|
|
|
out_stream << " " << rep_->footer.ToString() << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
|
|
|
|
// Output MetaIndex
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Metaindex Details:\n"
|
|
|
|
"--------------------------------------\n";
|
2019-11-05 17:17:36 -08:00
|
|
|
std::unique_ptr<Block> metaindex;
|
|
|
|
std::unique_ptr<InternalIterator> metaindex_iter;
|
2020-06-29 14:51:57 -07:00
|
|
|
ReadOptions ro;
|
|
|
|
Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex,
|
2019-11-05 17:17:36 -08:00
|
|
|
&metaindex_iter);
|
2014-12-23 13:24:07 -08:00
|
|
|
if (s.ok()) {
|
2019-11-05 17:17:36 -08:00
|
|
|
for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
|
|
|
|
metaindex_iter->Next()) {
|
|
|
|
s = metaindex_iter->status();
|
2014-12-23 13:24:07 -08:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2021-12-10 08:12:09 -08:00
|
|
|
if (metaindex_iter->key() == kPropertiesBlockName) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " Properties block handle: "
|
|
|
|
<< metaindex_iter->value().ToString(true) << "\n";
|
2021-12-10 08:12:09 -08:00
|
|
|
} else if (metaindex_iter->key() == kCompressionDictBlockName) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " Compression dictionary block handle: "
|
|
|
|
<< metaindex_iter->value().ToString(true) << "\n";
|
2019-11-05 17:17:36 -08:00
|
|
|
} else if (strstr(metaindex_iter->key().ToString().c_str(),
|
2014-12-23 13:24:07 -08:00
|
|
|
"filter.rocksdb.") != nullptr) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " Filter block handle: "
|
|
|
|
<< metaindex_iter->value().ToString(true) << "\n";
|
2021-12-10 08:12:09 -08:00
|
|
|
} else if (metaindex_iter->key() == kRangeDelBlockName) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " Range deletion block handle: "
|
|
|
|
<< metaindex_iter->value().ToString(true) << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
} else {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Output TableProperties
|
2020-02-20 12:07:53 -08:00
|
|
|
const ROCKSDB_NAMESPACE::TableProperties* table_properties;
|
2014-12-23 13:24:07 -08:00
|
|
|
table_properties = rep_->table_properties.get();
|
|
|
|
|
|
|
|
if (table_properties != nullptr) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Table Properties:\n"
|
|
|
|
"--------------------------------------\n";
|
|
|
|
out_stream << " " << table_properties->ToString("\n ", ": ") << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 13:11:23 -07:00
|
|
|
|
2014-12-23 13:24:07 -08:00
|
|
|
if (rep_->filter) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Filter Details:\n"
|
|
|
|
"--------------------------------------\n";
|
|
|
|
out_stream << " " << rep_->filter->ToString() << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Output Index block
|
2020-09-04 19:25:20 -07:00
|
|
|
s = DumpIndexBlock(out_stream);
|
2014-12-23 13:24:07 -08:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2017-02-03 12:37:02 -08:00
|
|
|
|
|
|
|
// Output compression dictionary
|
2019-07-23 15:57:43 -07:00
|
|
|
if (rep_->uncompression_dict_reader) {
|
2019-08-23 08:25:52 -07:00
|
|
|
CachableEntry<UncompressionDict> uncompression_dict;
|
2019-07-23 15:57:43 -07:00
|
|
|
s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
|
|
|
|
nullptr /* prefetch_buffer */, false /* no_io */,
|
2022-03-29 11:54:54 -07:00
|
|
|
false, /* verify_checksums */
|
2019-07-23 15:57:43 -07:00
|
|
|
nullptr /* get_context */, nullptr /* lookup_context */,
|
|
|
|
&uncompression_dict);
|
2019-01-23 18:11:08 -08:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2019-07-23 15:57:43 -07:00
|
|
|
|
2019-08-23 08:25:52 -07:00
|
|
|
assert(uncompression_dict.GetValue());
|
|
|
|
|
|
|
|
const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict();
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Compression Dictionary:\n"
|
|
|
|
"--------------------------------------\n";
|
|
|
|
out_stream << " size (bytes): " << raw_dict.size() << "\n\n";
|
|
|
|
out_stream << " HEX " << raw_dict.ToString(true) << "\n\n";
|
2017-02-03 12:37:02 -08:00
|
|
|
}
|
|
|
|
|
2016-11-12 09:23:05 -08:00
|
|
|
// Output range deletions block
|
2016-11-16 12:02:39 -08:00
|
|
|
auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions());
|
2016-11-21 12:07:09 -08:00
|
|
|
if (range_del_iter != nullptr) {
|
|
|
|
range_del_iter->SeekToFirst();
|
|
|
|
if (range_del_iter->Valid()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Range deletions:\n"
|
|
|
|
"--------------------------------------\n";
|
2016-11-21 12:07:09 -08:00
|
|
|
for (; range_del_iter->Valid(); range_del_iter->Next()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
DumpKeyValue(range_del_iter->key(), range_del_iter->value(),
|
|
|
|
out_stream);
|
2016-11-21 12:07:09 -08:00
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "\n";
|
2016-11-12 09:23:05 -08:00
|
|
|
}
|
2016-11-21 12:07:09 -08:00
|
|
|
delete range_del_iter;
|
2016-11-12 09:23:05 -08:00
|
|
|
}
|
2014-12-23 13:24:07 -08:00
|
|
|
// Output Data blocks
|
2020-09-04 19:25:20 -07:00
|
|
|
s = DumpDataBlocks(out_stream);
|
2014-12-23 13:24:07 -08:00
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!out_stream.good()) {
|
|
|
|
return Status::IOError("Failed to write to output file");
|
|
|
|
}
|
|
|
|
return Status::OK();
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
|
|
|
|
out_stream << "Index Details:\n"
|
|
|
|
"--------------------------------------\n";
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
|
2019-06-10 15:30:05 -07:00
|
|
|
NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
|
|
|
|
/*input_iter=*/nullptr, /*get_context=*/nullptr,
|
|
|
|
/*lookup_contex=*/nullptr));
|
2014-12-23 13:24:07 -08:00
|
|
|
Status s = blockhandles_iter->status();
|
|
|
|
if (!s.ok()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Can not read Index Block \n\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " Block key hex dump: Data block handle\n";
|
|
|
|
out_stream << " Block key ascii\n\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
|
|
|
|
blockhandles_iter->Next()) {
|
|
|
|
s = blockhandles_iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
Slice key = blockhandles_iter->key();
|
2018-05-25 18:41:31 -07:00
|
|
|
Slice user_key;
|
2014-12-23 13:24:07 -08:00
|
|
|
InternalKey ikey;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
if (!rep_->index_key_includes_seq) {
|
2018-05-29 12:09:01 -07:00
|
|
|
user_key = key;
|
|
|
|
} else {
|
2018-05-25 18:41:31 -07:00
|
|
|
ikey.DecodeFrom(key);
|
|
|
|
user_key = ikey.user_key();
|
|
|
|
}
|
2014-12-23 13:24:07 -08:00
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " HEX " << user_key.ToString(true) << ": "
|
|
|
|
<< blockhandles_iter->value().ToString(true,
|
|
|
|
rep_->index_has_first_key)
|
|
|
|
<< "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
|
2018-05-25 18:41:31 -07:00
|
|
|
std::string str_key = user_key.ToString();
|
2014-12-23 13:24:07 -08:00
|
|
|
std::string res_key("");
|
|
|
|
char cspace = ' ';
|
|
|
|
for (size_t i = 0; i < str_key.size(); i++) {
|
|
|
|
res_key.append(&str_key[i], 1);
|
|
|
|
res_key.append(1, cspace);
|
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " ASCII " << res_key << "\n";
|
|
|
|
out_stream << " ------\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
|
2019-06-10 15:30:05 -07:00
|
|
|
NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false,
|
|
|
|
/*input_iter=*/nullptr, /*get_context=*/nullptr,
|
|
|
|
/*lookup_contex=*/nullptr));
|
2014-12-23 13:24:07 -08:00
|
|
|
Status s = blockhandles_iter->status();
|
|
|
|
if (!s.ok()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Can not read Index Block \n\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2016-08-12 16:34:11 -07:00
|
|
|
uint64_t datablock_size_min = std::numeric_limits<uint64_t>::max();
|
|
|
|
uint64_t datablock_size_max = 0;
|
|
|
|
uint64_t datablock_size_sum = 0;
|
|
|
|
|
2014-12-23 13:24:07 -08:00
|
|
|
size_t block_id = 1;
|
|
|
|
for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
|
|
|
|
block_id++, blockhandles_iter->Next()) {
|
|
|
|
s = blockhandles_iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
BlockHandle bh = blockhandles_iter->value().handle;
|
2016-08-12 16:34:11 -07:00
|
|
|
uint64_t datablock_size = bh.size();
|
|
|
|
datablock_size_min = std::min(datablock_size_min, datablock_size);
|
|
|
|
datablock_size_max = std::max(datablock_size_max, datablock_size);
|
|
|
|
datablock_size_sum += datablock_size;
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Data Block # " << block_id << " @ "
|
|
|
|
<< blockhandles_iter->value().handle.ToString(true) << "\n";
|
|
|
|
out_stream << "--------------------------------------\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
|
2015-10-12 15:06:38 -07:00
|
|
|
std::unique_ptr<InternalIterator> datablock_iter;
|
2018-07-12 17:19:57 -07:00
|
|
|
datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-24 20:50:35 -07:00
|
|
|
ReadOptions(), blockhandles_iter->value().handle,
|
|
|
|
/*input_iter=*/nullptr, /*type=*/BlockType::kData,
|
2019-06-10 15:30:05 -07:00
|
|
|
/*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(),
|
|
|
|
/*prefetch_buffer=*/nullptr));
|
2014-12-23 13:24:07 -08:00
|
|
|
s = datablock_iter->status();
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Error reading the block - Skipped \n\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
|
|
|
|
datablock_iter->Next()) {
|
|
|
|
s = datablock_iter->status();
|
|
|
|
if (!s.ok()) {
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Error reading the block - Skipped \n";
|
2014-12-23 13:24:07 -08:00
|
|
|
break;
|
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream);
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "\n";
|
2014-12-23 13:24:07 -08:00
|
|
|
}
|
2016-08-12 16:34:11 -07:00
|
|
|
|
|
|
|
uint64_t num_datablocks = block_id - 1;
|
|
|
|
if (num_datablocks) {
|
|
|
|
double datablock_size_avg =
|
|
|
|
static_cast<double>(datablock_size_sum) / num_datablocks;
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << "Data Block Summary:\n";
|
|
|
|
out_stream << "--------------------------------------\n";
|
|
|
|
out_stream << " # data blocks: " << num_datablocks << "\n";
|
|
|
|
out_stream << " min data block size: " << datablock_size_min << "\n";
|
|
|
|
out_stream << " max data block size: " << datablock_size_max << "\n";
|
2022-05-06 13:03:58 -07:00
|
|
|
out_stream << " avg data block size: "
|
|
|
|
<< std::to_string(datablock_size_avg) << "\n";
|
2016-08-12 16:34:11 -07:00
|
|
|
}
|
|
|
|
|
2014-12-23 13:24:07 -08:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2016-11-12 09:23:05 -08:00
|
|
|
void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
|
2020-09-04 19:25:20 -07:00
|
|
|
std::ostream& out_stream) {
|
2016-11-12 09:23:05 -08:00
|
|
|
InternalKey ikey;
|
|
|
|
ikey.DecodeFrom(key);
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " HEX " << ikey.user_key().ToString(true) << ": "
|
|
|
|
<< value.ToString(true) << "\n";
|
2016-11-12 09:23:05 -08:00
|
|
|
|
|
|
|
std::string str_key = ikey.user_key().ToString();
|
|
|
|
std::string str_value = value.ToString();
|
|
|
|
std::string res_key(""), res_value("");
|
|
|
|
char cspace = ' ';
|
|
|
|
for (size_t i = 0; i < str_key.size(); i++) {
|
2017-11-28 17:20:21 -08:00
|
|
|
if (str_key[i] == '\0') {
|
|
|
|
res_key.append("\\0", 2);
|
|
|
|
} else {
|
|
|
|
res_key.append(&str_key[i], 1);
|
|
|
|
}
|
2016-11-12 09:23:05 -08:00
|
|
|
res_key.append(1, cspace);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < str_value.size(); i++) {
|
2017-11-28 17:20:21 -08:00
|
|
|
if (str_value[i] == '\0') {
|
|
|
|
res_value.append("\\0", 2);
|
|
|
|
} else {
|
|
|
|
res_value.append(&str_value[i], 1);
|
|
|
|
}
|
2016-11-12 09:23:05 -08:00
|
|
|
res_value.append(1, cspace);
|
|
|
|
}
|
|
|
|
|
2020-09-04 19:25:20 -07:00
|
|
|
out_stream << " ASCII " << res_key << ": " << res_value << "\n";
|
|
|
|
out_stream << " ------\n";
|
2016-11-12 09:23:05 -08:00
|
|
|
}
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|