2016-02-10 00:12:00 +01:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-16 01:03:42 +02:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 23:59:46 +02:00
|
|
|
//
|
2012-04-17 17:36:46 +02:00
|
|
|
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
//
|
|
|
|
// A filter block is stored near the end of a Table file. It contains
|
|
|
|
// filters (e.g., bloom filters) for all data blocks in the table combined
|
|
|
|
// into a single filter block.
|
2014-09-08 19:37:05 +02:00
|
|
|
//
|
|
|
|
// It is a base class for BlockBasedFilter and FullFilter.
|
|
|
|
// These two are both used in BlockBasedTable. The first one contains a filter
|
|
|
|
// for a part of the keys in the sst file; the second contains a filter for all keys
|
|
|
|
// in sst file.
|
2012-04-17 17:36:46 +02:00
|
|
|
|
2013-10-05 07:32:05 +02:00
|
|
|
#pragma once
|
2013-11-13 07:46:51 +01:00
|
|
|
|
2014-08-16 00:05:09 +02:00
|
|
|
#include <memory>
|
2012-04-17 17:36:46 +02:00
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
2014-09-08 19:37:05 +02:00
|
|
|
#include "rocksdb/options.h"
|
2013-08-23 17:38:13 +02:00
|
|
|
#include "rocksdb/slice.h"
|
2014-08-16 00:05:09 +02:00
|
|
|
#include "rocksdb/slice_transform.h"
|
2014-08-25 23:22:05 +02:00
|
|
|
#include "rocksdb/table.h"
|
2014-08-16 00:05:09 +02:00
|
|
|
#include "util/hash.h"
|
|
|
|
#include "format.h"
|
2012-04-17 17:36:46 +02:00
|
|
|
|
2013-10-04 06:49:15 +02:00
|
|
|
namespace rocksdb {
|
2012-04-17 17:36:46 +02:00
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
// Default value of the `block_offset` arguments of FilterBlockReader's query
// methods: the largest value representable in a uint64_t.
// UINT64_MAX comes from <stdint.h>, which this header already includes, and
// matches the declared type exactly (ULLONG_MAX needs <limits.h> and describes
// `unsigned long long` rather than `uint64_t`).
const uint64_t kNotValid = UINT64_MAX;
|
2014-08-16 00:05:09 +02:00
|
|
|
// Forward declaration only -- nothing in this header needs the complete type.
class FilterPolicy;
|
2012-04-17 17:36:46 +02:00
|
|
|
|
|
|
|
// A FilterBlockBuilder is used to construct all of the filters for a
|
|
|
|
// particular Table. It generates a single string which is stored as
|
|
|
|
// a special block in the Table.
|
|
|
|
//
|
|
|
|
// The sequence of calls to FilterBlockBuilder must match the regexp:
|
2014-09-08 19:37:05 +02:00
|
|
|
// (StartBlock Add*)* Finish
|
|
|
|
//
|
|
|
|
// BlockBased/Full FilterBlock would be called in the same way.
|
2012-04-17 17:36:46 +02:00
|
|
|
class FilterBlockBuilder {
|
|
|
|
public:
|
2014-09-08 19:37:05 +02:00
|
|
|
explicit FilterBlockBuilder() {}
|
|
|
|
virtual ~FilterBlockBuilder() {}
|
2012-04-17 17:36:46 +02:00
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
virtual bool IsBlockBased() = 0; // If is blockbased filter
|
|
|
|
virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter
|
|
|
|
virtual void Add(const Slice& key) = 0; // Add a key to current filter
|
2018-03-22 06:56:48 +01:00
|
|
|
virtual size_t NumAdded() const = 0; // Number of keys added
|
2017-03-07 22:48:02 +01:00
|
|
|
Slice Finish() { // Generate Filter
|
|
|
|
const BlockHandle empty_handle;
|
|
|
|
Status dont_care_status;
|
|
|
|
auto ret = Finish(empty_handle, &dont_care_status);
|
|
|
|
assert(dont_care_status.ok());
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0;
|
2012-04-17 17:36:46 +02:00
|
|
|
|
|
|
|
private:
|
|
|
|
// No copying allowed
|
|
|
|
FilterBlockBuilder(const FilterBlockBuilder&);
|
|
|
|
void operator=(const FilterBlockBuilder&);
|
|
|
|
};
|
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
// A FilterBlockReader is used to parse filter from SST table.
|
|
|
|
// KeyMayMatch and PrefixMayMatch would trigger filter checking
|
|
|
|
//
|
|
|
|
// BlockBased/Full FilterBlock would be called in the same way.
|
2012-04-17 17:36:46 +02:00
|
|
|
class FilterBlockReader {
|
|
|
|
public:
|
2016-06-10 00:48:45 +02:00
|
|
|
explicit FilterBlockReader()
|
|
|
|
: whole_key_filtering_(true), size_(0), statistics_(nullptr) {}
|
|
|
|
explicit FilterBlockReader(size_t s, Statistics* stats,
|
|
|
|
bool _whole_key_filtering)
|
|
|
|
: whole_key_filtering_(_whole_key_filtering),
|
|
|
|
size_(s),
|
|
|
|
statistics_(stats) {}
|
2014-09-08 19:37:05 +02:00
|
|
|
virtual ~FilterBlockReader() {}
|
2013-11-13 07:46:51 +01:00
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
virtual bool IsBlockBased() = 0; // If is blockbased filter
|
2017-03-22 17:11:23 +01:00
|
|
|
/**
|
|
|
|
* If no_io is set, then it returns true if it cannot answer the query without
|
|
|
|
* reading data from disk. This is used in PartitionedFilterBlockReader to
|
|
|
|
* avoid reading partitions that are not in block cache already
|
|
|
|
*
|
|
|
|
* Normally filters are built on only the user keys and the InternalKey is not
|
|
|
|
* needed for a query. The index in PartitionedFilterBlockReader however is
|
|
|
|
* built upon InternalKey and must be provided via const_ikey_ptr when running
|
|
|
|
* queries.
|
|
|
|
*/
|
2018-05-21 23:33:55 +02:00
|
|
|
virtual bool KeyMayMatch(const Slice& key,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset = kNotValid,
|
2017-03-22 17:11:23 +01:00
|
|
|
const bool no_io = false,
|
|
|
|
const Slice* const const_ikey_ptr = nullptr) = 0;
|
2018-05-21 23:33:55 +02:00
|
|
|
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
|
|
|
virtual void KeysMayMatch(MultiGetRange* range,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset = kNotValid,
|
|
|
|
const bool no_io = false) {
|
|
|
|
for (auto iter = range->begin(); iter != range->end(); ++iter) {
|
|
|
|
const Slice ukey = iter->ukey;
|
|
|
|
const Slice ikey = iter->ikey;
|
|
|
|
if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey)) {
|
|
|
|
range->SkipKey(iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-22 17:11:23 +01:00
|
|
|
/**
|
|
|
|
* no_io and const_ikey_ptr here means the same as in KeyMayMatch
|
|
|
|
*/
|
2014-09-08 19:37:05 +02:00
|
|
|
virtual bool PrefixMayMatch(const Slice& prefix,
|
2018-05-21 23:33:55 +02:00
|
|
|
const SliceTransform* prefix_extractor,
|
2017-03-22 17:11:23 +01:00
|
|
|
uint64_t block_offset = kNotValid,
|
|
|
|
const bool no_io = false,
|
|
|
|
const Slice* const const_ikey_ptr = nullptr) = 0;
|
2018-05-21 23:33:55 +02:00
|
|
|
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 23:24:09 +02:00
|
|
|
virtual void PrefixesMayMatch(MultiGetRange* range,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
uint64_t block_offset = kNotValid,
|
|
|
|
const bool no_io = false) {
|
|
|
|
for (auto iter = range->begin(); iter != range->end(); ++iter) {
|
|
|
|
const Slice ukey = iter->ukey;
|
|
|
|
const Slice ikey = iter->ikey;
|
|
|
|
if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor,
|
|
|
|
block_offset, no_io, &ikey)) {
|
|
|
|
range->SkipKey(iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
virtual size_t ApproximateMemoryUsage() const = 0;
|
2016-06-03 19:47:47 +02:00
|
|
|
virtual size_t size() const { return size_; }
|
|
|
|
virtual Statistics* statistics() const { return statistics_; }
|
2013-08-13 23:04:56 +02:00
|
|
|
|
2016-06-10 00:48:45 +02:00
|
|
|
bool whole_key_filtering() const { return whole_key_filtering_; }
|
|
|
|
|
2014-12-23 22:24:07 +01:00
|
|
|
// convert this object to a human readable form
|
|
|
|
virtual std::string ToString() const {
|
|
|
|
std::string error_msg("Unsupported filter \n");
|
|
|
|
return error_msg;
|
|
|
|
}
|
|
|
|
|
2018-05-21 23:33:55 +02:00
|
|
|
virtual void CacheDependencies(bool /*pin*/,
|
|
|
|
const SliceTransform* /*prefix_extractor*/) {}
|
2017-08-23 16:48:54 +02:00
|
|
|
|
2018-06-27 00:56:26 +02:00
|
|
|
virtual bool RangeMayExist(
|
|
|
|
const Slice* /*iterate_upper_bound*/, const Slice& user_key,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
const Comparator* /*comparator*/, const Slice* const const_ikey_ptr,
|
|
|
|
bool* filter_checked, bool /*need_upper_bound_check*/) {
|
|
|
|
*filter_checked = true;
|
|
|
|
Slice prefix = prefix_extractor->Transform(user_key);
|
|
|
|
return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false,
|
|
|
|
const_ikey_ptr);
|
|
|
|
}
|
|
|
|
|
2016-06-10 00:48:45 +02:00
|
|
|
protected:
|
|
|
|
bool whole_key_filtering_;
|
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
private:
|
|
|
|
// No copying allowed
|
|
|
|
FilterBlockReader(const FilterBlockReader&);
|
|
|
|
void operator=(const FilterBlockReader&);
|
2016-06-03 19:47:47 +02:00
|
|
|
size_t size_;
|
|
|
|
Statistics* statistics_;
|
2017-03-22 17:11:23 +01:00
|
|
|
int level_ = -1;
|
2012-04-17 17:36:46 +02:00
|
|
|
};
|
|
|
|
|
2014-09-08 19:37:05 +02:00
|
|
|
} // namespace rocksdb
|