2019-04-18 10:51:19 -07:00
|
|
|
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "table/internal_iterator.h"
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
|
|
|
|
// Iterator over a vector of keys/values
|
|
|
|
class VectorIterator : public InternalIterator {
|
|
|
|
public:
|
|
|
|
VectorIterator(std::vector<std::string> keys, std::vector<std::string> values,
|
|
|
|
const InternalKeyComparator* icmp)
|
|
|
|
: keys_(std::move(keys)),
|
|
|
|
values_(std::move(values)),
|
|
|
|
indexed_cmp_(icmp, &keys_),
|
2019-09-18 15:22:46 -07:00
|
|
|
current_(0) {
|
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, RangeDelAggregator::AddTombstones's complexity is based on
the number of tombstones that are currently stored in it, which is wasteful in
the Get case, where we only need to know the highest sequence number of range
tombstones that cover the key from higher levels, and compute the highest covering
sequence number at the current level. This change introduces this optimization, and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
2018-10-24 12:29:29 -07:00
|
|
|
assert(keys_.size() == values_.size());
|
|
|
|
|
|
|
|
indices_.reserve(keys_.size());
|
|
|
|
for (size_t i = 0; i < keys_.size(); i++) {
|
|
|
|
indices_.push_back(i);
|
|
|
|
}
|
|
|
|
std::sort(indices_.begin(), indices_.end(), indexed_cmp_);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual bool Valid() const override {
|
|
|
|
return !indices_.empty() && current_ < indices_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void SeekToFirst() override { current_ = 0; }
|
|
|
|
virtual void SeekToLast() override { current_ = indices_.size() - 1; }
|
|
|
|
|
|
|
|
virtual void Seek(const Slice& target) override {
|
|
|
|
current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
|
|
|
|
indexed_cmp_) -
|
|
|
|
indices_.begin();
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void SeekForPrev(const Slice& target) override {
|
|
|
|
current_ = std::lower_bound(indices_.begin(), indices_.end(), target,
|
|
|
|
indexed_cmp_) -
|
|
|
|
indices_.begin();
|
|
|
|
if (!Valid()) {
|
|
|
|
SeekToLast();
|
|
|
|
} else {
|
|
|
|
Prev();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void Next() override { current_++; }
|
|
|
|
virtual void Prev() override { current_--; }
|
|
|
|
|
|
|
|
virtual Slice key() const override {
|
|
|
|
return Slice(keys_[indices_[current_]]);
|
|
|
|
}
|
|
|
|
virtual Slice value() const override {
|
|
|
|
return Slice(values_[indices_[current_]]);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status status() const override { return Status::OK(); }
|
|
|
|
|
|
|
|
virtual bool IsKeyPinned() const override { return true; }
|
|
|
|
virtual bool IsValuePinned() const override { return true; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
struct IndexedKeyComparator {
|
|
|
|
IndexedKeyComparator(const InternalKeyComparator* c,
|
|
|
|
const std::vector<std::string>* ks)
|
|
|
|
: cmp(c), keys(ks) {}
|
|
|
|
|
|
|
|
bool operator()(size_t a, size_t b) const {
|
|
|
|
return cmp->Compare((*keys)[a], (*keys)[b]) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool operator()(size_t a, const Slice& b) const {
|
|
|
|
return cmp->Compare((*keys)[a], b) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool operator()(const Slice& a, size_t b) const {
|
|
|
|
return cmp->Compare(a, (*keys)[b]) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
const InternalKeyComparator* cmp;
|
|
|
|
const std::vector<std::string>* keys;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<std::string> keys_;
|
|
|
|
std::vector<std::string> values_;
|
|
|
|
IndexedKeyComparator indexed_cmp_;
|
|
|
|
std::vector<size_t> indices_;
|
|
|
|
size_t current_;
|
|
|
|
};
|
|
|
|
|
2020-02-20 12:07:53 -08:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|