72942ad7a4
Summary: The origin commit #3635 will hurt performance for users who aren't using range deletions, because unneeded std::set operations, so it was reverted by commit 44653c7b7aabe821e671946e732dda7ae6b43d1b. (see #3672) To fix this, move the set to and add a check in , i.e., file will be added only if is non-nullptr. The db_bench command which find the performance regression: > ./db_bench --benchmarks=fillrandom,seekrandomwhilewriting --threads=1 --num=1000000 --reads=150000 --key_size=66 > --value_size=1262 --statistics=0 --compression_ratio=0.5 --histogram=1 --seek_nexts=1 --stats_per_interval=1 > --stats_interval_seconds=600 --max_background_flushes=4 --num_multi_db=1 --max_background_compactions=16 --seed=1522388277 > -write_buffer_size=1048576 --level0_file_num_compaction_trigger=10000 --compression_type=none Before and after the modification, I re-run this command on the machine, the results of are as follows: **fillrandom** Table | P50 | P75 | P99 | P99.9 | P99.99 | ---- | --- | --- | --- | ----- | ------ | before commit | 5.92 | 8.57 | 19.63 | 980.97 | 12196.00 | after commit | 5.91 | 8.55 | 19.34 | 965.56 | 13513.56 | **seekrandomwhilewriting** Table | P50 | P75 | P99 | P99.9 | P99.99 | ---- | --- | --- | --- | ----- | ------ | before commit | 1418.62 | 1867.01 | 3823.28 | 4980.99 | 9240.00 | after commit | 1450.54 | 1880.61 | 3962.87 | 5429.60 | 7542.86 | Closes https://github.com/facebook/rocksdb/pull/3800 Differential Revision: D7874245 Pulled By: ajkr fbshipit-source-id: 2e8bec781b3f7399246babd66395c88619534a17
189 lines
8.6 KiB
C++
189 lines
8.6 KiB
C++
// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <map>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "db/compaction_iteration_stats.h"
|
|
#include "db/dbformat.h"
|
|
#include "db/pinned_iterators_manager.h"
|
|
#include "db/version_edit.h"
|
|
#include "include/rocksdb/comparator.h"
|
|
#include "include/rocksdb/types.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/scoped_arena_iterator.h"
|
|
#include "table/table_builder.h"
|
|
#include "util/kv_map.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
// A RangeDelAggregator aggregates range deletion tombstones as they are
|
|
// encountered in memtables/SST files. It provides methods that check whether a
|
|
// key is covered by range tombstones or write the relevant tombstones to a new
|
|
// SST file.
|
|
class RangeDelAggregator {
|
|
public:
|
|
// @param snapshots These are used to organize the tombstones into snapshot
|
|
// stripes, which is the seqnum range between consecutive snapshots,
|
|
// including the higher snapshot and excluding the lower one. Currently,
|
|
// this is used by ShouldDelete() to prevent deletion of keys that are
|
|
// covered by range tombstones in other snapshot stripes. This constructor
|
|
// is used for writes (flush/compaction). All DB snapshots are provided
|
|
// such that no keys are removed that are uncovered according to any DB
|
|
// snapshot.
|
|
// Note this overload does not lazily initialize Rep.
|
|
RangeDelAggregator(const InternalKeyComparator& icmp,
|
|
const std::vector<SequenceNumber>& snapshots,
|
|
bool collapse_deletions = true);
|
|
|
|
// @param upper_bound Similar to snapshots above, except with a single
|
|
// snapshot, which allows us to store the snapshot on the stack and defer
|
|
// initialization of heap-allocating members (in Rep) until the first range
|
|
// deletion is encountered. This constructor is used in case of reads (get/
|
|
// iterator), for which only the user snapshot (upper_bound) is provided
|
|
// such that the seqnum space is divided into two stripes. Only the older
|
|
// stripe will be used by ShouldDelete().
|
|
RangeDelAggregator(const InternalKeyComparator& icmp,
|
|
SequenceNumber upper_bound,
|
|
bool collapse_deletions = false);
|
|
|
|
// We maintain position in the tombstone map across calls to ShouldDelete. The
|
|
// caller may wish to specify a mode to optimize positioning the iterator
|
|
// during the next call to ShouldDelete. The non-kFullScan modes are only
|
|
// available when deletion collapsing is enabled.
|
|
//
|
|
// For example, if we invoke Next() on an iterator, kForwardTraversal should
|
|
// be specified to advance one-by-one through deletions until one is found
|
|
// with its interval containing the key. This will typically be faster than
|
|
// doing a full binary search (kBinarySearch).
|
|
enum RangePositioningMode {
|
|
kFullScan, // used iff collapse_deletions_ == false
|
|
kForwardTraversal,
|
|
kBackwardTraversal,
|
|
kBinarySearch,
|
|
};
|
|
|
|
// Returns whether the key should be deleted, which is the case when it is
|
|
// covered by a range tombstone residing in the same snapshot stripe.
|
|
// @param mode If collapse_deletions_ is true, this dictates how we will find
|
|
// the deletion whose interval contains this key. Otherwise, its
|
|
// value must be kFullScan indicating linear scan from beginning..
|
|
bool ShouldDelete(const ParsedInternalKey& parsed,
|
|
RangePositioningMode mode = kFullScan) {
|
|
if (rep_ == nullptr) {
|
|
return false;
|
|
}
|
|
return ShouldDeleteImpl(parsed, mode);
|
|
}
|
|
bool ShouldDelete(const Slice& internal_key,
|
|
RangePositioningMode mode = kFullScan) {
|
|
if (rep_ == nullptr) {
|
|
return false;
|
|
}
|
|
return ShouldDeleteImpl(internal_key, mode);
|
|
}
|
|
bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
|
|
RangePositioningMode mode = kFullScan);
|
|
bool ShouldDeleteImpl(const Slice& internal_key,
|
|
RangePositioningMode mode = kFullScan);
|
|
|
|
// Checks whether range deletions cover any keys between `start` and `end`,
|
|
// inclusive.
|
|
//
|
|
// @param start User key representing beginning of range to check for overlap.
|
|
// @param end User key representing end of range to check for overlap. This
|
|
// argument is inclusive, so the existence of a range deletion covering
|
|
// `end` causes this to return true.
|
|
bool IsRangeOverlapped(const Slice& start, const Slice& end);
|
|
|
|
bool ShouldAddTombstones(bool bottommost_level = false);
|
|
|
|
// Adds tombstones to the tombstone aggregation structure maintained by this
|
|
// object.
|
|
// @return non-OK status if any of the tombstone keys are corrupted.
|
|
Status AddTombstones(std::unique_ptr<InternalIterator> input);
|
|
|
|
// Resets iterators maintained across calls to ShouldDelete(). This may be
|
|
// called when the tombstones change, or the owner may call explicitly, e.g.,
|
|
// if it's an iterator that just seeked to an arbitrary position. The effect
|
|
// of invalidation is that the following call to ShouldDelete() will binary
|
|
// search for its tombstone.
|
|
void InvalidateTombstoneMapPositions();
|
|
|
|
// Writes tombstones covering a range to a table builder.
|
|
// @param extend_before_min_key If true, the range of tombstones to be added
|
|
// to the TableBuilder starts from the beginning of the key-range;
|
|
// otherwise, it starts from meta->smallest.
|
|
// @param lower_bound/upper_bound Any range deletion with [start_key, end_key)
|
|
// that overlaps the target range [*lower_bound, *upper_bound) is added to
|
|
// the builder. If lower_bound is nullptr, the target range extends
|
|
// infinitely to the left. If upper_bound is nullptr, the target range
|
|
// extends infinitely to the right. If both are nullptr, the target range
|
|
// extends infinitely in both directions, i.e., all range deletions are
|
|
// added to the builder.
|
|
// @param meta The file's metadata. We modify the begin and end keys according
|
|
// to the range tombstones added to this file such that the read path does
|
|
// not miss range tombstones that cover gaps before/after/between files in
|
|
// a level. lower_bound/upper_bound above constrain how far file boundaries
|
|
// can be extended.
|
|
// @param bottommost_level If true, we will filter out any tombstones
|
|
// belonging to the oldest snapshot stripe, because all keys potentially
|
|
// covered by this tombstone are guaranteed to have been deleted by
|
|
// compaction.
|
|
void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
|
|
const Slice* upper_bound, FileMetaData* meta,
|
|
CompactionIterationStats* range_del_out_stats = nullptr,
|
|
bool bottommost_level = false);
|
|
bool IsEmpty();
|
|
bool AddFile(uint64_t file_number);
|
|
|
|
private:
|
|
// Maps tombstone user start key -> tombstone object
|
|
typedef std::multimap<Slice, RangeTombstone, stl_wrappers::LessOfComparator>
|
|
TombstoneMap;
|
|
// Also maintains position in TombstoneMap last seen by ShouldDelete(). The
|
|
// end iterator indicates invalidation (e.g., if AddTombstones() changes the
|
|
// underlying map). End iterator cannot be invalidated.
|
|
struct PositionalTombstoneMap {
|
|
explicit PositionalTombstoneMap(TombstoneMap _raw_map)
|
|
: raw_map(std::move(_raw_map)), iter(raw_map.end()) {}
|
|
PositionalTombstoneMap(const PositionalTombstoneMap&) = delete;
|
|
PositionalTombstoneMap(PositionalTombstoneMap&& other)
|
|
: raw_map(std::move(other.raw_map)), iter(raw_map.end()) {}
|
|
|
|
TombstoneMap raw_map;
|
|
TombstoneMap::const_iterator iter;
|
|
};
|
|
|
|
// Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e.,
|
|
// their seqnums are greater than the next smaller snapshot's seqnum.
|
|
typedef std::map<SequenceNumber, PositionalTombstoneMap> StripeMap;
|
|
|
|
struct Rep {
|
|
StripeMap stripe_map_;
|
|
PinnedIteratorsManager pinned_iters_mgr_;
|
|
std::set<uint64_t> added_files_;
|
|
};
|
|
// Initializes rep_ lazily. This aggregator object is constructed for every
|
|
// read, so expensive members should only be created when necessary, i.e.,
|
|
// once the first range deletion is encountered.
|
|
void InitRep(const std::vector<SequenceNumber>& snapshots);
|
|
|
|
PositionalTombstoneMap& GetPositionalTombstoneMap(SequenceNumber seq);
|
|
Status AddTombstone(RangeTombstone tombstone);
|
|
|
|
SequenceNumber upper_bound_;
|
|
std::unique_ptr<Rep> rep_;
|
|
const InternalKeyComparator& icmp_;
|
|
// collapse range deletions so they're binary searchable
|
|
const bool collapse_deletions_;
|
|
};
|
|
|
|
} // namespace rocksdb
|