rocksdb/db/merge_helper.cc
mrambacher 12f1137355 Add a SystemClock class to capture the time functions of an Env (#7858)
Summary:
Introduces and uses a SystemClock class to RocksDB.  This class contains the time-related functions of an Env and these functions can be redirected from the Env to the SystemClock.

Many of the places that used an Env (Timer, PerfStepTimer, RepeatableThread, RateLimiter, WriteController) for time-related functions have been changed to use SystemClock instead.  There are likely more places that can be changed, but this is a start to show what can/should be done.  Over time it would be nice to migrate most (if not all) of the uses of the time functions from the Env to the SystemClock.

There are several Env classes that implement these functions.  Most of these have not been converted yet to SystemClock implementations; that will come in a subsequent PR.  It would be good to unify many of the Mock Timer implementations, so that they behave similarly and be tested similarly (some override Sleep, some use a MockSleep, etc).

Additionally, this change will allow new methods to be introduced to the SystemClock (like https://github.com/facebook/rocksdb/issues/7101 WaitFor) in a consistent manner across a smaller number of classes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7858

Reviewed By: pdillinger

Differential Revision: D26006406

Pulled By: mrambacher

fbshipit-source-id: ed10a8abbdab7ff2e23d69d85bd25b3e7e899e90
2021-01-25 22:09:11 -08:00

423 lines
16 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/merge_helper.h"
#include <string>
#include "db/dbformat.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/likely.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
#include "rocksdb/system_clock.h"
#include "table/format.h"
#include "table/internal_iterator.h"
namespace ROCKSDB_NAMESPACE {
MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
const MergeOperator* user_merge_operator,
const CompactionFilter* compaction_filter,
Logger* logger, bool assert_valid_internal_key,
SequenceNumber latest_snapshot,
const SnapshotChecker* snapshot_checker, int level,
Statistics* stats,
const std::atomic<bool>* shutting_down)
: env_(env),
clock_(env->GetSystemClock()),
user_comparator_(user_comparator),
user_merge_operator_(user_merge_operator),
compaction_filter_(compaction_filter),
shutting_down_(shutting_down),
logger_(logger),
assert_valid_internal_key_(assert_valid_internal_key),
allow_single_operand_(false),
latest_snapshot_(latest_snapshot),
snapshot_checker_(snapshot_checker),
level_(level),
keys_(),
filter_timer_(clock_),
total_filter_time_(0U),
stats_(stats) {
assert(user_comparator_ != nullptr);
if (user_merge_operator_) {
allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
}
}
Status MergeHelper::TimedFullMerge(
const MergeOperator* merge_operator, const Slice& key, const Slice* value,
const std::vector<Slice>& operands, std::string* result, Logger* logger,
Statistics* statistics, const std::shared_ptr<SystemClock>& clock,
Slice* result_operand, bool update_num_ops_stats) {
assert(merge_operator != nullptr);
if (operands.size() == 0) {
assert(value != nullptr && result != nullptr);
result->assign(value->data(), value->size());
return Status::OK();
}
if (update_num_ops_stats) {
RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
static_cast<uint64_t>(operands.size()));
}
bool success;
Slice tmp_result_operand(nullptr, 0);
const MergeOperator::MergeOperationInput merge_in(key, value, operands,
logger);
MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
{
// Setup to time the merge
StopWatchNano timer(clock, statistics != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
// Do the merge
success = merge_operator->FullMergeV2(merge_in, &merge_out);
if (tmp_result_operand.data()) {
// FullMergeV2 result is an existing operand
if (result_operand != nullptr) {
*result_operand = tmp_result_operand;
} else {
result->assign(tmp_result_operand.data(), tmp_result_operand.size());
}
} else if (result_operand) {
*result_operand = Slice(nullptr, 0);
}
RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
statistics ? timer.ElapsedNanos() : 0);
}
if (!success) {
RecordTick(statistics, NUMBER_MERGE_FAILURES);
return Status::Corruption("Error: Could not perform merge.");
}
return Status::OK();
}
// PRE: iter points to the first merge type entry
// POST: iter points to the first entry beyond the merge process (or the end)
// keys_, operands_ are updated to reflect the merge result.
// keys_ stores the list of keys encountered while merging.
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
//
// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
// and just pass the StripeRep corresponding to the stripe being merged.
Status MergeHelper::MergeUntil(InternalIterator* iter,
CompactionRangeDelAggregator* range_del_agg,
const SequenceNumber stop_before,
const bool at_bottom,
const bool allow_data_in_errors) {
// Get a copy of the internal key, before it's invalidated by iter->Next()
// Also maintain the list of merge operands seen.
assert(HasOperator());
keys_.clear();
merge_context_.Clear();
has_compaction_filter_skip_until_ = false;
assert(user_merge_operator_);
bool first_key = true;
// We need to parse the internal key again as the parsed key is
// backed by the internal key!
// Assume no internal key corruption as it has been successfully parsed
// by the caller.
// original_key_is_iter variable is just caching the information:
// original_key_is_iter == (iter->key().ToString() == original_key)
bool original_key_is_iter = true;
std::string original_key = iter->key().ToString();
// Important:
// orig_ikey is backed by original_key if keys_.empty()
// orig_ikey is backed by keys_.back() if !keys_.empty()
ParsedInternalKey orig_ikey;
Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors);
assert(s.ok());
if (!s.ok()) return s;
bool hit_the_next_user_key = false;
for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
if (IsShuttingDown()) {
s = Status::ShutdownInProgress();
return s;
}
ParsedInternalKey ikey;
assert(keys_.size() == merge_context_.GetNumOperands());
Status pik_status =
ParseInternalKey(iter->key(), &ikey, allow_data_in_errors);
if (!pik_status.ok()) {
// stop at corrupted key
if (assert_valid_internal_key_) {
return pik_status;
}
break;
} else if (first_key) {
assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
first_key = false;
} else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
// hit a different user key, stop right here
hit_the_next_user_key = true;
break;
} else if (stop_before > 0 && ikey.sequence <= stop_before &&
LIKELY(snapshot_checker_ == nullptr ||
snapshot_checker_->CheckInSnapshot(ikey.sequence,
stop_before) !=
SnapshotCheckerResult::kNotInSnapshot)) {
// hit an entry that's possibly visible by the previous snapshot, can't
// touch that
break;
}
// At this point we are guaranteed that we need to process this key.
assert(IsValueType(ikey.type));
if (ikey.type != kTypeMerge) {
// hit a put/delete/single delete
// => merge the put value or a nullptr with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Success!
// If there are no operands, just return the Status::OK(). That will cause
// the compaction iterator to write out the key we're currently at, which
// is the put/delete we just encountered.
if (keys_.empty()) {
return s;
}
// TODO(noetzli) If the merge operator returns false, we are currently
// (almost) silently dropping the put/delete. That's probably not what we
// want. Also if we're in compaction and it's a put, it would be nice to
// run compaction filter on it.
const Slice val = iter->value();
const Slice* val_ptr;
if (kTypeValue == ikey.type &&
(range_del_agg == nullptr ||
!range_del_agg->ShouldDelete(
ikey, RangeDelPositioningMode::kForwardTraversal))) {
val_ptr = &val;
} else {
val_ptr = nullptr;
}
std::string merge_result;
s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr,
merge_context_.GetOperands(), &merge_result, logger_,
stats_, clock_);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (s.ok()) {
// The original key encountered
original_key = std::move(keys_.back());
orig_ikey.type = kTypeValue;
UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
keys_.clear();
merge_context_.Clear();
keys_.emplace_front(std::move(original_key));
merge_context_.PushOperand(merge_result);
}
// move iter to the next entry
iter->Next();
return s;
} else {
// hit a merge
// => if there is a compaction filter, apply it.
// => check for range tombstones covering the operand
// => merge the operand into the front of the operands_ list
// if not filtered
// => then continue because we haven't yet seen a Put/Delete.
//
// Keep queuing keys and operands until we either meet a put / delete
// request or later did a partial merge.
Slice value_slice = iter->value();
// add an operand to the list if:
// 1) it's included in one of the snapshots. in that case we *must* write
// it out, no matter what compaction filter says
// 2) it's not filtered by a compaction filter
CompactionFilter::Decision filter =
ikey.sequence <= latest_snapshot_
? CompactionFilter::Decision::kKeep
: FilterMerge(orig_ikey.user_key, value_slice);
if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
range_del_agg != nullptr &&
range_del_agg->ShouldDelete(
iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
filter = CompactionFilter::Decision::kRemove;
}
if (filter == CompactionFilter::Decision::kKeep ||
filter == CompactionFilter::Decision::kChangeValue) {
if (original_key_is_iter) {
// this is just an optimization that saves us one memcpy
keys_.push_front(std::move(original_key));
} else {
keys_.push_front(iter->key().ToString());
}
if (keys_.size() == 1) {
// we need to re-anchor the orig_ikey because it was anchored by
// original_key before
pik_status =
ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors);
pik_status.PermitUncheckedError();
assert(pik_status.ok());
}
if (filter == CompactionFilter::Decision::kKeep) {
merge_context_.PushOperand(
value_slice, iter->IsValuePinned() /* operand_pinned */);
} else { // kChangeValue
// Compaction filter asked us to change the operand from value_slice
// to compaction_filter_value_.
merge_context_.PushOperand(compaction_filter_value_, false);
}
} else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
// Compaction filter asked us to remove this key altogether
// (not just this operand), along with some keys following it.
keys_.clear();
merge_context_.Clear();
has_compaction_filter_skip_until_ = true;
return s;
}
}
}
if (merge_context_.GetNumOperands() == 0) {
// we filtered out all the merge operands
return s;
}
// We are sure we have seen this key's entire history if:
// at_bottom == true (this does not necessarily mean it is the bottommost
// layer, but rather that we are confident the key does not appear on any of
// the lower layers, at_bottom == false doesn't mean it does appear, just
// that we can't be sure, see Compaction::IsBottommostLevel for details)
// AND
// we have either encountered another key or end of key history on this
// layer.
//
// When these conditions are true we are able to merge all the keys
// using full merge.
//
// For these cases we are not sure about, we simply miss the opportunity
// to combine the keys. Since VersionSet::SetupOtherInputs() always makes
// sure that all merge-operands on the same level get compacted together,
// this will simply lead to these merge operands moving to the next level.
bool surely_seen_the_beginning =
(hit_the_next_user_key || !iter->Valid()) && at_bottom;
if (surely_seen_the_beginning) {
// do a final merge with nullptr as the existing value and say
// bye to the merge type (it's now converted to a Put)
assert(kTypeMerge == orig_ikey.type);
assert(merge_context_.GetNumOperands() >= 1);
assert(merge_context_.GetNumOperands() == keys_.size());
std::string merge_result;
s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr,
merge_context_.GetOperands(), &merge_result, logger_,
stats_, clock_);
if (s.ok()) {
// The original key encountered
// We are certain that keys_ is not empty here (see assertions couple of
// lines before).
original_key = std::move(keys_.back());
orig_ikey.type = kTypeValue;
UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
keys_.clear();
merge_context_.Clear();
keys_.emplace_front(std::move(original_key));
merge_context_.PushOperand(merge_result);
}
} else {
// We haven't seen the beginning of the key nor a Put/Delete.
// Attempt to use the user's associative merge function to
// merge the stacked merge operands into a single operand.
s = Status::MergeInProgress();
if (merge_context_.GetNumOperands() >= 2 ||
(allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
bool merge_success = false;
std::string merge_result;
{
StopWatchNano timer(clock_, stats_ != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
merge_success = user_merge_operator_->PartialMergeMulti(
orig_ikey.user_key,
std::deque<Slice>(merge_context_.GetOperands().begin(),
merge_context_.GetOperands().end()),
&merge_result, logger_);
RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
stats_ ? timer.ElapsedNanosSafe() : 0);
}
if (merge_success) {
// Merging of operands (associative merge) was successful.
// Replace operands with the merge result
merge_context_.Clear();
merge_context_.PushOperand(merge_result);
keys_.erase(keys_.begin(), keys_.end() - 1);
}
}
}
return s;
}
MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
: merge_helper_(merge_helper) {
it_keys_ = merge_helper_->keys().rend();
it_values_ = merge_helper_->values().rend();
}
void MergeOutputIterator::SeekToFirst() {
const auto& keys = merge_helper_->keys();
const auto& values = merge_helper_->values();
assert(keys.size() == values.size());
it_keys_ = keys.rbegin();
it_values_ = values.rbegin();
}
void MergeOutputIterator::Next() {
++it_keys_;
++it_values_;
}
CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
const Slice& value_slice) {
if (compaction_filter_ == nullptr) {
return CompactionFilter::Decision::kKeep;
}
if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
filter_timer_.Start();
}
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
auto ret = compaction_filter_->FilterV2(
level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
&compaction_filter_value_, compaction_filter_skip_until_.rep());
if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
user_key) <= 0) {
// Invalid skip_until returned from compaction filter.
// Keep the key as per FilterV2 documentation.
ret = CompactionFilter::Decision::kKeep;
} else {
compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
kValueTypeForSeek);
}
}
if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
total_filter_time_ += filter_timer_.ElapsedNanosSafe();
}
return ret;
}
} // namespace ROCKSDB_NAMESPACE