WritePrepared Txn: Support merge operator

Summary:
CompactionIterator invoke MergeHelper::MergeUntil() to do partial merge between snapshot boundaries. Previously it only depend on sequence number to tell snapshot boundary, but we also need to make use of snapshot_checker to verify visibility of the merge operands to the snapshots. For example, say there is a snapshot with seq = 2 but only can see data with seq <= 1. There are three merges, each with seq = 1, 2, 3. A correct compaction output would be (1),(2+3). Without taking snapshot_checker into account when generating merge result, compaction will generate output (1+2),(3).

By filtering uncommitted keys with read callback, the read path already take care of merges well and don't need additional updates.
Closes https://github.com/facebook/rocksdb/pull/3475

Differential Revision: D6926087

Pulled By: yiwu-arbug

fbshipit-source-id: 8f539d6f897cfe29b6dc27a8992f68c2a629d40a
This commit is contained in:
Yi Wu 2018-02-09 14:43:56 -08:00 committed by Facebook Github Bot
parent 9fc72d6f16
commit fe228da0a9
7 changed files with 158 additions and 34 deletions

View File

@ -131,7 +131,8 @@ Status BuildTable(
MergeHelper merge(env, internal_comparator.user_comparator(),
ioptions.merge_operator, nullptr, ioptions.info_log,
true /* internal key corruption is not ok */,
snapshots.empty() ? 0 : snapshots.back());
snapshots.empty() ? 0 : snapshots.back(),
snapshot_checker);
CompactionIterator c_iter(
iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber,

View File

@ -559,10 +559,6 @@ void CompactionIterator::NextFromInput() {
// have hit (A)
// We encapsulate the merge related state machine in a different
// object to minimize change to the existing flow.
// In case snapshot_checker is present, we can probably merge further
// beyond prev_snapshot, since there could be more keys with sequence
// smaller than prev_snapshot, but reported by snapshot_checker as not
// visible by prev_snapshot. But it will make the logic more complicated.
Status s = merge_helper_->MergeUntil(input_, range_del_agg_,
prev_snapshot, bottommost_level_);
merge_out_iter_.SeekToFirst();

View File

@ -229,14 +229,15 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
compaction_proxy_->is_bottommost_level = bottommost_level;
compaction.reset(compaction_proxy_);
}
merge_helper_.reset(new MergeHelper(Env::Default(), cmp_, merge_op, filter,
nullptr, false, 0, 0, nullptr,
&shutting_down_));
bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
snapshot_checker_.reset(
new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
}
merge_helper_.reset(
new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
0 /*latest_snapshot*/, snapshot_checker_.get(),
0 /*level*/, nullptr /*statistics*/, &shutting_down_));
iter_.reset(new LoggingForwardVectorIterator(ks, vs));
iter_->SeekToFirst();
@ -775,17 +776,18 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
{"v4", "", "v1"}, 3 /*last_committed_seq*/);
}
TEST_F(CompactionIteratorWithSnapshotCheckerTest,
DISABLED_DedupSameSnapshot_Merge) {
TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
AddSnapshot(2, 1);
AddSnapshot(4, 3);
auto merge_op = MergeOperators::CreateStringAppendOperator();
RunTest(
{test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeMerge),
test::KeyStr("foo", 2, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
{"v4", "v3", "v2", "v1"},
{test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeMerge),
{test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
test::KeyStr("foo", 1, kTypeValue)},
{"v4", "v2,v3", "v1"}, 3 /*last_committed_seq*/, merge_op.get());
{"v5", "v4", "v3", "v2", "v1"},
{test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
{"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
}
TEST_F(CompactionIteratorWithSnapshotCheckerTest,
@ -886,8 +888,9 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
2 /*earliest_write_conflict_snapshot*/);
}
// Compaction filter should keep uncommitted key as-is, and trigger on the
// first committed version of a key.
// Compaction filter should keep uncommitted key as-is, and
// * Convert the latest velue to deletion, and/or
// * if latest value is a merge, apply filter to all suequent merges.
TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
std::unique_ptr<CompactionFilter> compaction_filter(
@ -915,22 +918,18 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
}
TEST_F(CompactionIteratorWithSnapshotCheckerTest,
DISABLED_CompactionFilter_PartialMerge) {
CompactionFilter_PartialMerge) {
std::shared_ptr<MergeOperator> merge_op =
MergeOperators::CreateStringAppendOperator();
std::unique_ptr<CompactionFilter> compaction_filter(
new FilterAllKeysCompactionFilter());
RunTest(
{test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
test::KeyStr("a", 1, kTypeMerge)},
{"v3", "v2", "v1"},
{test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeDeletion)},
{"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
compaction_filter.get());
RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
test::KeyStr("a", 1, kTypeMerge)},
{"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
}
TEST_F(CompactionIteratorWithSnapshotCheckerTest,
DISABLED_CompactionFilter_FullMerge) {
TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
std::shared_ptr<MergeOperator> merge_op =
MergeOperators::CreateStringAppendOperator();
std::unique_ptr<CompactionFilter> compaction_filter(
@ -939,7 +938,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest,
{test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
test::KeyStr("a", 1, kTypeValue)},
{"v3", "v2", "v1"},
{test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeDeletion)},
{test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
{"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
compaction_filter.get());
}

View File

@ -747,8 +747,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
compaction_filter, db_options_.info_log.get(),
false /* internal key corruption is expected */,
existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
compact_->compaction->level(), db_options_.statistics.get(),
shutting_down_);
snapshot_checker_, compact_->compaction->level(),
db_options_.statistics.get(), shutting_down_);
TEST_SYNC_POINT("CompactionJob::Run():Inprogress");

View File

@ -14,10 +14,43 @@
namespace rocksdb {
class TestReadCallback : public ReadCallback {
public:
TestReadCallback(SnapshotChecker* snapshot_checker,
SequenceNumber snapshot_seq)
: snapshot_checker_(snapshot_checker), snapshot_seq_(snapshot_seq) {}
bool IsCommitted(SequenceNumber seq) override {
return snapshot_checker_->IsInSnapshot(seq, snapshot_seq_);
}
private:
SnapshotChecker* snapshot_checker_;
SequenceNumber snapshot_seq_;
};
// Test merge operator functionality.
class DBMergeOperatorTest : public DBTestBase {
public:
DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {}
std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
const Slice& key,
const Snapshot* snapshot = nullptr) {
SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
: snapshot->GetSequenceNumber();
TestReadCallback read_callback(snapshot_checker, seq);
ReadOptions read_opt;
read_opt.snapshot = snapshot;
PinnableSlice value;
Status s =
dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value,
nullptr /*value_found*/, &read_callback);
if (!s.ok()) {
return s.ToString();
}
return value.ToString();
}
};
TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
@ -508,6 +541,93 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
}
#endif // ROCKSDB_LITE
TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
Options options = CurrentOptions();
options.merge_operator = MergeOperators::CreateStringAppendOperator();
DestroyAndReopen(options);
class TestSnapshotChecker : public SnapshotChecker {
bool IsInSnapshot(SequenceNumber seq,
SequenceNumber snapshot_seq) const override {
switch (snapshot_seq) {
case 0:
return seq == 0;
case 1:
return seq <= 1;
case 2:
// seq = 2 not visible to snapshot with seq = 2
return seq <= 1;
case 3:
return seq <= 3;
case 4:
// seq = 4 not visible to snpahost with seq = 4
return seq <= 3;
default:
// seq >=4 is uncommitted
return seq <= 4;
};
}
};
TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
dbfull()->SetSnapshotChecker(snapshot_checker);
std::string value;
ASSERT_OK(Merge("foo", "v1"));
ASSERT_EQ(1, db_->GetLatestSequenceNumber());
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
ASSERT_OK(Merge("foo", "v2"));
ASSERT_EQ(2, db_->GetLatestSequenceNumber());
// v2 is not visible to latest snapshot, which has seq = 2.
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
// Take a snapshot with seq = 2.
const Snapshot* snapshot1 = db_->GetSnapshot();
ASSERT_EQ(2, snapshot1->GetSequenceNumber());
// v2 is not visible to snapshot1, which has seq = 2
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
// Verify flush doesn't alter the result.
ASSERT_OK(Flush());
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
ASSERT_OK(Merge("foo", "v3"));
ASSERT_EQ(3, db_->GetLatestSequenceNumber());
ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
ASSERT_OK(Merge("foo", "v4"));
ASSERT_EQ(4, db_->GetLatestSequenceNumber());
// v4 is not visible to latest snapshot, which has seq = 4.
ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
const Snapshot* snapshot2 = db_->GetSnapshot();
ASSERT_EQ(4, snapshot2->GetSequenceNumber());
// v4 is not visible to snapshot2, which has seq = 4.
ASSERT_EQ("v1,v2,v3",
GetWithReadCallback(snapshot_checker, "foo", snapshot2));
// Verify flush doesn't alter the result.
ASSERT_OK(Flush());
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
ASSERT_EQ("v1,v2,v3",
GetWithReadCallback(snapshot_checker, "foo", snapshot2));
ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
ASSERT_OK(Merge("foo", "v5"));
ASSERT_EQ(5, db_->GetLatestSequenceNumber());
// v5 is uncommitted
ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
// full manual compaction.
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
// Verify compaction doesn't alter the result.
ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
ASSERT_EQ("v1,v2,v3",
GetWithReadCallback(snapshot_checker, "foo", snapshot2));
ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
db_->ReleaseSnapshot(snapshot1);
db_->ReleaseSnapshot(snapshot2);
}
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -5,12 +5,12 @@
#include "db/merge_helper.h"
#include <stdio.h>
#include <string>
#include "db/dbformat.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "port/likely.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"
@ -22,7 +22,8 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
const MergeOperator* user_merge_operator,
const CompactionFilter* compaction_filter,
Logger* logger, bool assert_valid_internal_key,
SequenceNumber latest_snapshot, int level,
SequenceNumber latest_snapshot,
const SnapshotChecker* snapshot_checker, int level,
Statistics* stats,
const std::atomic<bool>* shutting_down)
: env_(env),
@ -34,6 +35,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
assert_valid_internal_key_(assert_valid_internal_key),
allow_single_operand_(false),
latest_snapshot_(latest_snapshot),
snapshot_checker_(snapshot_checker),
level_(level),
keys_(),
filter_timer_(env_),
@ -158,7 +160,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
// hit a different user key, stop right here
hit_the_next_user_key = true;
break;
} else if (stop_before && ikey.sequence <= stop_before) {
} else if (stop_before > 0 && ikey.sequence <= stop_before &&
LIKELY(snapshot_checker_ == nullptr ||
snapshot_checker_->IsInSnapshot(ikey.sequence,
stop_before))) {
// hit an entry that's visible by the previous snapshot, can't touch that
break;
}

View File

@ -13,6 +13,7 @@
#include "db/dbformat.h"
#include "db/merge_context.h"
#include "db/range_del_aggregator.h"
#include "db/snapshot_checker.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
@ -33,7 +34,8 @@ class MergeHelper {
const MergeOperator* user_merge_operator,
const CompactionFilter* compaction_filter, Logger* logger,
bool assert_valid_internal_key, SequenceNumber latest_snapshot,
int level = 0, Statistics* stats = nullptr,
const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
Statistics* stats = nullptr,
const std::atomic<bool>* shutting_down = nullptr);
// Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
@ -145,6 +147,7 @@ class MergeHelper {
bool assert_valid_internal_key_; // enforce no internal key corruption?
bool allow_single_operand_;
SequenceNumber latest_snapshot_;
const SnapshotChecker* const snapshot_checker_;
int level_;
// the scratch area that holds the result of MergeUntil