From 17edc82a4ba3d4455b9fe86632f11e47df4a16e1 Mon Sep 17 00:00:00 2001 From: Nikhil Benesch Date: Tue, 25 Sep 2018 14:48:44 -0700 Subject: [PATCH] Handle tombstones at the same seqno in the CollapsedRangeDelMap (#4424) Summary: The CollapsedRangeDelMap was entirely mishandling tombstones at the same sequence number when the tombstones did not have identical start and end keys. Such tombstones are common since 90fc40690, which causes tombstones to be split during compactions. For example, if the tombstone [a, c) @ 1 lies across a compaction boundary at b, it will be split into [a, b) @ 1 and [b, c) @ 1. Without this patch, the collapsed range deletion map would look like this: a -> 1 b -> 1 c -> 0 Notice how the b -> 1 entry is redundant. When the tombstones overlap, the problem is even worse. Consider tombstones [a, c) @ 1 and [b, d) @ 1, which produces this map without this patch: a -> 1 b -> 1 c -> 0 d -> 0 This map is corrupt, as a map can never contain adjacent sentinel (zero) entries. When the iterator advances from b to c, it will notice that c is a sentinel enty and skip to d--but d is also a sentinel entry! Asking what tombstone this iterator points to will trigger an assertion, as it is not pointing to a valid tombstone. /cc ajkr Pull Request resolved: https://github.com/facebook/rocksdb/pull/4424 Differential Revision: D10039248 Pulled By: abhimadan fbshipit-source-id: 6d737c1e88d60e80cf27286726627ba44463e7f4 --- db/range_del_aggregator.cc | 60 ++++++++++++++++++++++++--------- db/range_del_aggregator_test.cc | 24 +++++++++++++ 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 1854377e4..d1885603c 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -269,22 +269,36 @@ class CollapsedRangeDelMap : public RangeDelMap { // 2: c--- OR 2: c--- OR 2: c--- OR 2: c------ // 1: A--C 1: 1: A------ 1: C------ // ^ ^ ^ ^ - // Insert a new transition at the new tombstone's start point, or raise - // the existing transition at that point to the new tombstone's seqno. end_seq = prev_seq(); - rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + Rep::iterator pit; + if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && + ucmp_->Compare(pit->first, t.start_key_) == 0 && std::prev(pit)->second == t.seq_) { + // The new tombstone starts at the end of an existing tombstone with an + // identical seqno: + // + // 3: + // 2: A--C--- + // 1: + // ^ + // Merge the tombstones by removing the existing tombstone's end key. + it = rep_.erase(std::prev(it)); + } else { + // Insert a new transition at the new tombstone's start point, or raise + // the existing transition at that point to the new tombstone's seqno. + rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry + } } else { // The new tombstone's start point is covered by an existing tombstone: // - // 3: A----- OR 3: C------ - // 2: c--- 2: c------ - // ^ ^ + // 3: A----- OR 3: C------ OR + // 2: c--- 2: c------ 2: C------ + // ^ ^ ^ // Do nothing. } // Look at all the existing transitions that overlap the new tombstone. while (it != rep_.end() && ucmp_->Compare(it->first, t.end_key_) < 0) { - if (t.seq_ > it->second) { + if (t.seq_ >= it->second) { // The transition is to an existing tombstone that the new tombstone // covers. Save the covered tombstone's seqno. We'll need to return to // it if the new tombstone ends before the existing tombstone. @@ -328,15 +342,29 @@ class CollapsedRangeDelMap : public RangeDelMap { } if (t.seq_ == prev_seq()) { - // The new tombstone is unterminated in the map: - // - // 3: OR 3: --G OR 3: --G K-- - // 2: C-------k 2: G---k 2: G---k - // ^ ^ ^ - // End it now, returning to the last seqno we covered. Because end keys - // are exclusive, if there's an existing transition at t.end_key_, it - // takes precedence over the transition that we install here. - rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + // The new tombstone is unterminated in the map. + if (it != rep_.end() && t.seq_ == it->second && ucmp_->Compare(it->first, t.end_key_) == 0) { + // The new tombstone ends at the start of another tombstone with an + // identical seqno. Merge the tombstones by removing the existing + // tombstone's start key. + rep_.erase(it); + } else if (end_seq == prev_seq() || (it != rep_.end() && end_seq == it->second)) { + // The new tombstone is implicitly ended because its end point is + // contained within an existing tombstone with the same seqno: + // + // 2: ---k--N + // ^ + } else { + // The new tombstone needs an explicit end point. + // + // 3: OR 3: --G OR 3: --G K-- + // 2: C-------k 2: G---k 2: G---k + // ^ ^ ^ + // Install one that returns to the last seqno we covered. Because end + // keys are exclusive, if there's an existing transition at t.end_key_, + // it takes precedence over the transition that we install here. + rep_.emplace(t.end_key_, end_seq); // emplace is a noop if existing entry + } } else { // The new tombstone is implicitly ended because its end point is covered // by an existing tombstone with a higher seqno. diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 16b68b7c4..a5746df15 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -208,6 +208,30 @@ TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}); } +TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"a", "b", 5}}, + {{" ", 0}, {"a", 5}, {"b", 0}}, + {{"a", "b", 5}}); +} + +TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { + VerifyRangeDels({{"a", "b", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, + {{"a", "c", 5}}); +} + +TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { + VerifyRangeDels({{"a", "c", 5}, {"b", "d", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + +TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { + VerifyRangeDels({{"a", "d", 5}, {"b", "c", 5}}, + {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, + {{"a", "d", 5}}); +} + // Note the Cover* tests also test cases where tombstones are inserted under a // larger one when VerifyRangeDels() runs them in reverse TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) {