Delete non-visible keys during a compaction even in the presense of snapshots.

Summary:
 LevelDB should delete almost-new keys when a long-open snapshot exists.
The previous behavior is to keep all versions that were created after the
oldest open snapshot. This can lead to database size bloat for
high-update workloads when there are long-open snapshots and long-open
snapshot will be used for logical backup. By "almost new" I mean that the
key was updated more than once after the oldest snapshot.

If there were two snapshots with seq numbers s1 and s2 (s1 < s2), and if
we find two instances of the same key k1 that lie entirely within s1 and
s2 (i.e. s1 < k1 < s2), then the earlier version
of k1 can be safely deleted because that version is not visible in any snapshot.

Test Plan:
unit test attached
make clean check

Differential Revision: https://reviews.facebook.net/D6999
This commit is contained in:
Dhruba Borthakur 2012-11-26 21:16:21 -08:00
parent 34487af458
commit 9a357847eb
4 changed files with 138 additions and 17 deletions

View File

@ -85,11 +85,11 @@ struct DBImpl::Writer {
struct DBImpl::CompactionState {
Compaction* const compaction;
// Sequence numbers < smallest_snapshot are not significant since we
// will never have to service a snapshot below smallest_snapshot.
// Therefore if we have seen a sequence number S <= smallest_snapshot,
// we can drop all entries for the same key with sequence numbers < S.
SequenceNumber smallest_snapshot;
// If there were two snapshots with seq numbers s1 and
// s2 and s1 < s2, and if we find two instances of a key k1 then lies
// entirely within s1 and s2, then the earlier version of k1 can be safely
// deleted because that version is not visible in any snapshot.
std::vector<SequenceNumber> existing_snapshots;
// Files produced by compaction
struct Output {
@ -1262,6 +1262,32 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
}
//
// Given a sequence number, return the sequence number of the
// earliest snapshot that this sequence number is visible in.
// The snapshots themselves are arranged in ascending order of
// sequence numbers.
// Employ a sequential search because the total number of
// snapshots are typically small.
inline SequenceNumber DBImpl::findEarliestVisibleSnapshot(
SequenceNumber in, std::vector<SequenceNumber>& snapshots) {
SequenceNumber prev;
prev = 0;
for (std::vector<SequenceNumber>::iterator it = snapshots.begin();
it < snapshots.end(); it++) {
assert (prev <= *it);
if (*it >= in) {
return *it;
}
assert(prev = *it); // assignment
}
Log(options_.info_log,
"Looking for seqid %ld but maxseqid is %ld", in,
snapshots[snapshots.size()-1]);
assert(0);
return 0;
}
Status DBImpl::DoCompactionWork(CompactionState* compact) {
int64_t imm_micros = 0; // Micros spent doing imm_ compactions
@ -1279,10 +1305,19 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
assert(compact->builder == NULL);
assert(compact->outfile == NULL);
if (snapshots_.empty()) {
compact->smallest_snapshot = versions_->LastSequence();
SequenceNumber visible_at_tip = 0;
SequenceNumber earliest_snapshot;
snapshots_.getAll(compact->existing_snapshots);
if (compact->existing_snapshots.size() == 0) {
// optimize for fast path if there are no snapshots
visible_at_tip = versions_->LastSequence();
earliest_snapshot = visible_at_tip;
} else {
compact->smallest_snapshot = snapshots_.oldest()->number_;
// Add the current seqno as the 'latest' virtual
// snapshot to the end of this list.
compact->existing_snapshots.push_back(versions_->LastSequence());
earliest_snapshot = compact->existing_snapshots[0];
}
// Allocate the output file numbers before we release the lock
@ -1299,6 +1334,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
std::string current_user_key;
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
// Prioritize immutable compaction work
if (imm_.imm_flush_needed.NoBarrier_Load() != NULL) {
@ -1330,6 +1366,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
current_user_key.clear();
has_current_user_key = false;
last_sequence_for_key = kMaxSequenceNumber;
visible_in_snapshot = kMaxSequenceNumber;
} else {
if (!has_current_user_key ||
user_comparator()->Compare(ikey.user_key,
@ -1338,14 +1375,26 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
has_current_user_key = true;
last_sequence_for_key = kMaxSequenceNumber;
visible_in_snapshot = kMaxSequenceNumber;
}
if (last_sequence_for_key <= compact->smallest_snapshot) {
// If there are no snapshots, then this kv affect visibility at tip.
// Otherwise, search though all existing snapshots to find
// the earlist snapshot that is affected by this kv.
SequenceNumber visible = visible_at_tip ? visible_at_tip :
findEarliestVisibleSnapshot(ikey.sequence,
compact->existing_snapshots);
if (visible_in_snapshot == visible) {
// If the earliest snapshot is which this key is visible in
// is the same as the visibily of a previous instance of the
// same key, then this kv is not visible in any snapshot.
// Hidden by an newer entry for same user key
assert(last_sequence_for_key >= ikey.sequence);
drop = true; // (A)
RecordTick(options_.statistics, COMPACTION_KEY_DROP_NEWER_ENTRY);
} else if (ikey.type == kTypeDeletion &&
ikey.sequence <= compact->smallest_snapshot &&
ikey.sequence <= earliest_snapshot &&
compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
// For this user key:
// (1) there is no data in higher levels
@ -1358,7 +1407,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
RecordTick(options_.statistics, COMPACTION_KEY_DROP_OBSOLETE);
} else if (options_.CompactionFilter != NULL &&
ikey.type != kTypeDeletion &&
ikey.sequence < compact->smallest_snapshot) {
ikey.sequence < earliest_snapshot) {
// If the user has specified a compaction filter, then invoke
// it. If this key is not visible via any snapshot and the
// return value of the compaction filter is true and then
@ -1378,6 +1427,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
}
last_sequence_for_key = ikey.sequence;
visible_in_snapshot = visible;
}
#if 0
Log(options_.info_log,

View File

@ -303,6 +303,10 @@ protected:
// dump the delayed_writes_ to the log file and reset counter.
void DelayLoggingAndReset();
// find the earliest snapshot where seqno is visible
inline SequenceNumber findEarliestVisibleSnapshot(SequenceNumber in,
std::vector<SequenceNumber>& snapshots);
};
// Sanitize db options. The caller should delete result.info_log if

View File

@ -1051,8 +1051,7 @@ TEST(DBTest, CompactionTrigger) {
for (int num = 0;
num < options.level0_file_num_compaction_trigger - 1;
num++)
{
num++) {
std::vector<std::string> values;
// Write 120KB (12 values, each 10K)
for (int i = 0; i < 12; i++) {
@ -1594,6 +1593,59 @@ TEST(DBTest, HiddenValuesAreRemoved) {
} while (ChangeOptions());
}
TEST(DBTest, CompactBetweenSnapshots) {
do {
Random rnd(301);
FillLevels("a", "z");
Put("foo", "first");
const Snapshot* snapshot1 = db_->GetSnapshot();
Put("foo", "second");
Put("foo", "third");
Put("foo", "fourth");
const Snapshot* snapshot2 = db_->GetSnapshot();
Put("foo", "fifth");
Put("foo", "sixth");
// All entries (including duplicates) exist
// before any compaction is triggered.
ASSERT_OK(dbfull()->TEST_CompactMemTable());
ASSERT_EQ("sixth", Get("foo"));
ASSERT_EQ("fourth", Get("foo", snapshot2));
ASSERT_EQ("first", Get("foo", snapshot1));
ASSERT_EQ(AllEntriesFor("foo"),
"[ sixth, fifth, fourth, third, second, first ]");
// After a compaction, "second", "third" and "fifth" should
// be removed
FillLevels("a", "z");
dbfull()->CompactRange(NULL, NULL);
ASSERT_EQ("sixth", Get("foo"));
ASSERT_EQ("fourth", Get("foo", snapshot2));
ASSERT_EQ("first", Get("foo", snapshot1));
ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth, first ]");
// after we release the snapshot1, only two values left
db_->ReleaseSnapshot(snapshot1);
FillLevels("a", "z");
dbfull()->CompactRange(NULL, NULL);
// We have only one valid snapshot snapshot2. Since snapshot1 is
// not valid anymore, "first" should be removed by a compaction.
ASSERT_EQ("sixth", Get("foo"));
ASSERT_EQ("fourth", Get("foo", snapshot2));
ASSERT_EQ(AllEntriesFor("foo"), "[ sixth, fourth ]");
// after we release the snapshot2, only one value should be left
db_->ReleaseSnapshot(snapshot2);
FillLevels("a", "z");
dbfull()->CompactRange(NULL, NULL);
ASSERT_EQ("sixth", Get("foo"));
ASSERT_EQ(AllEntriesFor("foo"), "[ sixth ]");
} while (ChangeOptions());
}
TEST(DBTest, DeletionMarkers1) {
Put("foo", "v1");
ASSERT_OK(dbfull()->TEST_CompactMemTable());

View File

@ -32,6 +32,7 @@ class SnapshotList {
SnapshotList() {
list_.prev_ = &list_;
list_.next_ = &list_;
list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
}
bool empty() const { return list_.next_ == &list_; }
@ -56,6 +57,20 @@ class SnapshotList {
delete s;
}
// retrieve all snapshot numbers. They are sorted in ascending order.
void getAll(std::vector<SequenceNumber>& ret) {
SnapshotImpl* s = &list_;
SequenceNumber prev;
prev = 0;
if (empty()) return;
while (s->next_ != &list_) {
assert(prev <= s->next_->number_);
assert(prev = s->next_->number_); // assignment
ret.push_back(s->next_->number_);
s = s ->next_;
}
}
private:
// Dummy head of doubly-linked list of snapshots
SnapshotImpl list_;