rocksdb/db/read_callback.h
Maysam Yabandeh fe642cbee6 WritePrepared: fix race condition in reading batch with duplicate keys (#5147)
Summary:
When ReadOption doesn't specify a snapshot, WritePrepared::Get used kMaxSequenceNumber to avoid the cost of creating a new snapshot object (which requires synchronizing on db_mutex). This creates a race condition if it is reading from the writes of a transaction that had duplicate keys: each instance of a duplicate key is inserted with a different sequence number, and depending on the ordering, ::Get might skip the newer one and read the older one, which is obsolete.
The patch fixes that by using the last published seq as the snapshot sequence number. It also adds a check after the read is done to ensure that max_evicted_seq has not advanced past the aforementioned seq, which is a very unlikely event. If it did, then the read is not valid, since the seq is not backed by an actual snapshot that would let IsInSnapshot handle that properly when an overlapping commit is evicted from the commit cache.
A unit test is added to reproduce the race condition with duplicate keys.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5147

Differential Revision: D14758815

Pulled By: maysamyabandeh

fbshipit-source-id: a56915657132cf6ba5e3f5ea1b5d78c803407719
2019-04-12 14:40:41 -07:00

57 lines
1.8 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <cassert>
#include "rocksdb/types.h"
namespace rocksdb {
// Abstract hook that decides whether an entry with a given sequence number is
// visible to a reader. Subclasses supply IsVisibleFullCheck(); IsVisible()
// wraps it with two cheap range comparisons so that the full check only runs
// for sequence numbers in [min_uncommitted_, max_visible_seq_].
class ReadCallback {
 public:
  // Creates a callback whose upper visibility bound is last_visible_seq.
  // min_uncommitted_ keeps its default, kMinUnCommittedSeq.
  // Marked explicit so a bare SequenceNumber is never implicitly converted.
  explicit ReadCallback(SequenceNumber last_visible_seq)
      : max_visible_seq_(last_visible_seq) {}

  // Creates a callback with both bounds given: last_visible_seq is the upper
  // visibility bound and min_uncommitted is the bound below which every seq is
  // treated as committed (and therefore visible) without a full check.
  ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
      : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}

  virtual ~ReadCallback() = default;

  // Will be called to see if the seq number visible; if not it moves on to
  // the next seq number.
  virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;

  // Returns whether seq is visible.
  //   seq <  min_uncommitted_  -> true, without calling the full check
  //   seq >  max_visible_seq_  -> false, without calling the full check
  //   otherwise                -> whatever IsVisibleFullCheck(seq) returns
  inline bool IsVisible(SequenceNumber seq) {
    assert(min_uncommitted_ > 0);
    assert(min_uncommitted_ >= kMinUnCommittedSeq);
    if (seq < min_uncommitted_) {  // handles seq == 0 as well
      assert(seq <= max_visible_seq_);
      return true;
    }
    if (max_visible_seq_ < seq) {
      assert(seq != 0);
      return false;
    }
    assert(seq != 0);  // already handled in the first if clause
    return IsVisibleFullCheck(seq);
  }

  // Current upper visibility bound. const so that it can be read through a
  // const reference or pointer to the callback.
  inline SequenceNumber max_visible_seq() const { return max_visible_seq_; }

  // Refresh to a more recent visible seq
  virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }

  // Refer to DBIter::CanReseekToSkip
  virtual bool CanReseekToSkip() { return true; }

 protected:
  // The max visible seq, it is usually the snapshot but could be larger if
  // transaction has its own writes written to db.
  SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
  // Any seq less than min_uncommitted_ is committed.
  const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
};
} // namespace rocksdb