WritePrepared: Fix SmallestUnCommittedSeq bug (#5683)
Summary: SmallestUnCommittedSeq reads two data structures, prepared_txns_ and delayed_prepared_. These two are updated in CheckPreparedAgainstMax when max_evicted_seq_ advances some prepared entires. To avoid the cost of acquiring a mutex, the read from them in SmallestUnCommittedSeq is not atomic. This creates a potential race condition. The fix is to read the two data structures in the reverse order of their update. CheckPreparedAgainstMax copies the prepared entry to delayed_prepared_ before removing it from prepared_txns_ and SmallestUnCommittedSeq looks into prepared_txns_ before reading delayed_prepared_. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5683 Differential Revision: D16744699 Pulled By: maysamyabandeh fbshipit-source-id: b1bdb134018beb0b9de58827f512662bea35cad0
This commit is contained in:
parent
5d9a67e718
commit
12eaacb71d
@ -1575,6 +1575,68 @@ TEST_P(WritePreparedTransactionTest, AdvanceMaxEvictedSeqWithDuplicatesTest) {
|
|||||||
delete txn0;
|
delete txn0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stress SmallestUnCommittedSeq, which reads from both prepared_txns_ and
|
||||||
|
// delayed_prepared_, when is run concurrently with advancing max_evicted_seq,
|
||||||
|
// which moves prepared txns from prepared_txns_ to delayed_prepared_.
|
||||||
|
TEST_P(WritePreparedTransactionTest, SmallestUnCommittedSeq) {
|
||||||
|
const size_t snapshot_cache_bits = 7; // same as default
|
||||||
|
const size_t commit_cache_bits = 1; // disable commit cache
|
||||||
|
UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits);
|
||||||
|
ReOpen();
|
||||||
|
WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
|
||||||
|
ReadOptions ropt;
|
||||||
|
PinnableSlice pinnable_val;
|
||||||
|
WriteOptions write_options;
|
||||||
|
TransactionOptions txn_options;
|
||||||
|
std::vector<Transaction*> txns, committed_txns;
|
||||||
|
|
||||||
|
const int cnt = 100;
|
||||||
|
for (int i = 0; i < cnt; i++) {
|
||||||
|
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
||||||
|
ASSERT_OK(txn->SetName("xid" + ToString(i)));
|
||||||
|
auto key = "key1" + ToString(i);
|
||||||
|
auto value = "value1" + ToString(i);
|
||||||
|
ASSERT_OK(txn->Put(Slice(key), Slice(value)));
|
||||||
|
ASSERT_OK(txn->Prepare());
|
||||||
|
txns.push_back(txn);
|
||||||
|
}
|
||||||
|
|
||||||
|
port::Mutex mutex;
|
||||||
|
Random rnd(1103);
|
||||||
|
rocksdb::port::Thread commit_thread([&]() {
|
||||||
|
for (int i = 0; i < cnt; i++) {
|
||||||
|
uint32_t index = rnd.Uniform(cnt - i);
|
||||||
|
Transaction* txn;
|
||||||
|
{
|
||||||
|
MutexLock l(&mutex);
|
||||||
|
txn = txns[index];
|
||||||
|
txns.erase(txns.begin() + index);
|
||||||
|
}
|
||||||
|
// Since commit cahce is practically disabled, commit results in immediate
|
||||||
|
// advance in max_evicted_seq_ and subsequently moving some prepared txns
|
||||||
|
// to delayed_prepared_.
|
||||||
|
txn->Commit();
|
||||||
|
committed_txns.push_back(txn);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
rocksdb::port::Thread read_thread([&]() {
|
||||||
|
while (1) {
|
||||||
|
MutexLock l(&mutex);
|
||||||
|
if (txns.empty()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto min_uncommitted = wp_db->SmallestUnCommittedSeq();
|
||||||
|
ASSERT_LE(min_uncommitted, (*txns.begin())->GetId());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
commit_thread.join();
|
||||||
|
read_thread.join();
|
||||||
|
for (auto txn : committed_txns) {
|
||||||
|
delete txn;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) {
|
TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) {
|
||||||
// Given the sequential run of txns, with this timeout we should never see a
|
// Given the sequential run of txns, with this timeout we should never see a
|
||||||
// deadlock nor a timeout unless we have a key conflict, which should be
|
// deadlock nor a timeout unless we have a key conflict, which should be
|
||||||
|
@ -432,8 +432,12 @@ void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
|
|||||||
" new_max=%" PRIu64,
|
" new_max=%" PRIu64,
|
||||||
static_cast<uint64_t>(delayed_prepared_.size()),
|
static_cast<uint64_t>(delayed_prepared_.size()),
|
||||||
to_be_popped, new_max);
|
to_be_popped, new_max);
|
||||||
prepared_txns_.pop();
|
|
||||||
delayed_prepared_empty_.store(false, std::memory_order_release);
|
delayed_prepared_empty_.store(false, std::memory_order_release);
|
||||||
|
// Update prepared_txns_ after updating delayed_prepared_empty_ otherwise
|
||||||
|
// there will be a point in time that the entry is neither in
|
||||||
|
// prepared_txns_ nor in delayed_prepared_, which will not be checked if
|
||||||
|
// delayed_prepared_empty_ is false.
|
||||||
|
prepared_txns_.pop();
|
||||||
}
|
}
|
||||||
if (locked) {
|
if (locked) {
|
||||||
prepared_txns_.push_pop_mutex()->Lock();
|
prepared_txns_.push_pop_mutex()->Lock();
|
||||||
|
@ -500,6 +500,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
|||||||
friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
|
friend class WritePreparedTransactionTest_NonAtomicUpdateOfMaxEvictedSeq_Test;
|
||||||
friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
|
friend class WritePreparedTransactionTest_OldCommitMapGC_Test;
|
||||||
friend class WritePreparedTransactionTest_RollbackTest_Test;
|
friend class WritePreparedTransactionTest_RollbackTest_Test;
|
||||||
|
friend class WritePreparedTransactionTest_SmallestUnCommittedSeq_Test;
|
||||||
friend class WriteUnpreparedTxn;
|
friend class WriteUnpreparedTxn;
|
||||||
friend class WriteUnpreparedTxnDB;
|
friend class WriteUnpreparedTxnDB;
|
||||||
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
|
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
|
||||||
@ -626,6 +627,19 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
|||||||
const SequenceNumber& new_max);
|
const SequenceNumber& new_max);
|
||||||
|
|
||||||
inline SequenceNumber SmallestUnCommittedSeq() {
|
inline SequenceNumber SmallestUnCommittedSeq() {
|
||||||
|
// Note: We have two lists to look into, but for performance reasons they
|
||||||
|
// are not read atomically. Since CheckPreparedAgainstMax copies the entry
|
||||||
|
// to delayed_prepared_ before removing it from prepared_txns_, to ensure
|
||||||
|
// that a prepared entry will not go unmissed, we look into them in opposite
|
||||||
|
// order: first read prepared_txns_ and then delayed_prepared_.
|
||||||
|
|
||||||
|
// This must be called before calling ::top. This is because the concurrent
|
||||||
|
// thread would call ::RemovePrepared before updating
|
||||||
|
// GetLatestSequenceNumber(). Reading then in opposite order here guarantees
|
||||||
|
// that the ::top that we read would be lower the ::top if we had otherwise
|
||||||
|
// update/read them atomically.
|
||||||
|
auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
|
||||||
|
auto min_prepare = prepared_txns_.top();
|
||||||
// Since we update the prepare_heap always from the main write queue via
|
// Since we update the prepare_heap always from the main write queue via
|
||||||
// PreReleaseCallback, the prepared_txns_.top() indicates the smallest
|
// PreReleaseCallback, the prepared_txns_.top() indicates the smallest
|
||||||
// prepared data in 2pc transactions. For non-2pc transactions that are
|
// prepared data in 2pc transactions. For non-2pc transactions that are
|
||||||
@ -638,13 +652,6 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
|||||||
return *delayed_prepared_.begin();
|
return *delayed_prepared_.begin();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// This must be called before calling ::top. This is because the concurrent
|
|
||||||
// thread would call ::RemovePrepared before updating
|
|
||||||
// GetLatestSequenceNumber(). Reading then in opposite order here guarantees
|
|
||||||
// that the ::top that we read would be lower the ::top if we had otherwise
|
|
||||||
// update/read them atomically.
|
|
||||||
auto next_prepare = db_impl_->GetLatestSequenceNumber() + 1;
|
|
||||||
auto min_prepare = prepared_txns_.top();
|
|
||||||
bool empty = min_prepare == kMaxSequenceNumber;
|
bool empty = min_prepare == kMaxSequenceNumber;
|
||||||
if (empty) {
|
if (empty) {
|
||||||
// Since GetLatestSequenceNumber is updated
|
// Since GetLatestSequenceNumber is updated
|
||||||
|
Loading…
x
Reference in New Issue
Block a user