WritePrepared Txn: fix bug with Rollback seq
Summary: The sequence number was not properly advanced after a rollback marker. The patch extends the existing unit tests to detect the bug and also fixes it. Closes https://github.com/facebook/rocksdb/pull/3157 Differential Revision: D6304291 Pulled By: maysamyabandeh fbshipit-source-id: 1b519c44a5371b802da49c9e32bd00087a8da401
This commit is contained in:
parent
175d5d6a9e
commit
53863b76f9
@ -988,6 +988,16 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
|
||||
virtual bool WriterAfterCommit() const { return write_after_commit_; }
|
||||
|
||||
// The batch seq is regularly restarted; In normal mode it is set when
|
||||
// MemTableInserter is constructed in the write thread and in recovery mode it
|
||||
// is set when a batch, which is tagged with seq, is read from the WAL.
|
||||
// Within a sequenced batch, which could be a merge of multiple batches, we
|
||||
// have two policies to advance the seq: i) seq_per_key (default) and ii)
|
||||
// seq_per_batch. To implement the latter we need to mark the boundry between
|
||||
// the individual batches. The approach is this: 1) Use the terminating
|
||||
// markers to indicate the boundry (kTypeEndPrepareXID, kTypeCommitXID,
|
||||
// kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absense of a
|
||||
// natural boundy marker.
|
||||
void MaybeAdvanceSeq(bool batch_boundry = false) {
|
||||
if (batch_boundry == seq_per_batch_) {
|
||||
sequence_++;
|
||||
@ -1430,6 +1440,9 @@ class MemTableInserter : public WriteBatch::Handler {
|
||||
// in non recovery we simply ignore this tag
|
||||
}
|
||||
|
||||
const bool batch_boundry = true;
|
||||
MaybeAdvanceSeq(batch_boundry);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -924,6 +924,7 @@ class DB {
|
||||
// Retrieve the sorted list of all wal files with earliest file first
|
||||
virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
|
||||
|
||||
// Note: this API is not yet consistent with WritePrepared transactions.
|
||||
// Sets iter to an iterator that is positioned at a write-batch containing
|
||||
// seq_number. If the sequence number is non existent, it returns an iterator
|
||||
// at the first available seq_no after the requested seq_no
|
||||
|
@ -4837,6 +4837,8 @@ TEST_P(TransactionTest, MemoryLimitTest) {
|
||||
// necessarily the one acceptable way. If the algorithm is legitimately changed,
|
||||
// this unit test should be updated as well.
|
||||
TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
// TODO(myabandeh): must be test with false before new releases
|
||||
const bool short_test = true;
|
||||
WriteOptions wopts;
|
||||
FlushOptions fopt;
|
||||
|
||||
@ -4846,7 +4848,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
// Do the test with NUM_BRANCHES branches in it. Each run of a test takes some
|
||||
// of the branches. This is the same as counting a binary number where i-th
|
||||
// bit represents whether we take branch i in the represented by the number.
|
||||
const size_t NUM_BRANCHES = 8;
|
||||
const size_t NUM_BRANCHES = short_test ? 6 : 10;
|
||||
// Helper function that shows if the branch is to be taken in the run
|
||||
// represented by the number n.
|
||||
auto branch_do = [&](size_t n, size_t* branch) {
|
||||
@ -4869,7 +4871,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
if (branch_do(n, &branch)) {
|
||||
if (!short_test && branch_do(n, &branch)) {
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
@ -4891,7 +4893,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
if (branch_do(n, &branch)) {
|
||||
if (!short_test && branch_do(n, &branch)) {
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
@ -4908,7 +4910,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
if (branch_do(n, &branch)) {
|
||||
if (!short_test && branch_do(n, &branch)) {
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
@ -4916,10 +4918,24 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
|
||||
txn_t0(0);
|
||||
txn_t4(0);
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
|
||||
if (branch_do(n, &branch)) {
|
||||
db_impl->Flush(fopt);
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
if (!short_test && branch_do(n, &branch)) {
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
seq = db_impl->GetLatestSequenceNumber();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
|
||||
txn_t2(0);
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
@ -4929,7 +4945,7 @@ TEST_P(TransactionTest, SeqAdvanceTest) {
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
}
|
||||
if (branch_do(n, &branch)) {
|
||||
if (!short_test && branch_do(n, &branch)) {
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
|
@ -236,6 +236,41 @@ class TransactionTest : public ::testing::TestWithParam<
|
||||
}
|
||||
delete txn;
|
||||
};
|
||||
std::function<void(size_t)> txn_t4 = [&](size_t index) {
|
||||
// A full 2pc txn that also involves a commit marker.
|
||||
TransactionOptions txn_options;
|
||||
WriteOptions write_options;
|
||||
Transaction* txn = db->BeginTransaction(write_options, txn_options);
|
||||
auto istr = std::to_string(index);
|
||||
auto s = txn->SetName("xid" + istr);
|
||||
ASSERT_OK(s);
|
||||
s = txn->Put(Slice("foo" + istr), Slice("bar"));
|
||||
s = txn->Put(Slice("foo2" + istr), Slice("bar2"));
|
||||
s = txn->Put(Slice("foo3" + istr), Slice("bar3"));
|
||||
s = txn->Put(Slice("foo4" + istr), Slice("bar4"));
|
||||
s = txn->Put(Slice("foo5" + istr), Slice("bar5"));
|
||||
ASSERT_OK(s);
|
||||
expected_commits++;
|
||||
s = txn->Prepare();
|
||||
ASSERT_OK(s);
|
||||
commit_writes++;
|
||||
s = txn->Rollback();
|
||||
ASSERT_OK(s);
|
||||
if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
|
||||
// No seq is consumed for deleting the txn buffer
|
||||
exp_seq += 0;
|
||||
} else {
|
||||
// Consume one seq per batch
|
||||
exp_seq++;
|
||||
// Consume one seq per rollback batch
|
||||
exp_seq++;
|
||||
if (options.two_write_queues) {
|
||||
// Consume one seq for rollback commit
|
||||
exp_seq++;
|
||||
}
|
||||
}
|
||||
delete txn;
|
||||
};
|
||||
|
||||
// Test that we can change write policy after a clean shutdown (which would
|
||||
// empty the WAL)
|
||||
|
@ -605,12 +605,13 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
|
||||
FlushOptions fopt;
|
||||
|
||||
// Number of different txn types we use in this test
|
||||
const size_t type_cnt = 4;
|
||||
const size_t type_cnt = 5;
|
||||
// The size of the first write group
|
||||
// TODO(myabandeh): This should be increase for pre-release tests
|
||||
const size_t first_group_size = 2;
|
||||
// Total number of txns we run in each test
|
||||
const size_t txn_cnt = first_group_size * 2;
|
||||
// TODO(myabandeh): This should be increase for pre-release tests
|
||||
const size_t txn_cnt = first_group_size + 1;
|
||||
|
||||
size_t base[txn_cnt + 1] = {
|
||||
1,
|
||||
@ -675,6 +676,9 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
|
||||
case 3:
|
||||
threads.emplace_back(txn_t3, bi);
|
||||
break;
|
||||
case 4:
|
||||
threads.emplace_back(txn_t3, bi);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
@ -710,16 +714,30 @@ TEST_P(WritePreparedTransactionTest, SeqAdvanceConcurrentTest) {
|
||||
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
|
||||
// The latest seq might be due to a commit without prepare and hence not
|
||||
// persisted in the WAL. To make the verification of seq after recovery
|
||||
// easier we write in a transaction with prepare which makes the latest seq
|
||||
// to be persisted via the commitmarker.
|
||||
txn_t3(0);
|
||||
// persisted in the WAL. We need to discount such seqs if they are not
|
||||
// continued by any seq consued by a value write.
|
||||
if (options.two_write_queues) {
|
||||
WritePreparedTxnDB* wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
|
||||
MutexLock l(&wp_db->seq_for_metadata_mutex_);
|
||||
auto& vec = wp_db->seq_for_metadata;
|
||||
std::sort(vec.begin(), vec.end());
|
||||
// going backward discount any last seq consumed for metadata until we see
|
||||
// a seq that is consumed for actualy key/values.
|
||||
auto rit = vec.rbegin();
|
||||
for (; rit != vec.rend(); ++rit) {
|
||||
if (*rit == exp_seq) {
|
||||
exp_seq--;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if recovery preserves the last sequence number
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
seq = db_impl->GetLatestSequenceNumber();
|
||||
seq = db_impl->TEST_GetLastVisibleSequence();
|
||||
ASSERT_EQ(exp_seq, seq);
|
||||
|
||||
// Check if flush preserves the last sequence number
|
||||
@ -1134,25 +1152,18 @@ TEST_P(WritePreparedTransactionTest, RollbackTest) {
|
||||
ASSERT_SAME(db, s4, v4, "key4");
|
||||
|
||||
if (crash) {
|
||||
// TODO(myabandeh): replace it with true crash (commented lines below)
|
||||
// after compaction PR is landed.
|
||||
delete txn;
|
||||
auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
auto seq = db_impl->GetLatestSequenceNumber();
|
||||
db_impl->FlushWAL(true);
|
||||
ReOpenNoDelete();
|
||||
wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
|
||||
SequenceNumber prev_max = wp_db->max_evicted_seq_;
|
||||
wp_db->AdvanceMaxEvictedSeq(prev_max, seq);
|
||||
// delete txn;
|
||||
// auto db_impl = reinterpret_cast<DBImpl*>(db->GetRootDB());
|
||||
// db_impl->FlushWAL(true);
|
||||
// ReOpenNoDelete();
|
||||
// wp_db = dynamic_cast<WritePreparedTxnDB*>(db);
|
||||
// txn = db->GetTransactionByName("xid0");
|
||||
// ASSERT_FALSE(wp_db->delayed_prepared_empty_);
|
||||
// ReadLock rl(&wp_db->prepared_mutex_);
|
||||
// ASSERT_TRUE(wp_db->prepared_txns_.empty());
|
||||
// ASSERT_FALSE(wp_db->delayed_prepared_.empty());
|
||||
// ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
|
||||
// wp_db->delayed_prepared_.end());
|
||||
txn = db->GetTransactionByName("xid0");
|
||||
ASSERT_FALSE(wp_db->delayed_prepared_empty_);
|
||||
ReadLock rl(&wp_db->prepared_mutex_);
|
||||
ASSERT_TRUE(wp_db->prepared_txns_.empty());
|
||||
ASSERT_FALSE(wp_db->delayed_prepared_.empty());
|
||||
ASSERT_TRUE(wp_db->delayed_prepared_.find(txn->GetId()) !=
|
||||
wp_db->delayed_prepared_.end());
|
||||
}
|
||||
|
||||
ASSERT_SAME(db, s1, v1, "key1");
|
||||
|
@ -93,7 +93,12 @@ Status WritePreparedTxn::CommitWithoutPrepareInternal() {
|
||||
|
||||
SequenceNumber WritePreparedTxn::GetACommitSeqNumber(SequenceNumber prep_seq) {
|
||||
if (db_impl_->immutable_db_options().two_write_queues) {
|
||||
return db_impl_->IncAndFetchSequenceNumber();
|
||||
auto s = db_impl_->IncAndFetchSequenceNumber();
|
||||
#ifndef NDEBUG
|
||||
MutexLock l(&wpt_db_->seq_for_metadata_mutex_);
|
||||
wpt_db_->seq_for_metadata.push_back(s);
|
||||
#endif
|
||||
return s;
|
||||
} else {
|
||||
return prep_seq;
|
||||
}
|
||||
@ -161,8 +166,6 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
WriteBatch rollback_batch;
|
||||
assert(GetId() != kMaxSequenceNumber);
|
||||
assert(GetId() > 0);
|
||||
// In the absence of Prepare markers, use Noop as a batch separator
|
||||
WriteBatchInternal::InsertNoop(&rollback_batch);
|
||||
// In WritePrepared, the txn is is the same as prepare seq
|
||||
auto last_visible_txn = GetId() - 1;
|
||||
struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
|
||||
@ -227,6 +230,7 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
// The Rollback marker will be used as a batch separator
|
||||
WriteBatchInternal::MarkRollback(&rollback_batch, name_);
|
||||
const bool disable_memtable = true;
|
||||
const uint64_t no_log_ref = 0;
|
||||
|
@ -215,6 +215,7 @@ bool WritePreparedTxnDB::IsInSnapshot(uint64_t prep_seq,
|
||||
}
|
||||
{
|
||||
// We should not normally reach here
|
||||
// TODO(myabandeh): check only if snapshot_seq is in the list of snaphots
|
||||
ReadLock rl(&old_commit_map_mutex_);
|
||||
auto old_commit_entry = old_commit_map_.find(prep_seq);
|
||||
if (old_commit_entry == old_commit_map_.end() ||
|
||||
|
@ -179,6 +179,12 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
// Struct to hold ownership of snapshot and read callback for cleanup.
|
||||
struct IteratorState;
|
||||
|
||||
#ifndef NDEBUG
|
||||
// For unit tests we can track of the seq numbers that are used for metadata as opposed to actual key/values
|
||||
std::vector<uint64_t> seq_for_metadata;
|
||||
mutable port::Mutex seq_for_metadata_mutex_;
|
||||
#endif
|
||||
|
||||
private:
|
||||
friend class WritePreparedTransactionTest_IsInSnapshotTest_Test;
|
||||
friend class WritePreparedTransactionTest_CheckAgainstSnapshotsTest_Test;
|
||||
|
Loading…
Reference in New Issue
Block a user