diff --git a/HISTORY.md b/HISTORY.md index 319906a6c..7cf8749f3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * EventListener::OnTableFileCreated was previously called with OK status and file_size==0 in cases of no SST file contents written (because there was no content to add) and the empty file deleted before calling the listener. Now the status is Aborted. * Fixed a bug in CompactionIterator when write-preared transaction is used. Releasing earliest_snapshot during compaction may cause a SingleDelete to be output after a PUT of the same user key whose seq has been zeroed. * Added input sanitization on negative bytes passed into `GenericRateLimiter::Request`. +* Fixed an assertion failure in CompactionIterator when write-prepared transaction is used. We prove that certain operations can lead to a Delete being followed by a SingleDelete (same user key). We can drop the SingleDelete. ### Behavior Changes * `NUM_FILES_IN_SINGLE_COMPACTION` was only counting the first input level files, now it's including all input files. diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index c4434bca2..002f2f839 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -608,11 +608,19 @@ void CompactionIterator::NextFromInput() { TEST_SYNC_POINT_CALLBACK( "CompactionIterator::NextFromInput:SingleDelete:2", nullptr); - if (next_ikey.type == kTypeSingleDeletion) { + if (next_ikey.type == kTypeSingleDeletion || + next_ikey.type == kTypeDeletion) { // We encountered two SingleDeletes for same key in a row. This - // could be due to unexpected user input. Skip the first - // SingleDelete and let the next iteration decide how to handle the - // second SingleDelete + // could be due to unexpected user input. If write-(un)prepared + // transaction is used, this could also be due to releasing an old + // snapshot between a Put and its matching SingleDelete. + // Furthermore, if write-(un)prepared transaction is rolled back + // after prepare, we will write a Delete to cancel a prior Put. If + // old snapshot is released between a later Put and its matching + // SingleDelete, we will end up with a Delete followed by + // SingleDelete. + // Skip the first SingleDelete and let the next iteration decide + // how to handle the second SingleDelete or Delete. // First SingleDelete has been skipped since we already called // input_.Next(). diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index c90d34b1a..62acbf1f5 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -3199,6 +3199,82 @@ TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotAfterSeqZeroing2) { SyncPoint::GetInstance()->ClearAllCallBacks(); } +// Although the user-contract indicates that a SD can only be issued for a key +// that exists and has not been overwritten, it is still possible for a Delete +// to be present when write-prepared transaction is rolled back. +TEST_P(WritePreparedTransactionTest, SingleDeleteAfterRollback) { + constexpr size_t kSnapshotCacheBits = 7; // same as default + constexpr size_t kCommitCacheBits = 0; // minimum commit cache + UpdateTransactionDBOptions(kSnapshotCacheBits, kCommitCacheBits); + options.disable_auto_compactions = true; + ASSERT_OK(ReOpen()); + + // Get a write conflict snapshot by creating a transaction with + // set_snapshot=true. + TransactionOptions txn_opts; + txn_opts.set_snapshot = true; + std::unique_ptr dummy_txn( + db->BeginTransaction(WriteOptions(), txn_opts)); + + std::unique_ptr txn0( + db->BeginTransaction(WriteOptions(), TransactionOptions())); + ASSERT_OK(txn0->Put("foo", "value")); + ASSERT_OK(txn0->SetName("xid0")); + ASSERT_OK(txn0->Prepare()); + + // Create an SST with only {"foo": "value"}. + ASSERT_OK(db->Flush(FlushOptions())); + + // Insert a Delete to cancel out the prior Put by txn0. + ASSERT_OK(txn0->Rollback()); + txn0.reset(); + + // Create a second SST. + ASSERT_OK(db->Flush(FlushOptions())); + + ASSERT_OK(db->Put(WriteOptions(), "foo", "value1")); + + auto* snapshot = db->GetSnapshot(); + + ASSERT_OK(db->SingleDelete(WriteOptions(), "foo")); + + int count = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::NextFromInput:SingleDelete:1", [&](void* arg) { + const auto* const c = reinterpret_cast(arg); + assert(!c); + // Trigger once only for SingleDelete during flush. + if (0 == count) { + ++count; + db->ReleaseSnapshot(snapshot); + // Bump max_evicted_seq + ASSERT_OK(db->Put(WriteOptions(), "x", "dontcare")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Create a third SST containing a SD without its matching PUT. + ASSERT_OK(db->Flush(FlushOptions())); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + DBImpl* dbimpl = static_cast_with_check(db->GetRootDB()); + assert(dbimpl); + ASSERT_OK(dbimpl->TEST_CompactRange( + /*level=*/0, /*begin=*/nullptr, /*end=*/nullptr, + /*column_family=*/nullptr, /*disallow_trivial_mode=*/true)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Release the conflict-checking snapshot. + ASSERT_OK(dummy_txn->Rollback()); +} + // A more complex test to verify compaction/flush should keep keys visible // to snapshots. TEST_P(WritePreparedTransactionTest,