Add rollback_deletion_type_callback to TxnDBOptions (#9873)
Summary: This PR does not affect write-committed. Add a member, `rollback_deletion_type_callback` to TransactionDBOptions so that a write-prepared transaction, when rolling back, can call this callback to decide if a `Delete` or `SingleDelete` should be used to cancel a prior `Put` written to the database during prepare phase. The purpose of this PR is to prevent mixing `Delete` and `SingleDelete` for the same key, causing undefined behaviors. Without this PR, the following can happen: ``` // The application always issues SingleDelete when deleting keys. txn1->Put('a'); txn1->Prepare(); // writes to memtable and potentially gets flushed/compacted to Lmax txn1->Rollback(); // inserts DELETE('a') txn2->Put('a'); txn2->Commit(); // writes to memtable and potentially gets flushed/compacted ``` In the database, we may have ``` L0: [PUT('a', s=100)] L1: [DELETE('a', s=90)] Lmax: [PUT('a', s=0)] ``` If a compaction compacts L0 and L1, then we have ``` L1: [PUT('a', s=100)] Lmax: [PUT('a', s=0)] ``` If a future transaction issues a SingleDelete, we have ``` L0: [SD('a', s=110)] L1: [PUT('a', s=100)] Lmax: [PUT('a', s=0)] ``` Then, a compaction including L0, L1 and Lmax leads to ``` Lmax: [PUT('a', s=0)] ``` which is incorrect. Similar bugs reported and addressed in https://github.com/cockroachdb/pebble/issues/1255. Based on our team's current priority, we have decided to take this approach for now. We may come back and revisit in the future. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9873 Test Plan: make check Reviewed By: ltamasi Differential Revision: D35762170 Pulled By: riversand963 fbshipit-source-id: b28d56eefc786b53c9844b9ef4a7807acdd82c8d
This commit is contained in:
parent
1bac873fcf
commit
d13825e586
@ -3,6 +3,9 @@
|
||||
### New Features
|
||||
* DB::GetLiveFilesStorageInfo is ready for production use.
|
||||
|
||||
### Public API changes
|
||||
* Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions.
|
||||
|
||||
## 7.2.0 (04/15/2022)
|
||||
### Bug Fixes
|
||||
* Fixed bug which caused rocksdb failure in the situation when rocksdb was accessible using UNC path
|
||||
|
@ -222,6 +222,20 @@ struct TransactionDBOptions {
|
||||
// pending writes into the database. A value of 0 or less means no limit.
|
||||
int64_t default_write_batch_flush_threshold = 0;
|
||||
|
||||
// This option is valid only for write-prepared/write-unprepared. Transaction
|
||||
// will rely on this callback to determine if a key should be rolled back
|
||||
// with Delete or SingleDelete when necessary. If the callback returns true,
|
||||
// then SingleDelete should be used. If the callback is not callable or the
|
||||
// callback returns false, then a Delete is used.
|
||||
// The application should ensure thread-safety of this callback.
|
||||
// The callback should not throw because RocksDB is not exception-safe.
|
||||
// The callback may be removed if we allow mixing Delete and SingleDelete in
|
||||
// the future.
|
||||
std::function<bool(TransactionDB* /*db*/,
|
||||
ColumnFamilyHandle* /*column_family*/,
|
||||
const Slice& /*key*/)>
|
||||
rollback_deletion_type_callback;
|
||||
|
||||
private:
|
||||
// 128 entries
|
||||
// Should the default value change, please also update wp_snapshot_cache_bits
|
||||
|
@ -3970,6 +3970,62 @@ TEST_P(WritePreparedTransactionTest, AtomicCommit) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST_P(WritePreparedTransactionTest, BasicRollbackDeletionTypeCb) {
|
||||
options.level0_file_num_compaction_trigger = 2;
|
||||
// Always use SingleDelete to rollback Put.
|
||||
txn_db_options.rollback_deletion_type_callback =
|
||||
[](TransactionDB*, ColumnFamilyHandle*, const Slice&) { return true; };
|
||||
|
||||
const auto write_to_db = [&]() {
|
||||
assert(db);
|
||||
std::unique_ptr<Transaction> txn0(
|
||||
db->BeginTransaction(WriteOptions(), TransactionOptions()));
|
||||
ASSERT_OK(txn0->SetName("txn0"));
|
||||
ASSERT_OK(txn0->Put("a", "v0"));
|
||||
ASSERT_OK(txn0->Prepare());
|
||||
|
||||
// Generate sst1: [PUT('a')]
|
||||
ASSERT_OK(db->Flush(FlushOptions()));
|
||||
|
||||
{
|
||||
CompactRangeOptions cro;
|
||||
cro.change_level = true;
|
||||
cro.target_level = options.num_levels - 1;
|
||||
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
||||
ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
|
||||
}
|
||||
|
||||
ASSERT_OK(txn0->Rollback());
|
||||
txn0.reset();
|
||||
|
||||
ASSERT_OK(db->Put(WriteOptions(), "a", "v1"));
|
||||
|
||||
ASSERT_OK(db->SingleDelete(WriteOptions(), "a"));
|
||||
// Generate another SST with a SD to cover the oldest PUT('a')
|
||||
ASSERT_OK(db->Flush(FlushOptions()));
|
||||
|
||||
auto* dbimpl = static_cast_with_check<DBImpl>(db->GetRootDB());
|
||||
assert(dbimpl);
|
||||
ASSERT_OK(dbimpl->TEST_WaitForCompact());
|
||||
|
||||
{
|
||||
CompactRangeOptions cro;
|
||||
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
||||
ASSERT_OK(db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
|
||||
}
|
||||
|
||||
{
|
||||
std::string value;
|
||||
const Status s = db->Get(ReadOptions(), "a", &value);
|
||||
ASSERT_TRUE(s.IsNotFound());
|
||||
}
|
||||
};
|
||||
|
||||
// Destroy and reopen
|
||||
ASSERT_OK(ReOpen());
|
||||
write_to_db();
|
||||
}
|
||||
|
||||
// Test that we can change write policy from WriteCommitted to WritePrepared
|
||||
// after a clean shutdown (which would empty the WAL)
|
||||
TEST_P(WritePreparedTransactionTest, WP_WC_DBBackwardCompatibility) {
|
||||
|
@ -263,6 +263,10 @@ Status WritePreparedTxn::CommitInternal() {
|
||||
Status WritePreparedTxn::RollbackInternal() {
|
||||
ROCKS_LOG_WARN(db_impl_->immutable_db_options().info_log,
|
||||
"RollbackInternal prepare_seq: %" PRIu64, GetId());
|
||||
|
||||
assert(db_impl_);
|
||||
assert(wpt_db_);
|
||||
|
||||
WriteBatch rollback_batch;
|
||||
assert(GetId() != kMaxSequenceNumber);
|
||||
assert(GetId() > 0);
|
||||
@ -273,8 +277,9 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
// to prevent callback's seq to be overrriden inside DBImpk::Get
|
||||
roptions.snapshot = wpt_db_->GetMaxSnapshot();
|
||||
struct RollbackWriteBatchBuilder : public WriteBatch::Handler {
|
||||
DBImpl* db_;
|
||||
WritePreparedTxnReadCallback callback;
|
||||
DBImpl* const db_;
|
||||
WritePreparedTxnDB* const wpt_db_;
|
||||
WritePreparedTxnReadCallback callback_;
|
||||
WriteBatch* rollback_batch_;
|
||||
std::map<uint32_t, const Comparator*>& comparators_;
|
||||
std::map<uint32_t, ColumnFamilyHandle*>& handles_;
|
||||
@ -282,14 +287,16 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
std::map<uint32_t, CFKeys> keys_;
|
||||
bool rollback_merge_operands_;
|
||||
ReadOptions roptions_;
|
||||
|
||||
RollbackWriteBatchBuilder(
|
||||
DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq,
|
||||
WriteBatch* dst_batch,
|
||||
std::map<uint32_t, const Comparator*>& comparators,
|
||||
std::map<uint32_t, ColumnFamilyHandle*>& handles,
|
||||
bool rollback_merge_operands, ReadOptions _roptions)
|
||||
bool rollback_merge_operands, const ReadOptions& _roptions)
|
||||
: db_(db),
|
||||
callback(wpt_db, snap_seq), // disable min_uncommitted optimization
|
||||
wpt_db_(wpt_db),
|
||||
callback_(wpt_db, snap_seq), // disable min_uncommitted optimization
|
||||
rollback_batch_(dst_batch),
|
||||
comparators_(comparators),
|
||||
handles_(handles),
|
||||
@ -304,8 +311,8 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
keys_[cf] = CFKeys(SetComparator(cmp));
|
||||
}
|
||||
auto it = cf_keys.insert(key);
|
||||
if (it.second ==
|
||||
false) { // second is false if a element already existed.
|
||||
// second is false if a element already existed.
|
||||
if (it.second == false) {
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -316,7 +323,7 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
get_impl_options.column_family = cf_handle;
|
||||
get_impl_options.value = &pinnable_val;
|
||||
get_impl_options.value_found = ¬_used;
|
||||
get_impl_options.callback = &callback;
|
||||
get_impl_options.callback = &callback_;
|
||||
s = db_->GetImpl(roptions_, key, get_impl_options);
|
||||
assert(s.ok() || s.IsNotFound());
|
||||
if (s.ok()) {
|
||||
@ -325,7 +332,11 @@ Status WritePreparedTxn::RollbackInternal() {
|
||||
} else if (s.IsNotFound()) {
|
||||
// There has been no readable value before txn. By adding a delete we
|
||||
// make sure that there will be none afterwards either.
|
||||
s = rollback_batch_->Delete(cf_handle, key);
|
||||
if (wpt_db_->ShouldRollbackWithSingleDelete(cf_handle, key)) {
|
||||
s = rollback_batch_->SingleDelete(cf_handle, key);
|
||||
} else {
|
||||
s = rollback_batch_->Delete(cf_handle, key);
|
||||
}
|
||||
assert(s.ok());
|
||||
} else {
|
||||
// Unexpected status. Return it to the user.
|
||||
|
@ -394,7 +394,7 @@ Status WritePreparedTxnDB::NewIterators(
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) {
|
||||
void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
|
||||
// Adcance max_evicted_seq_ no more than 100 times before the cache wraps
|
||||
// around.
|
||||
INC_STEP_FOR_MAX_EVICTED =
|
||||
@ -404,6 +404,8 @@ void WritePreparedTxnDB::Init(const TransactionDBOptions& /* unused */) {
|
||||
commit_cache_ = std::unique_ptr<std::atomic<CommitEntry64b>[]>(
|
||||
new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
|
||||
dummy_max_snapshot_.number_ = kMaxSequenceNumber;
|
||||
rollback_deletion_type_callback_ =
|
||||
txn_db_opts.rollback_deletion_type_callback;
|
||||
}
|
||||
|
||||
void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
|
||||
@ -433,7 +435,7 @@ void WritePreparedTxnDB::CheckPreparedAgainstMax(SequenceNumber new_max,
|
||||
delayed_prepared_.insert(to_be_popped);
|
||||
ROCKS_LOG_WARN(info_log_,
|
||||
"prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
|
||||
" new_max=%" PRIu64,
|
||||
" new_max=%" PRIu64 ")",
|
||||
static_cast<uint64_t>(delayed_prepared_.size()),
|
||||
to_be_popped, new_max);
|
||||
delayed_prepared_empty_.store(false, std::memory_order_release);
|
||||
|
@ -465,6 +465,16 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
// Get a dummy snapshot that refers to kMaxSequenceNumber
|
||||
Snapshot* GetMaxSnapshot() { return &dummy_max_snapshot_; }
|
||||
|
||||
bool ShouldRollbackWithSingleDelete(ColumnFamilyHandle* column_family,
|
||||
const Slice& key) {
|
||||
return rollback_deletion_type_callback_
|
||||
? rollback_deletion_type_callback_(this, column_family, key)
|
||||
: false;
|
||||
}
|
||||
|
||||
std::function<bool(TransactionDB*, ColumnFamilyHandle*, const Slice&)>
|
||||
rollback_deletion_type_callback_;
|
||||
|
||||
private:
|
||||
friend class AddPreparedCallback;
|
||||
friend class PreparedHeap_BasicsTest_Test;
|
||||
@ -504,7 +514,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
friend class WriteUnpreparedTxnDB;
|
||||
friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
|
||||
|
||||
void Init(const TransactionDBOptions& /* unused */);
|
||||
void Init(const TransactionDBOptions& txn_db_opts);
|
||||
|
||||
void WPRecordTick(uint32_t ticker_type) const {
|
||||
RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
|
||||
|
Loading…
Reference in New Issue
Block a user