diff --git a/HISTORY.md b/HISTORY.md index 492be27c6..1e7a6bec0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,13 +4,15 @@ * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. ### New Features -* * Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data. +* Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data. +* TransactionDBOptions::write_policy can be configured to enable WritePrepared 2PC transactions. Read more about them in the wiki. ### Bug Fixes * Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob. * Fix WAL corruption caused by race condition between user write thread and FlushWAL when two_write_queue is not set. * Fix `BackupableDBOptions::max_valid_backups_to_open` to not delete backup files when refcount cannot be accurately determined. * Fix memory leak when pin_l0_filter_and_index_blocks_in_cache is used with partitioned filters +* Disable rollback of merge operands in WritePrepared transactions to work around an issue in MyRocks. It can be enabled back by setting TransactionDBOptions::rollback_merge_operands to true. ### Java API Changes * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 856f48996..e55f65006 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -85,6 +85,14 @@ struct TransactionDBOptions { // before the commit phase. The DB then needs to provide the mechanisms to // tell apart committed from uncommitted data. TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks. MyRocks uses merge operands for autoinc column id without + // however obtaining locks. This breaks the assumption behind the rollback + // logic in myrocks. This hack of simply not rolling back merge operands works + // for the special way that myrocks uses this operands. + bool rollback_merge_operands = false; }; struct TransactionOptions { diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index beec0df40..cf7e08a50 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -66,6 +66,7 @@ class TransactionTestBase : public ::testing::Test { txn_db_options.transaction_lock_timeout = 0; txn_db_options.default_lock_timeout = 0; txn_db_options.write_policy = write_policy; + txn_db_options.rollback_merge_operands = true; Status s; if (use_stackable_db == false) { s = TransactionDB::Open(options, txn_db_options, dbname, &db); diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 77e5b6126..0c3669595 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -218,15 +218,18 @@ Status WritePreparedTxn::RollbackInternal() { std::map& comparators_; using CFKeys = std::set; std::map keys_; + bool rollback_merge_operands_; RollbackWriteBatchBuilder( DBImpl* db, WritePreparedTxnDB* wpt_db, SequenceNumber snap_seq, WriteBatch* dst_batch, - std::map& comparators) + std::map& comparators, + bool rollback_merge_operands) : db_(db), callback(wpt_db, snap_seq, 0), // 0 disables min_uncommitted optimization rollback_batch_(dst_batch), - comparators_(comparators) {} + comparators_(comparators), + rollback_merge_operands_(rollback_merge_operands) {} Status Rollback(uint32_t cf, const Slice& key) { Status s; @@ -275,7 +278,11 @@ Status WritePreparedTxn::RollbackInternal() { Status MergeCF(uint32_t cf, const Slice& key, const Slice& /*val*/) override { - return Rollback(cf, key); + if (rollback_merge_operands_) { + return Rollback(cf, key); + } else { + return Status::OK(); + } } Status MarkNoop(bool) override { return Status::OK(); } @@ -289,7 +296,8 @@ Status WritePreparedTxn::RollbackInternal() { protected: virtual bool WriteAfterCommit() const override { return false; } } rollback_handler(db_impl_, wpt_db_, last_visible_txn, &rollback_batch, - *wpt_db_->GetCFComparatorMap()); + *wpt_db_->GetCFComparatorMap(), + wpt_db_->txn_db_options_.rollback_merge_operands); auto s = GetWriteBatch()->GetWriteBatch()->Iterate(&rollback_handler); assert(s.ok()); if (!s.ok()) {