Summary: 1. prepare() 2. crash 3. recover 4. commit() 5. crash 6. data is lost This is due to the transaction data still only residing in the WAL but because the logs were flushed on the first recovery the data is ignored on the second recovery. We must scan all logs found on recovery and only ignore redundant data at the time of replay. It is not possible to know which logs still contain relevant data at time of recovery. We cannot simply ignore a log because all of the non-2pc data it contains has already been written to L0. The changes made to MemTableInserter are to ensure that prepared sections are still recovered even if all of the non-2pc data in that log has already been flushed to L0. Test Plan: Provided test. Reviewers: sdong Subscribers: andrewkr, hermanlee4, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D57729
397 lines
13 KiB
C++
397 lines
13 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include "utilities/transactions/transaction_db_impl.h"
|
|
|
|
#include <string>
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
#include "db/db_impl.h"
|
|
#include "rocksdb/db.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/utilities/transaction_db.h"
|
|
#include "utilities/transactions/transaction_db_mutex_impl.h"
|
|
#include "utilities/transactions/transaction_impl.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
TransactionDBImpl::TransactionDBImpl(DB* db,
|
|
const TransactionDBOptions& txn_db_options)
|
|
: TransactionDB(db),
|
|
db_impl_(dynamic_cast<DBImpl*>(db)),
|
|
txn_db_options_(txn_db_options),
|
|
lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks,
|
|
txn_db_options_.custom_mutex_factory
|
|
? txn_db_options_.custom_mutex_factory
|
|
: std::shared_ptr<TransactionDBMutexFactory>(
|
|
new TransactionDBMutexFactoryImpl())) {
|
|
assert(db_impl_ != nullptr);
|
|
}
|
|
|
|
Transaction* TransactionDBImpl::BeginTransaction(
|
|
const WriteOptions& write_options, const TransactionOptions& txn_options,
|
|
Transaction* old_txn) {
|
|
if (old_txn != nullptr) {
|
|
ReinitializeTransaction(old_txn, write_options, txn_options);
|
|
return old_txn;
|
|
} else {
|
|
return new TransactionImpl(this, write_options, txn_options);
|
|
}
|
|
}
|
|
|
|
TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions(
|
|
const TransactionDBOptions& txn_db_options) {
|
|
TransactionDBOptions validated = txn_db_options;
|
|
|
|
if (txn_db_options.num_stripes == 0) {
|
|
validated.num_stripes = 1;
|
|
}
|
|
|
|
return validated;
|
|
}
|
|
|
|
Status TransactionDB::Open(const Options& options,
|
|
const TransactionDBOptions& txn_db_options,
|
|
const std::string& dbname, TransactionDB** dbptr) {
|
|
DBOptions db_options(options);
|
|
ColumnFamilyOptions cf_options(options);
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
column_families.push_back(
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
|
|
column_families, &handles, dbptr);
|
|
if (s.ok()) {
|
|
assert(handles.size() == 1);
|
|
// i can delete the handle since DBImpl is always holding a reference to
|
|
// default column family
|
|
delete handles[0];
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDB::Open(
|
|
const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
|
|
const std::string& dbname,
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
|
|
Status s;
|
|
DB* db;
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
|
|
std::vector<size_t> compaction_enabled_cf_indices;
|
|
|
|
// Enable MemTable History if not already enabled
|
|
for (size_t i = 0; i < column_families_copy.size(); i++) {
|
|
ColumnFamilyOptions* options = &column_families_copy[i].options;
|
|
|
|
if (options->max_write_buffer_number_to_maintain == 0) {
|
|
// Setting to -1 will set the History size to max_write_buffer_number.
|
|
options->max_write_buffer_number_to_maintain = -1;
|
|
}
|
|
|
|
if (!options->disable_auto_compactions) {
|
|
// Disable compactions momentarily to prevent race with DB::Open
|
|
options->disable_auto_compactions = true;
|
|
compaction_enabled_cf_indices.push_back(i);
|
|
}
|
|
}
|
|
|
|
DBOptions db_options_2pc = db_options;
|
|
db_options_2pc.allow_2pc = true;
|
|
s = DB::Open(db_options_2pc, dbname, column_families_copy, handles, &db);
|
|
|
|
if (s.ok()) {
|
|
TransactionDBImpl* txn_db = new TransactionDBImpl(
|
|
db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));
|
|
*dbptr = txn_db;
|
|
|
|
for (auto cf_ptr : *handles) {
|
|
txn_db->AddColumnFamily(cf_ptr);
|
|
}
|
|
|
|
// Re-enable compaction for the column families that initially had
|
|
// compaction enabled.
|
|
assert(column_families_copy.size() == (*handles).size());
|
|
std::vector<ColumnFamilyHandle*> compaction_enabled_cf_handles;
|
|
compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
|
|
for (auto index : compaction_enabled_cf_indices) {
|
|
compaction_enabled_cf_handles.push_back((*handles)[index]);
|
|
}
|
|
|
|
s = txn_db->EnableAutoCompaction(compaction_enabled_cf_handles);
|
|
|
|
// create 'real' transactions from recovered shell transactions
|
|
assert(dynamic_cast<DBImpl*>(db) != nullptr);
|
|
auto dbimpl = reinterpret_cast<DBImpl*>(db);
|
|
auto rtrxs = dbimpl->recovered_transactions();
|
|
|
|
for (auto it = rtrxs.begin(); it != rtrxs.end(); it++) {
|
|
auto recovered_trx = it->second;
|
|
assert(recovered_trx);
|
|
assert(recovered_trx->log_number_);
|
|
assert(recovered_trx->name_.length());
|
|
|
|
WriteOptions w_options;
|
|
w_options.sync = true;
|
|
TransactionOptions t_options;
|
|
|
|
Transaction* real_trx =
|
|
txn_db->BeginTransaction(w_options, t_options, nullptr);
|
|
assert(real_trx);
|
|
real_trx->SetLogNumber(recovered_trx->log_number_);
|
|
|
|
s = real_trx->SetName(recovered_trx->name_);
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
|
|
s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_);
|
|
real_trx->exec_status_ = Transaction::PREPARED;
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// Let TransactionLockMgr know that this column family exists so it can
|
|
// allocate a LockMap for it.
|
|
void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) {
|
|
lock_mgr_.AddColumnFamily(handle->GetID());
|
|
}
|
|
|
|
Status TransactionDBImpl::CreateColumnFamily(
|
|
const ColumnFamilyOptions& options, const std::string& column_family_name,
|
|
ColumnFamilyHandle** handle) {
|
|
InstrumentedMutexLock l(&column_family_mutex_);
|
|
|
|
Status s = db_->CreateColumnFamily(options, column_family_name, handle);
|
|
if (s.ok()) {
|
|
lock_mgr_.AddColumnFamily((*handle)->GetID());
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// Let TransactionLockMgr know that it can deallocate the LockMap for this
|
|
// column family.
|
|
Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
|
InstrumentedMutexLock l(&column_family_mutex_);
|
|
|
|
Status s = db_->DropColumnFamily(column_family);
|
|
if (s.ok()) {
|
|
lock_mgr_.RemoveColumnFamily(column_family->GetID());
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id,
|
|
const std::string& key) {
|
|
return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv());
|
|
}
|
|
|
|
void TransactionDBImpl::UnLock(TransactionImpl* txn,
|
|
const TransactionKeyMap* keys) {
|
|
lock_mgr_.UnLock(txn, keys, GetEnv());
|
|
}
|
|
|
|
void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id,
|
|
const std::string& key) {
|
|
lock_mgr_.UnLock(txn, cfh_id, key, GetEnv());
|
|
}
|
|
|
|
// Used when wrapping DB write operations in a transaction
|
|
Transaction* TransactionDBImpl::BeginInternalTransaction(
|
|
const WriteOptions& options) {
|
|
TransactionOptions txn_options;
|
|
Transaction* txn = BeginTransaction(options, txn_options, nullptr);
|
|
|
|
assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
|
|
auto txn_impl = reinterpret_cast<TransactionImpl*>(txn);
|
|
|
|
// Use default timeout for non-transactional writes
|
|
txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout);
|
|
|
|
return txn;
|
|
}
|
|
|
|
// All user Put, Merge, Delete, and Write requests must be intercepted to make
|
|
// sure that they lock all keys that they are writing to avoid causing conflicts
|
|
// with any concurent transactions. The easiest way to do this is to wrap all
|
|
// write operations in a transaction.
|
|
//
|
|
// Put(), Merge(), and Delete() only lock a single key per call. Write() will
|
|
// sort its keys before locking them. This guarantees that TransactionDB write
|
|
// methods cannot deadlock with eachother (but still could deadlock with a
|
|
// Transaction).
|
|
Status TransactionDBImpl::Put(const WriteOptions& options,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key, const Slice& val) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(options);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do PutUntracked().
|
|
s = txn->PutUntracked(column_family, key, val);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Delete(const WriteOptions& wopts,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(wopts);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do
|
|
// DeleteUntracked().
|
|
s = txn->DeleteUntracked(column_family, key);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Merge(const WriteOptions& options,
|
|
ColumnFamilyHandle* column_family,
|
|
const Slice& key, const Slice& value) {
|
|
Status s;
|
|
|
|
Transaction* txn = BeginInternalTransaction(options);
|
|
txn->DisableIndexing();
|
|
|
|
// Since the client didn't create a transaction, they don't care about
|
|
// conflict checking for this write. So we just need to do
|
|
// MergeUntracked().
|
|
s = txn->MergeUntracked(column_family, key, value);
|
|
|
|
if (s.ok()) {
|
|
s = txn->Commit();
|
|
}
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
|
|
// Need to lock all keys in this batch to prevent write conflicts with
|
|
// concurrent transactions.
|
|
Transaction* txn = BeginInternalTransaction(opts);
|
|
txn->DisableIndexing();
|
|
|
|
assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
|
|
auto txn_impl = reinterpret_cast<TransactionImpl*>(txn);
|
|
|
|
// Since commitBatch sorts the keys before locking, concurrent Write()
|
|
// operations will not cause a deadlock.
|
|
// In order to avoid a deadlock with a concurrent Transaction, Transactions
|
|
// should use a lock timeout.
|
|
Status s = txn_impl->CommitBatch(updates);
|
|
|
|
delete txn;
|
|
|
|
return s;
|
|
}
|
|
|
|
void TransactionDBImpl::InsertExpirableTransaction(TransactionID tx_id,
|
|
TransactionImpl* tx) {
|
|
assert(tx->GetExpirationTime() > 0);
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
expirable_transactions_map_.insert({tx_id, tx});
|
|
}
|
|
|
|
void TransactionDBImpl::RemoveExpirableTransaction(TransactionID tx_id) {
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
expirable_transactions_map_.erase(tx_id);
|
|
}
|
|
|
|
bool TransactionDBImpl::TryStealingExpiredTransactionLocks(
|
|
TransactionID tx_id) {
|
|
std::lock_guard<std::mutex> lock(map_mutex_);
|
|
|
|
auto tx_it = expirable_transactions_map_.find(tx_id);
|
|
if (tx_it == expirable_transactions_map_.end()) {
|
|
return true;
|
|
}
|
|
TransactionImpl& tx = *(tx_it->second);
|
|
return tx.TryStealingLocks();
|
|
}
|
|
|
|
void TransactionDBImpl::ReinitializeTransaction(
|
|
Transaction* txn, const WriteOptions& write_options,
|
|
const TransactionOptions& txn_options) {
|
|
assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
|
|
auto txn_impl = reinterpret_cast<TransactionImpl*>(txn);
|
|
|
|
txn_impl->Reinitialize(this, write_options, txn_options);
|
|
}
|
|
|
|
Transaction* TransactionDBImpl::GetTransactionByName(
|
|
const TransactionName& name) {
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
auto it = transactions_.find(name);
|
|
if (it == transactions_.end()) {
|
|
return nullptr;
|
|
} else {
|
|
return it->second;
|
|
}
|
|
}
|
|
|
|
void TransactionDBImpl::GetAllPreparedTransactions(
|
|
std::vector<Transaction*>* transv) {
|
|
assert(transv);
|
|
transv->clear();
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
for (auto it = transactions_.begin(); it != transactions_.end(); it++) {
|
|
if (it->second->exec_status_ == Transaction::PREPARED) {
|
|
transv->push_back(it->second);
|
|
}
|
|
}
|
|
}
|
|
|
|
void TransactionDBImpl::RegisterTransaction(Transaction* txn) {
|
|
assert(txn);
|
|
assert(txn->GetName().length() > 0);
|
|
assert(GetTransactionByName(txn->GetName()) == nullptr);
|
|
assert(txn->exec_status_ == Transaction::STARTED);
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
transactions_[txn->GetName()] = txn;
|
|
}
|
|
|
|
void TransactionDBImpl::UnregisterTransaction(Transaction* txn) {
|
|
assert(txn);
|
|
std::lock_guard<std::mutex> lock(name_map_mutex_);
|
|
auto it = transactions_.find(txn->GetName());
|
|
assert(it != transactions_.end());
|
|
transactions_.erase(it);
|
|
}
|
|
|
|
} // namespace rocksdb
|
|
#endif // ROCKSDB_LITE
|