rocksdb/utilities/transactions/transaction_util.cc
agiardullo c2f2cb0214 Pessimistic Transactions
Summary:
Initial implementation of Pessimistic Transactions.  This diff contains the api changes discussed in D38913.  This diff is pretty large, so let me know if people would prefer to meet up to discuss it.

MyRocks folks:  please take a look at the API in include/rocksdb/utilities/transaction[_db].h and let me know if you have any issues.

Also, you'll notice a couple of TODOs in the implementation of RollbackToSavePoint().  After chatting with Siying, I'm going to send out a separate diff for an alternate implementation of this feature that implements the rollback inside of WriteBatch/WriteBatchWithIndex.  We can then decide which route is preferable.

Next, I'm planning on doing some perf testing and then integrating this diff into MongoRocks for further testing.

Test Plan: Unit tests, db_bench parallel testing.

Reviewers: igor, rven, sdong, yhchiang, yoshinorim

Reviewed By: sdong

Subscribers: hermanlee4, maykov, spetrunia, leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D40869
2015-08-11 17:52:23 -07:00

266 lines
8.5 KiB
C++

// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include "utilities/transactions/transaction_util.h"
#include <inttypes.h>
#include <string>
#include <vector>
#include "db/db_impl.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/string_util.h"
namespace rocksdb {
Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl,
ColumnFamilyHandle* column_family,
const std::string& key,
SequenceNumber key_seq) {
Status result;
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd();
SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
if (sv == nullptr) {
result = Status::Busy("Could not access column family " +
cfh->GetName());
}
if (result.ok()) {
SequenceNumber earliest_seq =
db_impl->GetEarliestMemTableSequenceNumber(sv, true);
result = CheckKey(db_impl, sv, earliest_seq, key_seq, key);
db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
}
return result;
}
Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
SequenceNumber earliest_seq,
SequenceNumber key_seq,
const std::string& key) {
Status result;
// Since it would be too slow to check the SST files, we will only use
// the memtables to check whether there have been any recent writes
// to this key after it was accessed in this transaction. But if the
// Memtables do not contain a long enough history, we must fail the
// transaction.
if (earliest_seq == kMaxSequenceNumber) {
// The age of this memtable is unknown. Cannot rely on it to check
// for recent writes. This error shouldn't happen often in practice as
// the
// Memtable should have a valid earliest sequence number except in some
// corner cases (such as error cases during recovery).
result = Status::Busy(
"Transaction ould not check for conflicts as the MemTable does not "
"countain a long enough history to check write at SequenceNumber: ",
ToString(key_seq));
} else if (key_seq < earliest_seq) {
// The age of this memtable is too new to use to check for recent
// writes.
char msg[255];
snprintf(msg, sizeof(msg),
"Transaction could not check for conflicts for opearation at "
"SequenceNumber %" PRIu64
" as the MemTable only contains changes newer than SequenceNumber "
"%" PRIu64
". Increasing the value of the "
"max_write_buffer_number_to_maintain option could reduce the "
"frequency "
"of this error.",
key_seq, earliest_seq);
result = Status::Busy(msg);
} else {
SequenceNumber seq = kMaxSequenceNumber;
Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq);
if (!s.ok()) {
result = s;
} else if (seq != kMaxSequenceNumber && seq > key_seq) {
result = Status::Busy();
}
}
return result;
}
Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
TransactionKeyMap* key_map) {
Status result;
for (auto& key_map_iter : *key_map) {
uint32_t cf_id = key_map_iter.first;
const auto& keys = key_map_iter.second;
SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id);
if (sv == nullptr) {
result =
Status::Busy("Could not access column family " + ToString(cf_id));
break;
}
SequenceNumber earliest_seq =
db_impl->GetEarliestMemTableSequenceNumber(sv, true);
// For each of the keys in this transaction, check to see if someone has
// written to this key since the start of the transaction.
for (const auto& key_iter : keys) {
const auto& key = key_iter.first;
const SequenceNumber key_seq = key_iter.second;
result = CheckKey(db_impl, sv, earliest_seq, key_seq, key);
if (!result.ok()) {
break;
}
}
db_impl->ReturnAndCleanupSuperVersion(cf_id, sv);
if (!result.ok()) {
break;
}
}
return result;
}
Status TransactionUtil::CopyFirstN(size_t num, WriteBatchWithIndex* batch,
WriteBatchWithIndex* new_batch,
DBImpl* db_impl) {
// Handler for iterating through batch and copying entries to new_batch
class Handler : public WriteBatch::Handler {
public:
WriteBatchWithIndex* batch;
const size_t limit;
DBImpl* db_impl;
size_t seen = 0;
std::unordered_map<uint32_t, SuperVersion*> super_versions;
std::unordered_map<uint32_t, ColumnFamilyHandle*> handles;
Handler(WriteBatchWithIndex* dest, size_t new_limit, DBImpl* db)
: batch(dest), limit(new_limit), db_impl(db) {}
~Handler() {
for (auto& iter : super_versions) {
db_impl->ReturnAndCleanupSuperVersionUnlocked(iter.first, iter.second);
}
}
Status GetColumnFamily(uint32_t column_family_id,
ColumnFamilyHandle** cfh) {
// Need to look up ColumnFamilyHandle for this column family id. Since
// doing this requires grabbing a mutex, lets only do it once per column
// family and cache it.
// In order to ensure that the ColumnFamilyHandle is still valid, we need
// to hold the superversion.
const auto& iter = handles.find(column_family_id);
if (iter == handles.end()) {
// Don't have ColumnFamilyHandle cached, look it up from the db.
SuperVersion* sv =
db_impl->GetAndRefSuperVersionUnlocked(column_family_id);
if (sv == nullptr) {
return Status::InvalidArgument(
"Could not find column family for ID " +
ToString(column_family_id));
}
super_versions.insert({column_family_id, sv});
*cfh = db_impl->GetColumnFamilyHandleUnlocked(column_family_id);
if (*cfh == nullptr) {
return Status::InvalidArgument(
"Could not find column family handle for ID " +
ToString(column_family_id));
}
handles.insert({column_family_id, *cfh});
} else {
*cfh = iter->second;
}
return Status::OK();
}
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
if (seen >= limit) {
// Found the first N entries, return Aborted to stop the Iteration.
return Status::Aborted();
}
ColumnFamilyHandle* cfh = nullptr;
Status s = GetColumnFamily(column_family_id, &cfh);
if (s.ok()) {
batch->Put(cfh, key, value);
}
seen++;
return s;
}
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
if (seen >= limit) {
// Found the first N entries, return Aborted to stop the Iteration.
return Status::Aborted();
}
ColumnFamilyHandle* cfh = nullptr;
Status s = GetColumnFamily(column_family_id, &cfh);
if (s.ok()) {
batch->Merge(cfh, key, value);
}
seen++;
return s;
}
virtual Status DeleteCF(uint32_t column_family_id,
const Slice& key) override {
if (seen >= limit) {
// Found the first N entries, return Aborted to stop the Iteration.
return Status::Aborted();
}
ColumnFamilyHandle* cfh = nullptr;
Status s = GetColumnFamily(column_family_id, &cfh);
if (s.ok()) {
batch->Delete(cfh, key);
}
seen++;
return s;
}
virtual void LogData(const Slice& blob) override {
if (seen < limit) {
batch->PutLogData(blob);
}
seen++;
}
};
// Iterating on this handler will add all keys in this batch into a new batch
// up to
// the limit.
Handler handler(new_batch, num, db_impl);
Status s = batch->GetWriteBatch()->Iterate(&handler);
if (s.IsAborted()) {
// Handler returns Aborted when it is done copying to stop the iteration.
s = Status::OK();
}
return s;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE