2015-05-29 23:36:35 +02:00
|
|
|
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
#include "utilities/transactions/optimistic_transaction_impl.h"
|
|
|
|
|
2015-05-30 18:25:45 +02:00
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
2015-05-29 23:36:35 +02:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/column_family.h"
|
|
|
|
#include "db/db_impl.h"
|
|
|
|
#include "rocksdb/comparator.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/utilities/optimistic_transaction_db.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
struct WriteOptions;
|
|
|
|
|
|
|
|
OptimisticTransactionImpl::OptimisticTransactionImpl(
|
|
|
|
OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
|
|
|
|
const OptimisticTransactionOptions& txn_options)
|
|
|
|
: txn_db_(txn_db),
|
|
|
|
db_(txn_db->GetBaseDB()),
|
|
|
|
write_options_(write_options),
|
|
|
|
snapshot_(nullptr),
|
|
|
|
write_batch_(txn_options.cmp, 0, true) {
|
|
|
|
if (txn_options.set_snapshot) {
|
|
|
|
SetSnapshot();
|
|
|
|
} else {
|
|
|
|
start_sequence_number_ = db_->GetLatestSequenceNumber();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
OptimisticTransactionImpl::~OptimisticTransactionImpl() {
|
|
|
|
tracked_keys_.clear();
|
|
|
|
if (snapshot_ != nullptr) {
|
|
|
|
db_->ReleaseSnapshot(snapshot_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::SetSnapshot() {
|
|
|
|
if (snapshot_ != nullptr) {
|
|
|
|
db_->ReleaseSnapshot(snapshot_);
|
|
|
|
}
|
|
|
|
|
|
|
|
snapshot_ = db_->GetSnapshot();
|
|
|
|
start_sequence_number_ = snapshot_->GetSequenceNumber();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status OptimisticTransactionImpl::Commit() {
|
|
|
|
// Set up callback which will call CheckTransactionForConflicts() to
|
|
|
|
// check whether this transaction is safe to be committed.
|
|
|
|
OptimisticTransactionCallback callback(this);
|
|
|
|
|
|
|
|
DBImpl* db_impl = dynamic_cast<DBImpl*>(db_->GetRootDB());
|
|
|
|
if (db_impl == nullptr) {
|
|
|
|
// This should only happen if we support creating transactions from
|
|
|
|
// a StackableDB and someone overrides GetRootDB().
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"DB::GetRootDB() returned an unexpected DB class");
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s = db_impl->WriteWithCallback(
|
|
|
|
write_options_, write_batch_.GetWriteBatch(), &callback);
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
tracked_keys_.clear();
|
|
|
|
write_batch_.Clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Rollback() {
|
|
|
|
tracked_keys_.clear();
|
|
|
|
write_batch_.Clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Record this key so that we can check it for conflicts at commit time.
|
|
|
|
void OptimisticTransactionImpl::RecordOperation(
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
|
|
|
uint32_t cfh_id = GetColumnFamilyID(column_family);
|
|
|
|
|
|
|
|
SequenceNumber seq;
|
|
|
|
if (snapshot_) {
|
|
|
|
seq = start_sequence_number_;
|
|
|
|
} else {
|
|
|
|
seq = db_->GetLatestSequenceNumber();
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string key_str = key.ToString();
|
|
|
|
|
|
|
|
auto iter = tracked_keys_[cfh_id].find(key_str);
|
|
|
|
if (iter == tracked_keys_[cfh_id].end()) {
|
|
|
|
// key not yet seen, store it.
|
|
|
|
tracked_keys_[cfh_id].insert({std::move(key_str), seq});
|
|
|
|
} else {
|
|
|
|
SequenceNumber old_seq = iter->second;
|
|
|
|
if (seq < old_seq) {
|
|
|
|
// Snapshot has changed since we last saw this key, need to
|
|
|
|
// store the earliest seen sequence number.
|
|
|
|
tracked_keys_[cfh_id][key_str] = seq;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::RecordOperation(
|
|
|
|
ColumnFamilyHandle* column_family, const SliceParts& key) {
|
|
|
|
size_t key_size = 0;
|
|
|
|
for (int i = 0; i < key.num_parts; ++i) {
|
|
|
|
key_size += key.parts[i].size();
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string str;
|
|
|
|
str.reserve(key_size);
|
|
|
|
|
|
|
|
for (int i = 0; i < key.num_parts; ++i) {
|
|
|
|
str.append(key.parts[i].data(), key.parts[i].size());
|
|
|
|
}
|
|
|
|
|
|
|
|
RecordOperation(column_family, str);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status OptimisticTransactionImpl::Get(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, std::string* value) {
|
|
|
|
return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key,
|
|
|
|
value);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status OptimisticTransactionImpl::GetForUpdate(
|
|
|
|
const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, std::string* value) {
|
|
|
|
// Regardless of whether the Get succeeded, track this key.
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
return Get(read_options, column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Status> OptimisticTransactionImpl::MultiGet(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
// Regardless of whether the MultiGet succeeded, track these keys.
|
|
|
|
size_t num_keys = keys.size();
|
|
|
|
values->resize(num_keys);
|
|
|
|
|
|
|
|
// TODO(agiardullo): optimize multiget?
|
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
std::string* value = &(*values)[i];
|
|
|
|
stat_list[i] = Get(read_options, column_family[i], keys[i], value);
|
|
|
|
}
|
|
|
|
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Status> OptimisticTransactionImpl::MultiGetForUpdate(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
// Regardless of whether the MultiGet succeeded, track these keys.
|
|
|
|
size_t num_keys = keys.size();
|
|
|
|
values->resize(num_keys);
|
|
|
|
|
|
|
|
// TODO(agiardullo): optimize multiget?
|
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
// Regardless of whether the Get succeeded, track this key.
|
|
|
|
RecordOperation(column_family[i], keys[i]);
|
|
|
|
|
|
|
|
std::string* value = &(*values)[i];
|
|
|
|
stat_list[i] = Get(read_options, column_family[i], keys[i], value);
|
|
|
|
}
|
|
|
|
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
write_batch_.Put(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Put(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key,
|
|
|
|
const SliceParts& value) {
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
write_batch_.Put(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Merge(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
write_batch_.Merge(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
write_batch_.Delete(column_family, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::Delete(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key) {
|
|
|
|
RecordOperation(column_family, key);
|
|
|
|
|
|
|
|
write_batch_.Delete(column_family, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key,
|
|
|
|
const Slice& value) {
|
|
|
|
write_batch_.Put(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::PutUntracked(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key,
|
|
|
|
const SliceParts& value) {
|
|
|
|
write_batch_.Put(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::MergeUntracked(
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) {
|
|
|
|
write_batch_.Merge(column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::DeleteUntracked(
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
|
|
|
write_batch_.Delete(column_family, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::DeleteUntracked(
|
|
|
|
ColumnFamilyHandle* column_family, const SliceParts& key) {
|
|
|
|
write_batch_.Delete(column_family, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void OptimisticTransactionImpl::PutLogData(const Slice& blob) {
|
|
|
|
write_batch_.PutLogData(blob);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatchWithIndex* OptimisticTransactionImpl::GetWriteBatch() {
|
|
|
|
return &write_batch_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns OK if it is safe to commit this transaction. Returns Status::Busy
|
|
|
|
// if there are read or write conflicts that would prevent us from committing OR
|
|
|
|
// if we can not determine whether there would be any such conflicts.
|
|
|
|
//
|
|
|
|
// Should only be called on writer thread.
|
|
|
|
Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) {
|
|
|
|
Status result;
|
|
|
|
|
|
|
|
assert(dynamic_cast<DBImpl*>(db) != nullptr);
|
|
|
|
auto db_impl = reinterpret_cast<DBImpl*>(db);
|
|
|
|
|
|
|
|
for (auto& tracked_keys_iter : tracked_keys_) {
|
|
|
|
uint32_t cf_id = tracked_keys_iter.first;
|
|
|
|
const auto& keys = tracked_keys_iter.second;
|
|
|
|
|
|
|
|
SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id);
|
|
|
|
if (sv == nullptr) {
|
|
|
|
result =
|
|
|
|
Status::Busy("Could not access column family " + ToString(cf_id));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
SequenceNumber earliest_seq =
|
|
|
|
db_impl->GetEarliestMemTableSequenceNumber(sv, true);
|
|
|
|
|
|
|
|
// For each of the keys in this transaction, check to see if someone has
|
|
|
|
// written to this key since the start of the transaction.
|
|
|
|
for (const auto& key_iter : keys) {
|
|
|
|
const auto& key = key_iter.first;
|
|
|
|
const SequenceNumber key_seq = key_iter.second;
|
|
|
|
|
|
|
|
// Since it would be too slow to check the SST files, we will only use
|
|
|
|
// the memtables to check whether there have been any recent writes
|
|
|
|
// to this key after it was accessed in this transaction. But if the
|
2015-06-03 06:24:19 +02:00
|
|
|
// Memtables do not contain a long enough history, we must fail the
|
2015-05-29 23:36:35 +02:00
|
|
|
// transaction.
|
|
|
|
if (earliest_seq == kMaxSequenceNumber) {
|
|
|
|
// The age of this memtable is unknown. Cannot rely on it to check
|
2015-06-03 06:24:19 +02:00
|
|
|
// for recent writes. This error shouldn't happen often in practice as
|
|
|
|
// the
|
|
|
|
// Memtable should have a valid earliest sequence number except in some
|
|
|
|
// corner cases (such as error cases during recovery).
|
2015-05-29 23:36:35 +02:00
|
|
|
result = Status::Busy(
|
|
|
|
"Could not commit transaction with as the MemTable does not "
|
|
|
|
"countain a long enough history to check write at SequenceNumber: ",
|
|
|
|
ToString(key_seq));
|
|
|
|
|
|
|
|
} else if (key_seq < earliest_seq) {
|
|
|
|
// The age of this memtable is too new to use to check for recent
|
|
|
|
// writes.
|
|
|
|
char msg[255];
|
|
|
|
snprintf(
|
|
|
|
msg, sizeof(msg),
|
2015-05-30 18:25:45 +02:00
|
|
|
"Could not commit transaction with write at SequenceNumber %" PRIu64
|
|
|
|
" as the MemTable only contains changes newer than SequenceNumber "
|
2015-06-03 06:24:19 +02:00
|
|
|
"%" PRIu64
|
|
|
|
". Increasing the value of the "
|
|
|
|
"max_write_buffer_number_to_maintain option could reduce the "
|
|
|
|
"frequency "
|
|
|
|
"of this error.",
|
2015-05-29 23:36:35 +02:00
|
|
|
key_seq, earliest_seq);
|
|
|
|
result = Status::Busy(msg);
|
|
|
|
} else {
|
|
|
|
SequenceNumber seq = kMaxSequenceNumber;
|
|
|
|
Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq);
|
|
|
|
if (!s.ok()) {
|
|
|
|
result = s;
|
|
|
|
} else if (seq != kMaxSequenceNumber && seq > key_seq) {
|
|
|
|
result = Status::Busy();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!result.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
db_impl->ReturnAndCleanupSuperVersion(cf_id, sv);
|
|
|
|
|
|
|
|
if (!result.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|
|
|
|
|
|
|
|
#endif // ROCKSDB_LITE
|