WritePrepared Txn: Fix bug with duplicate keys during recovery

Summary:
Fix the following bugs:
- During recovery a duplicate key was inserted twice into the write batch of the recovery transaction,
once when the memtable returned false (because the key was a duplicate) and once for the 2nd attempt. This would result in a different SubBatch count being measured when the recovered transaction is committing.
- If a cf is flushed during recovery, the memtable is not available to assist in detecting the duplicate key. This could result in not advancing the sequence number when iterating over duplicate keys of a flushed cf, and hence in inserting the next key with the wrong sequence number.
- SubBatchCounter would reset the comparator to the default comparator after the first duplicate key. The 2nd duplicate key would hence have gone through the wrong comparator and not been detected.
Closes https://github.com/facebook/rocksdb/pull/3562

Differential Revision: D7149440

Pulled By: maysamyabandeh

fbshipit-source-id: 91ec317b165f363f5d11ff8b8c47c81cebb8ed77
This commit is contained in:
Maysam Yabandeh 2018-03-05 10:48:29 -08:00 committed by Facebook Github Bot
parent 15f55e5e06
commit 680864ae54
9 changed files with 497 additions and 74 deletions

View File

@ -550,9 +550,18 @@ class DBImpl : public DB {
WriteBatch* batch_;
// The seq number of the first key in the batch
SequenceNumber seq_;
// Number of sub-batches. A new sub-batch is created if the txn attempts to
// insert a duplicate key,seq to the memtable. This is currently used in
// WritePreparedTxn. The recovery code passes 0 for the write-after-commit
// policy, which disables the sub-batch count check done on recovered
// transactions.
size_t batch_cnt_;
explicit RecoveredTransaction(const uint64_t log, const std::string& name,
WriteBatch* batch, SequenceNumber seq)
: log_number_(log), name_(name), batch_(batch), seq_(seq) {}
WriteBatch* batch, SequenceNumber seq,
size_t batch_cnt)
: log_number_(log),
name_(name),
batch_(batch),
seq_(seq),
batch_cnt_(batch_cnt) {}
~RecoveredTransaction() { delete batch_; }
};
@ -574,9 +583,10 @@ class DBImpl : public DB {
}
// Registers a transaction recovered from log number `log` under `name` and
// marks that log as containing a prepared section. The created
// RecoveredTransaction deletes `batch` in its destructor, so the caller hands
// over ownership of `batch`. `seq` is the sequence number of the first key in
// the batch and `batch_cnt` is the number of sub-batches in it.
void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
WriteBatch* batch, SequenceNumber seq) {
WriteBatch* batch, SequenceNumber seq,
size_t batch_cnt) {
recovered_transactions_[name] =
new RecoveredTransaction(log, name, batch, seq);
new RecoveredTransaction(log, name, batch, seq, batch_cnt);
MarkLogAsContainingPrepSection(log);
}

View File

@ -978,6 +978,60 @@ Status WriteBatch::PopSavePoint() {
return Status::OK();
}
// TODO(myabandeh): move it to util
namespace {
// During recovery, if a column family has already been flushed, no memtable
// insert is attempted for its keys, so the memtable cannot help with
// duplicate key detection. This class is used as an emulator of the memtable
// to tell whether the insertion of a key/seq would have resulted in a
// duplicate.
class DuplicateDetector {
 public:
  explicit DuplicateDetector(DBImpl* db) : db_(db) {}

  // Records `key` of column family `cf` as written at sequence number `seq`.
  // Returns true if an equal key (according to the comparator of `cf`) was
  // already recorded for `cf` in the current sub-batch, i.e. the key starts a
  // new sub-batch; returns false otherwise.
  //
  // `seq` must not be smaller than the value passed to the previous call.
  // NOTE(review): the set stores the Slice objects themselves; presumably the
  // key bytes must stay alive while they are tracked -- confirm with callers.
  bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {
    assert(seq >= batch_seq_);
    if (batch_seq_ != seq) {  // it is a new batch
      keys_.clear();
    }
    batch_seq_ = seq;
    CFKeys& cf_keys = keys_[cf];
    if (cf_keys.size() == 0) {  // just inserted
      // Assigning to the existing map entry keeps `cf_keys` valid.
      InitWithComp(cf);
    }
    auto it = cf_keys.insert(key);
    if (it.second == false) {  // second is false if an element already existed
      // The duplicate starts a new sub-batch: forget the keys of all column
      // families and make `key` the first key of the new sub-batch. `cf_keys`
      // is dangling after clear(), hence the fresh lookups via keys_[cf].
      keys_.clear();
      InitWithComp(cf);
      keys_[cf].insert(key);
      return true;
    }
    return false;
  }

 private:
  // Sequence number passed to the most recent IsDuplicateKeySeq call.
  SequenceNumber batch_seq_ = 0;
  DBImpl* db_;

  // A comparator to be used in std::set. It orders keys with the given user
  // comparator and falls back to BytewiseComparator() when none is given.
  struct SetComparator {
    explicit SetComparator() : user_comparator_(BytewiseComparator()) {}
    explicit SetComparator(const Comparator* user_comparator)
        : user_comparator_(user_comparator ? user_comparator
                                           : BytewiseComparator()) {}
    bool operator()(const Slice& lhs, const Slice& rhs) const {
      return user_comparator_->Compare(lhs, rhs) < 0;
    }

   private:
    const Comparator* user_comparator_;
  };

  using CFKeys = std::set<Slice, SetComparator>;
  // Keys seen so far in the current sub-batch, per column family id.
  std::map<uint32_t, CFKeys> keys_;

  // (Re)creates the key set of `cf` so that it is ordered by the comparator
  // of that column family.
  void InitWithComp(const uint32_t cf) {
    auto handle = db_->GetColumnFamilyHandle(cf);
    if (handle == nullptr) {
      // This class is consulted only after seeking to the column family has
      // failed, so the handle may not be available.
      // NOTE(review): presumably this is the dropped-cf case -- confirm.
      // Do not dereference a null handle; the set then keeps (or is later
      // default-constructed with) the bytewise SetComparator.
      return;
    }
    keys_[cf] = CFKeys(SetComparator(handle->GetComparator()));
  }
};
} // anonymous namespace
class MemTableInserter : public WriteBatch::Handler {
SequenceNumber sequence_;
@ -1008,6 +1062,7 @@ class MemTableInserter : public WriteBatch::Handler {
bool seq_per_batch_;
// Whether the memtable write will be done only after the commit
bool write_after_commit_;
DuplicateDetector duplicate_detector_;
MemPostInfoMap& GetPostMap() {
assert(concurrent_memtable_writes_);
@ -1045,7 +1100,8 @@ class MemTableInserter : public WriteBatch::Handler {
// Write after commit currently uses one seq per key (instead of per
// batch). So seq_per_batch being false indicates write_after_commit
// approach.
write_after_commit_(!seq_per_batch) {
write_after_commit_(!seq_per_batch),
duplicate_detector_(db_) {
assert(cf_mems_);
}
@ -1135,17 +1191,25 @@ class MemTableInserter : public WriteBatch::Handler {
Status PutCFImpl(uint32_t column_family_id, const Slice& key,
const Slice& value, ValueType value_type) {
if (rebuilding_trx_ != nullptr) {
// optimize for non-recovery mode
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
if (write_after_commit_) {
return Status::OK();
}
// else insert the values to the memtable right away
}
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
MaybeAdvanceSeq();
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
bool batch_boundry = false;
if (rebuilding_trx_ != nullptr) {
assert(!write_after_commit_);
// The CF is probabely flushed and hence no need for insert but we still
// need to keep track of the keys for upcoming rollback/commit.
WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
batch_boundry = duplicate_detector_.IsDuplicateKeySeq(column_family_id,
key, sequence_);
}
MaybeAdvanceSeq(batch_boundry);
return seek_status;
}
Status ret_status;
@ -1215,6 +1279,13 @@ class MemTableInserter : public WriteBatch::Handler {
}
}
}
// optimize for non-recovery mode
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
assert(!write_after_commit_);
// If the ret_status is TryAgain then let the next try to add the ky to
// the the rebuilding transaction object.
WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value);
}
// Since all Puts are logged in trasaction logs (if enabled), always bump
// sequence number. Even if the update eventually fails and does not result
// in memtable add/update.
@ -1248,57 +1319,102 @@ class MemTableInserter : public WriteBatch::Handler {
// Handler callback for a Delete of `key` in column family `column_family_id`.
virtual Status DeleteCF(uint32_t column_family_id,
const Slice& key) override {
if (rebuilding_trx_ != nullptr) {
// optimize for non-recovery mode
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
if (write_after_commit_) {
return Status::OK();
}
// else insert the values to the memtable right away
}
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
MaybeAdvanceSeq();
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
bool batch_boundry = false;
if (rebuilding_trx_ != nullptr) {
assert(!write_after_commit_);
// The CF is probably flushed and hence there is no need for an insert,
// but we still need to keep track of the keys for the upcoming
// rollback/commit.
WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
batch_boundry = duplicate_detector_.IsDuplicateKeySeq(column_family_id,
key, sequence_);
}
MaybeAdvanceSeq(batch_boundry);
return seek_status;
}
return DeleteImpl(column_family_id, key, Slice(), kTypeDeletion);
auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion);
// optimize for non-recovery mode
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
assert(!write_after_commit_);
// If the ret_status is TryAgain then let the next try add the key to
// the rebuilding transaction object.
WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
}
return ret_status;
}
// Handler callback for a SingleDelete of `key` in column family
// `column_family_id`.
virtual Status SingleDeleteCF(uint32_t column_family_id,
const Slice& key) override {
if (rebuilding_trx_ != nullptr) {
// optimize for non-recovery mode
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
if (write_after_commit_) {
return Status::OK();
}
// else insert the values to the memtable right away
}
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
MaybeAdvanceSeq();
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
bool batch_boundry = false;
if (rebuilding_trx_ != nullptr) {
assert(!write_after_commit_);
// The CF is probably flushed and hence there is no need for an insert,
// but we still need to keep track of the keys for the upcoming
// rollback/commit.
WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
key);
batch_boundry = duplicate_detector_.IsDuplicateKeySeq(column_family_id,
key, sequence_);
}
MaybeAdvanceSeq(batch_boundry);
return seek_status;
}
return DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion);
auto ret_status =
DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion);
// optimize for non-recovery mode
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
assert(!write_after_commit_);
// If the ret_status is TryAgain then let the next try add the key to
// the rebuilding transaction object.
WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key);
}
return ret_status;
}
virtual Status DeleteRangeCF(uint32_t column_family_id,
const Slice& begin_key,
const Slice& end_key) override {
if (rebuilding_trx_ != nullptr) {
// optimize for non-recovery mode
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
begin_key, end_key);
if (write_after_commit_) {
return Status::OK();
}
// else insert the values to the memtable right away
}
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
MaybeAdvanceSeq();
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
bool batch_boundry = false;
if (rebuilding_trx_ != nullptr) {
assert(!write_after_commit_);
// The CF is probabely flushed and hence no need for insert but we still
// need to keep track of the keys for upcoming rollback/commit.
WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
begin_key, end_key);
// TODO(myabandeh): when transctional DeleteRange support is added,
// check if end_key must also be added.
batch_boundry = duplicate_detector_.IsDuplicateKeySeq(
column_family_id, begin_key, sequence_);
}
MaybeAdvanceSeq(batch_boundry);
return seek_status;
}
if (db_ != nullptr) {
@ -1315,23 +1431,42 @@ class MemTableInserter : public WriteBatch::Handler {
}
}
return DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion);
auto ret_status =
DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion);
// optimize for non-recovery mode
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
assert(!write_after_commit_);
// If the ret_status is TryAgain then let the next try to add the ky to
// the the rebuilding transaction object.
WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
begin_key, end_key);
}
return ret_status;
}
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
assert(!concurrent_memtable_writes_);
if (rebuilding_trx_ != nullptr) {
// optimize for non-recovery mode
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
if (write_after_commit_) {
return Status::OK();
}
// else insert the values to the memtable right away
}
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
MaybeAdvanceSeq();
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) {
bool batch_boundry = false;
if (rebuilding_trx_ != nullptr) {
assert(!write_after_commit_);
// The CF is probabely flushed and hence no need for insert but we still
// need to keep track of the keys for upcoming rollback/commit.
WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
value);
batch_boundry = duplicate_detector_.IsDuplicateKeySeq(column_family_id,
key, sequence_);
}
MaybeAdvanceSeq(batch_boundry);
return seek_status;
}
@ -1412,6 +1547,13 @@ class MemTableInserter : public WriteBatch::Handler {
}
}
// optimize for non-recovery mode
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
assert(!write_after_commit_);
// If the ret_status is TryAgain then let the next try to add the ky to
// the the rebuilding transaction object.
WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value);
}
MaybeAdvanceSeq();
CheckMemtableFull();
return ret_status;
@ -1466,8 +1608,13 @@ class MemTableInserter : public WriteBatch::Handler {
if (recovering_log_number_ != 0) {
assert(db_->allow_2pc());
size_t batch_cnt =
write_after_commit_
? 0 // 0 will disable further checks
: static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
rebuilding_trx_, rebuilding_trx_seq_);
rebuilding_trx_, rebuilding_trx_seq_,
batch_cnt);
rebuilding_trx_ = nullptr;
} else {
assert(rebuilding_trx_ == nullptr);

View File

@ -227,6 +227,7 @@ class WriteBatchWithIndex : public WriteBatchBase {
void SetMaxBytes(size_t max_bytes) override;
private:
friend class PessimisticTransactionDB;
friend class WritePreparedTxn;
friend class WriteBatchWithIndex_SubBatchCnt_Test;
// Returns the number of sub-batches inside the write batch. A sub-batch

View File

@ -142,6 +142,11 @@ Status PessimisticTransactionDB::Initialize(
}
s = real_trx->RebuildFromWriteBatch(recovered_trx->batch_);
// WriteCommitted set this to to disable this check that is specific to
// WritePrepared txns
assert(recovered_trx->batch_cnt_ == 0 ||
real_trx->GetWriteBatch()->SubBatchCnt() ==
recovered_trx->batch_cnt_);
real_trx->SetState(Transaction::PREPARED);
if (!s.ok()) {
break;

View File

@ -135,6 +135,7 @@ class PessimisticTransactionDB : public TransactionDB {
friend class WritePreparedTxnDB;
friend class WritePreparedTxnDBMock;
friend class TransactionTest_DoubleEmptyWrite_Test;
friend class TransactionTest_DuplicateKeys_Test;
friend class TransactionTest_PersistentTwoPhaseTransactionTest_Test;
friend class TransactionTest_TwoPhaseLongPrepareTest_Test;
friend class TransactionTest_TwoPhaseDoubleRecoveryTest_Test;

View File

@ -5051,6 +5051,36 @@ TEST_P(TransactionTest, Optimizations) {
}
}
// A comparator that uses only the first three bytes
class ThreeBytewiseComparator : public Comparator {
public:
ThreeBytewiseComparator() {}
virtual const char* Name() const override {
return "test.ThreeBytewiseComparator";
}
virtual int Compare(const Slice& a, const Slice& b) const override {
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
return na.compare(nb);
}
virtual bool Equal(const Slice& a, const Slice& b) const override {
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
return na == nb;
}
// This methods below dont seem relevant to this test. Implement them if
// proven othersize.
void FindShortestSeparator(std::string* start,
const Slice& limit) const override {
const Comparator* bytewise_comp = BytewiseComparator();
bytewise_comp->FindShortestSeparator(start, limit);
}
void FindShortSuccessor(std::string* key) const override {
const Comparator* bytewise_comp = BytewiseComparator();
bytewise_comp->FindShortSuccessor(key);
}
};
// Test that the transactional db can handle duplicate keys in the write batch
TEST_P(TransactionTest, DuplicateKeys) {
ColumnFamilyOptions cf_options;
@ -5090,35 +5120,6 @@ TEST_P(TransactionTest, DuplicateKeys) {
// Test with non-bytewise comparator
{
// A comparator that uses only the first three bytes
class ThreeBytewiseComparator : public Comparator {
public:
ThreeBytewiseComparator() {}
virtual const char* Name() const override {
return "test.ThreeBytewiseComparator";
}
virtual int Compare(const Slice& a, const Slice& b) const override {
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
return na.compare(nb);
}
virtual bool Equal(const Slice& a, const Slice& b) const override {
Slice na = Slice(a.data(), a.size() < 3 ? a.size() : 3);
Slice nb = Slice(b.data(), b.size() < 3 ? b.size() : 3);
return na == nb;
}
// This methods below dont seem relevant to this test. Implement them if
// proven othersize.
void FindShortestSeparator(std::string* start,
const Slice& limit) const override {
const Comparator* bytewise_comp = BytewiseComparator();
bytewise_comp->FindShortestSeparator(start, limit);
}
void FindShortSuccessor(std::string* key) const override {
const Comparator* bytewise_comp = BytewiseComparator();
bytewise_comp->FindShortSuccessor(key);
}
};
ReOpen();
std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
cf_options.comparator = comp_gc.get();
@ -5128,6 +5129,8 @@ TEST_P(TransactionTest, DuplicateKeys) {
batch.Put(cf_handle, Slice("key"), Slice("value"));
// The first three bytes are the same, do it must be counted as duplicate
batch.Put(cf_handle, Slice("key2"), Slice("value2"));
// check for 2nd duplicate key in cf with non-default comparator
batch.Put(cf_handle, Slice("key2b"), Slice("value2b"));
ASSERT_OK(db->Write(write_options, &batch));
// The value must be the most recent value for all the keys equal to "key",
@ -5135,7 +5138,7 @@ TEST_P(TransactionTest, DuplicateKeys) {
ReadOptions ropt;
PinnableSlice pinnable_val;
ASSERT_OK(db->Get(ropt, cf_handle, "key", &pinnable_val));
ASSERT_TRUE(pinnable_val == ("value2"));
ASSERT_TRUE(pinnable_val == ("value2b"));
// Test duplicate keys with rollback
TransactionOptions txn_options;
@ -5145,7 +5148,7 @@ TEST_P(TransactionTest, DuplicateKeys) {
ASSERT_OK(txn0->Merge(cf_handle, Slice("key4"), Slice("value4")));
ASSERT_OK(txn0->Rollback());
ASSERT_OK(db->Get(ropt, cf_handle, "key5", &pinnable_val));
ASSERT_TRUE(pinnable_val == ("value2"));
ASSERT_TRUE(pinnable_val == ("value2b"));
delete txn0;
delete cf_handle;
@ -5321,6 +5324,212 @@ TEST_P(TransactionTest, DuplicateKeys) {
ASSERT_OK(txn0->Commit());
delete txn0;
}
// Test sucessfull recovery after a crash
{
ReOpen();
TransactionOptions txn_options;
WriteOptions write_options;
ReadOptions ropt;
Transaction* txn0;
PinnableSlice pinnable_val;
Status s;
std::unique_ptr<const Comparator> comp_gc(new ThreeBytewiseComparator());
cf_options.comparator = comp_gc.get();
ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle));
delete cf_handle;
std::vector<ColumnFamilyDescriptor> cfds{
ColumnFamilyDescriptor(kDefaultColumnFamilyName,
ColumnFamilyOptions(options)),
ColumnFamilyDescriptor(cf_name, cf_options),
};
std::vector<ColumnFamilyHandle*> handles;
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
ASSERT_OK(db->Put(write_options, "foo0", "init"));
ASSERT_OK(db->Put(write_options, "foo1", "init"));
ASSERT_OK(db->Put(write_options, handles[1], "foo0", "init"));
ASSERT_OK(db->Put(write_options, handles[1], "foo1", "init"));
// one entry
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0a")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar0a"));
// two entries, no duplicate
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(handles[1], Slice("foo0"), Slice("bar0b")));
ASSERT_OK(txn0->Put(handles[1], Slice("fol1"), Slice("bar1b")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0b")));
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1b")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
// Flush only cf 1
reinterpret_cast<DBImpl*>(db->GetRootDB())
->TEST_FlushMemTable(true, handles[1]);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar0b"));
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar1b"));
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "foo0", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar0b"));
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "fol1", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar1b"));
// one duplicate with ::Put
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0c")));
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey1"), Slice("bar1d")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0c")));
ASSERT_OK(txn0->Put(Slice("foo1"), Slice("bar1c")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0d")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
// Flush only cf 1
reinterpret_cast<DBImpl*>(db->GetRootDB())
->TEST_FlushMemTable(true, handles[1]);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar0d"));
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo1", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar1c"));
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar1d"));
// Duplicate with ::Put, ::Delete
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0e")));
ASSERT_OK(txn0->Delete(handles[1], Slice("key-nonkey1")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
ASSERT_OK(txn0->Delete(Slice("foo0")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
// Flush only cf 1
reinterpret_cast<DBImpl*>(db->GetRootDB())
->TEST_FlushMemTable(true, handles[1]);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_TRUE(s.IsNotFound());
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
ASSERT_TRUE(s.IsNotFound());
// Duplicate with ::Put, ::SingleDelete
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar0g")));
ASSERT_OK(txn0->SingleDelete(handles[1], Slice("key-nonkey1")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0e")));
ASSERT_OK(txn0->SingleDelete(Slice("foo0")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
// Flush only cf 1
reinterpret_cast<DBImpl*>(db->GetRootDB())
->TEST_FlushMemTable(true, handles[1]);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_TRUE(s.IsNotFound());
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
ASSERT_TRUE(s.IsNotFound());
// Duplicate with ::Put, ::Merge
txn0 = db->BeginTransaction(write_options, txn_options);
ASSERT_OK(txn0->SetName("xid"));
ASSERT_OK(txn0->Put(handles[1], Slice("key-nonkey0"), Slice("bar1i")));
ASSERT_OK(txn0->Merge(handles[1], Slice("key-nonkey1"), Slice("bar1j")));
ASSERT_OK(txn0->Put(Slice("foo0"), Slice("bar0f")));
ASSERT_OK(txn0->Merge(Slice("foo0"), Slice("bar0g")));
ASSERT_OK(txn0->Prepare());
delete txn0;
// This will check the asserts inside recovery code
db->FlushWAL(true);
// Flush only cf 1
reinterpret_cast<DBImpl*>(db->GetRootDB())
->TEST_FlushMemTable(true, handles[1]);
reinterpret_cast<PessimisticTransactionDB*>(db)->TEST_Crash();
ASSERT_OK(ReOpenNoDelete(cfds, &handles));
txn0 = db->GetTransactionByName("xid");
ASSERT_TRUE(txn0 != nullptr);
ASSERT_OK(txn0->Commit());
delete txn0;
pinnable_val.Reset();
s = db->Get(ropt, db->DefaultColumnFamily(), "foo0", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar0f,bar0g"));
pinnable_val.Reset();
s = db->Get(ropt, handles[1], "key-nonkey2", &pinnable_val);
ASSERT_OK(s);
ASSERT_TRUE(pinnable_val == ("bar1i,bar1j"));
for (auto h : handles) {
delete h;
}
delete db;
db = nullptr;
}
}
} // namespace rocksdb

View File

@ -101,6 +101,27 @@ class TransactionTestBase : public ::testing::Test {
return s;
}
// Closes the current db and opens it again with the column families in `cfs`,
// without destroying the database first. The handles currently stored in
// `*handles` are deleted and replaced by the handles of the reopened db.
// Before reopening, the env is asked to assert that no file is open, to drop
// unsynced file data and to reset its state.
Status ReOpenNoDelete(std::vector<ColumnFamilyDescriptor>& cfs,
                      std::vector<ColumnFamilyHandle*>* handles) {
  for (ColumnFamilyHandle* handle : *handles) {
    delete handle;
  }
  handles->clear();

  delete db;
  db = nullptr;

  env->AssertNoOpenFile();
  env->DropUnsyncedFileData();
  env->ResetState();

  if (use_stackable_db_) {
    return OpenWithStackableDB(cfs, handles);
  }
  return TransactionDB::Open(options, txn_db_options, dbname, cfs, handles,
                             &db);
}
Status ReOpen() {
delete db;
DestroyDB(dbname, options);
@ -113,6 +134,24 @@ class TransactionTestBase : public ::testing::Test {
return s;
}
// Opens the db with the column families in `cfs` as a DBImpl wrapped in a
// StackableDB, and then wraps that into a transaction db stored in `db`.
// The handles of the opened column families are returned through `handles`.
// Sequence-per-batch mode is requested when the write policy is
// WRITE_PREPARED.
Status OpenWithStackableDB(std::vector<ColumnFamilyDescriptor>& cfs,
                           std::vector<ColumnFamilyHandle*>* handles) {
  std::vector<size_t> compaction_enabled_cf_indices;
  TransactionDB::PrepareWrap(&options, &cfs, &compaction_enabled_cf_indices);

  // Copy the options only after PrepareWrap had the chance to adjust them.
  Options open_options(options);
  const bool seq_per_batch = txn_db_options.write_policy == WRITE_PREPARED;

  DB* base_db;
  Status open_status = DBImpl::Open(open_options, dbname, cfs, handles,
                                    &base_db, seq_per_batch);
  if (!open_status.ok()) {
    return open_status;
  }
  return TransactionDB::WrapStackableDB(new StackableDB(base_db),
                                        txn_db_options,
                                        compaction_enabled_cf_indices,
                                        *handles, &db);
}
Status OpenWithStackableDB() {
std::vector<size_t> compaction_enabled_cf_indices;
std::vector<ColumnFamilyDescriptor> column_families{ColumnFamilyDescriptor(

View File

@ -475,7 +475,8 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
CommitEntry64b evicted_64b;
CommitEntry evicted;
bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted);
if (to_be_evicted) {
if (LIKELY(to_be_evicted)) {
assert(evicted.prep_seq != prepare_seq);
auto prev_max = max_evicted_seq_.load(std::memory_order_acquire);
ROCKS_LOG_DETAILS(info_log_,
"Evicting %" PRIu64 ",%" PRIu64 " with max %" PRIu64,
@ -491,7 +492,11 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq, uint64_t commit_seq,
}
bool succ =
ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq});
if (!succ) {
if (UNLIKELY(!succ)) {
ROCKS_LOG_ERROR(info_log_,
"ExchangeCommitEntry failed on [%" PRIu64 "] %" PRIu64
",%" PRIu64 " retrying...",
indexed_seq, prepare_seq, commit_seq);
// A very rare event, in which the commit entry is updated before we do.
// Here we apply a very simple solution of retrying.
if (loop_cnt > 100) {
@ -783,16 +788,21 @@ WritePreparedTxnDB::~WritePreparedTxnDB() {
db_impl_->CancelAllBackgroundWork(true /*wait*/);
}
// Replaces the key set of column family `cf` with an empty set that is
// ordered by the comparator registered for `cf` in comparators_.
void SubBatchCounter::InitWithComp(const uint32_t cf) {
  keys_[cf] = CFKeys(SetComparator(comparators_[cf]));
}
// Records `key` for column family `cf`. If the key set of `cf` already holds
// an equal key, the sub-batch counter is incremented, the keys of all column
// families are dropped, and `key` becomes the first key of the new sub-batch.
void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) {
CFKeys& cf_keys = keys_[cf];
if (cf_keys.size() == 0) { // just inserted
auto cmp = comparators_[cf];
keys_[cf] = CFKeys(SetComparator(cmp));
InitWithComp(cf);
}
auto it = cf_keys.insert(key);
if (it.second == false) { // second is false if an element already existed.
batches_++;
keys_.clear();
// keys_ was just cleared, so set up the set of `cf` with its comparator
// again before inserting the key.
InitWithComp(cf);
keys_[cf].insert(key);
}
}

View File

@ -541,6 +541,7 @@ struct SubBatchCounter : public WriteBatch::Handler {
size_t batches_;
size_t BatchCount() { return batches_; }
void AddKey(const uint32_t cf, const Slice& key);
void InitWithComp(const uint32_t cf);
Status MarkNoop(bool) override { return Status::OK(); }
Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
Status MarkCommit(const Slice&) override { return Status::OK(); }