WritePrepared Txn: Lock-free CommitMap

Summary:
We had two proposals for a lock-free commit map. This patch implements the simpler of the two; we can experiment with both proposals later.

In this implementation each entry is a std::atomic<uint64_t>, accessed with memory_order_acquire/release. On x86_64 these atomic accesses compile to plain loads and stores.
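As an illustration of the approach (a minimal sketch with hypothetical names, not the exact RocksDB types), the commit cache reduces to an array of std::atomic<uint64_t> slots that are read with an acquire load and updated with exchange or compare-and-swap:

#include <atomic>
#include <cstdint>
#include <memory>

// Sketch of a lock-free commit cache: one 64-bit atomic per slot.
class CommitCacheSketch {
 public:
  explicit CommitCacheSketch(size_t size)
      : size_(size), slots_(new std::atomic<uint64_t>[size]{}) {}

  // On x86_64 an acquire load compiles to a plain MOV.
  uint64_t Get(uint64_t seq) const {
    return slots_[seq % size_].load(std::memory_order_acquire);
  }

  // Unconditionally replace the slot; returns the evicted value.
  uint64_t Put(uint64_t seq, uint64_t encoded) {
    return slots_[seq % size_].exchange(encoded, std::memory_order_acq_rel);
  }

  // Replace only if the slot still holds `expected`; on failure `expected`
  // is refreshed with the current value, mirroring ExchangeCommitEntry below.
  bool Swap(uint64_t seq, uint64_t& expected, uint64_t encoded) {
    return slots_[seq % size_].compare_exchange_strong(
        expected, encoded, std::memory_order_acq_rel,
        std::memory_order_acquire);
  }

 private:
  const size_t size_;
  std::unique_ptr<std::atomic<uint64_t>[]> slots_;
};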
Closes https://github.com/facebook/rocksdb/pull/2861

Differential Revision: D5800724

Pulled By: maysamyabandeh

fbshipit-source-id: 41abae9a4a5df050a8eb696c43de11c2770afdda
Author: Maysam Yabandeh, 2017-09-13 11:56:27 -07:00 (committed by Facebook Github Bot)
Parent: 72e4190918
Commit: 09713a64b3
7 changed files with 212 additions and 84 deletions


@@ -125,7 +125,7 @@ WriteCommittedTxn::WriteCommittedTxn(TransactionDB* txn_db,
                                      const TransactionOptions& txn_options)
     : PessimisticTransaction(txn_db, write_options, txn_options){};
 
-Status WriteCommittedTxn::CommitBatch(WriteBatch* batch) {
+Status PessimisticTransaction::CommitBatch(WriteBatch* batch) {
   TransactionKeyMap keys_to_unlock;
   Status s = LockBatch(batch, &keys_to_unlock);


@@ -52,7 +52,7 @@ class PessimisticTransaction : public TransactionBaseImpl {
   // It is basically Commit without going through Prepare phase. The write batch
   // is also directly provided instead of expecting txn to gradually batch the
   // transactions writes to an internal write batch.
-  virtual Status CommitBatch(WriteBatch* batch) = 0;
+  Status CommitBatch(WriteBatch* batch);
 
   Status Rollback() override = 0;
@@ -191,8 +191,6 @@ class WriteCommittedTxn : public PessimisticTransaction {
   virtual ~WriteCommittedTxn() {}
 
-  Status CommitBatch(WriteBatch* batch) override;
-
   Status Rollback() override;
 
  private:


@@ -541,8 +541,9 @@ bool WritePreparedTxnDB::IsInSnapshot(uint64_t prep_seq,
     }
   }
   auto indexed_seq = prep_seq % COMMIT_CACHE_SIZE;
+  CommitEntry64b dont_care;
   CommitEntry cached;
-  bool exist = GetCommitEntry(indexed_seq, &cached);
+  bool exist = GetCommitEntry(indexed_seq, &dont_care, &cached);
   if (!exist) {
     // It is not committed, so it must be still prepared
     return false;
@@ -599,8 +600,9 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq,
   ROCKS_LOG_DEBUG(info_log_, "Txn %" PRIu64 " Committing with %" PRIu64,
                   prepare_seq, commit_seq);
   auto indexed_seq = prepare_seq % COMMIT_CACHE_SIZE;
+  CommitEntry64b evicted_64b;
   CommitEntry evicted;
-  bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted);
+  bool to_be_evicted = GetCommitEntry(indexed_seq, &evicted_64b, &evicted);
   if (to_be_evicted) {
     auto prev_max = max_evicted_seq_.load(std::memory_order_acquire);
     if (prev_max < evicted.commit_seq) {
@@ -613,7 +615,7 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq,
     CheckAgainstSnapshots(evicted);
   }
   bool succ =
-      ExchangeCommitEntry(indexed_seq, evicted, {prepare_seq, commit_seq});
+      ExchangeCommitEntry(indexed_seq, evicted_64b, {prepare_seq, commit_seq});
   if (!succ) {
     // A very rare event, in which the commit entry is updated before we do.
     // Here we apply a very simple solution of retrying.
@@ -636,34 +638,32 @@ void WritePreparedTxnDB::AddCommitted(uint64_t prepare_seq,
 }
 
 bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq,
+                                        CommitEntry64b* entry_64b,
                                         CommitEntry* entry) {
-  // TODO(myabandeh): implement lock-free commit_cache_
-  ReadLock rl(&commit_cache_mutex_);
-  *entry = commit_cache_[indexed_seq];
-  return (entry->commit_seq != 0);  // initialized
+  *entry_64b = commit_cache_[indexed_seq].load(std::memory_order_acquire);
+  bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT);
+  return valid;
 }
 
 bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq,
                                         const CommitEntry& new_entry,
                                         CommitEntry* evicted_entry) {
-  // TODO(myabandeh): implement lock-free commit_cache_
-  WriteLock wl(&commit_cache_mutex_);
-  *evicted_entry = commit_cache_[indexed_seq];
-  commit_cache_[indexed_seq] = new_entry;
-  return (evicted_entry->commit_seq != 0);  // initialized
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  CommitEntry64b evicted_entry_64b = commit_cache_[indexed_seq].exchange(
+      new_entry_64b, std::memory_order_acq_rel);
+  bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT);
+  return valid;
 }
 
 bool WritePreparedTxnDB::ExchangeCommitEntry(const uint64_t indexed_seq,
-                                             const CommitEntry& expected_entry,
+                                             CommitEntry64b& expected_entry_64b,
                                              const CommitEntry& new_entry) {
-  // TODO(myabandeh): implement lock-free commit_cache_
-  WriteLock wl(&commit_cache_mutex_);
-  auto& evicted_entry = commit_cache_[indexed_seq];
-  if (evicted_entry.prep_seq != expected_entry.prep_seq) {
-    return false;
-  }
-  commit_cache_[indexed_seq] = new_entry;
-  return true;
+  auto& atomic_entry = commit_cache_[indexed_seq];
+  CommitEntry64b new_entry_64b(new_entry, FORMAT);
+  bool succ = atomic_entry.compare_exchange_strong(
+      expected_entry_64b, new_entry_64b, std::memory_order_acq_rel,
+      std::memory_order_acquire);
+  return succ;
 }
 
 void WritePreparedTxnDB::AdvanceMaxEvictedSeq(SequenceNumber& prev_max,
@@ -700,10 +700,9 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(SequenceNumber& prev_max,
   if (update_snapshots) {
     UpdateSnapshots(snapshots, new_snapshots_version);
   }
-  // TODO(myabandeh): check if it worked with relaxed ordering
   while (prev_max < new_max && !max_evicted_seq_.compare_exchange_weak(
-                                   prev_max, new_max, std::memory_order_release,
-                                   std::memory_order_acquire)) {
+                                   prev_max, new_max, std::memory_order_acq_rel,
+                                   std::memory_order_relaxed)) {
   };
 }
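The rewritten loop above is effectively a monotonic fetch-max: new_max is installed only while it still exceeds the value observed in prev_max, and a failed compare_exchange_weak reloads prev_max with the current stored value. A generic sketch of the same pattern, with nothing RocksDB-specific assumed:

#include <atomic>
#include <cstdint>

// Advance `target` to new_max unless another thread has already pushed it
// past new_max. On CAS failure prev_max is refreshed with the current value,
// so the loop also terminates when the stored maximum overtakes new_max.
// (The real code takes prev_max by reference; by value is enough here.)
void AdvanceToMax(std::atomic<uint64_t>& target, uint64_t prev_max,
                  uint64_t new_max) {
  while (prev_max < new_max &&
         !target.compare_exchange_weak(prev_max, new_max,
                                       std::memory_order_acq_rel,
                                       std::memory_order_relaxed)) {
  }
}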


@@ -105,16 +105,6 @@ class PessimisticTransactionDB : public TransactionDB {
   std::vector<DeadlockPath> GetDeadlockInfoBuffer() override;
   void SetDeadlockInfoBufferSize(uint32_t target_size) override;
 
-  struct CommitEntry {
-    uint64_t prep_seq;
-    uint64_t commit_seq;
-    CommitEntry() : prep_seq(0), commit_seq(0) {}
-    CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
-    bool operator==(const CommitEntry& rhs) const {
-      return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
-    }
-  };
-
  protected:
   void ReinitializeTransaction(
       Transaction* txn, const WriteOptions& write_options,
@@ -170,21 +160,27 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
  public:
   explicit WritePreparedTxnDB(
       DB* db, const TransactionDBOptions& txn_db_options,
-      size_t snapshot_cache_size = DEF_SNAPSHOT_CACHE_SIZE,
-      size_t commit_cache_size = DEF_COMMIT_CACHE_SIZE)
+      size_t snapshot_cache_bits = DEF_SNAPSHOT_CACHE_BITS,
+      size_t commit_cache_bits = DEF_COMMIT_CACHE_BITS)
       : PessimisticTransactionDB(db, txn_db_options),
-        SNAPSHOT_CACHE_SIZE(snapshot_cache_size),
-        COMMIT_CACHE_SIZE(commit_cache_size) {
+        SNAPSHOT_CACHE_BITS(snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
     init(txn_db_options);
   }
 
   explicit WritePreparedTxnDB(
       StackableDB* db, const TransactionDBOptions& txn_db_options,
-      size_t snapshot_cache_size = DEF_SNAPSHOT_CACHE_SIZE,
-      size_t commit_cache_size = DEF_COMMIT_CACHE_SIZE)
+      size_t snapshot_cache_bits = DEF_SNAPSHOT_CACHE_BITS,
+      size_t commit_cache_bits = DEF_COMMIT_CACHE_BITS)
       : PessimisticTransactionDB(db, txn_db_options),
-        SNAPSHOT_CACHE_SIZE(snapshot_cache_size),
-        COMMIT_CACHE_SIZE(commit_cache_size) {
+        SNAPSHOT_CACHE_BITS(snapshot_cache_bits),
+        SNAPSHOT_CACHE_SIZE(static_cast<size_t>(1ull << SNAPSHOT_CACHE_BITS)),
+        COMMIT_CACHE_BITS(commit_cache_bits),
+        COMMIT_CACHE_SIZE(static_cast<size_t>(1ull << COMMIT_CACHE_BITS)),
+        FORMAT(COMMIT_CACHE_BITS) {
     init(txn_db_options);
   }
@@ -203,6 +199,87 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
   // commit_seq to the commit map
   void AddCommitted(uint64_t prepare_seq, uint64_t commit_seq);
 
+  struct CommitEntry {
+    uint64_t prep_seq;
+    uint64_t commit_seq;
+    CommitEntry() : prep_seq(0), commit_seq(0) {}
+    CommitEntry(uint64_t ps, uint64_t cs) : prep_seq(ps), commit_seq(cs) {}
+    bool operator==(const CommitEntry& rhs) const {
+      return prep_seq == rhs.prep_seq && commit_seq == rhs.commit_seq;
+    }
+  };
+
+  struct CommitEntry64bFormat {
+    explicit CommitEntry64bFormat(size_t index_bits)
+        : INDEX_BITS(index_bits),
+          PREP_BITS(static_cast<size_t>(64 - PAD_BITS - INDEX_BITS)),
+          COMMIT_BITS(static_cast<size_t>(64 - PREP_BITS)),
+          COMMIT_FILTER(static_cast<uint64_t>((1ull << COMMIT_BITS) - 1)) {}
+    // Number of higher bits of a sequence number that is not used. They are
+    // used to encode the value type, ...
+    const size_t PAD_BITS = static_cast<size_t>(8);
+    // Number of lower bits from prepare seq that can be skipped as they are
+    // implied by the index of the entry in the array
+    const size_t INDEX_BITS;
+    // Number of bits we use to encode the prepare seq
+    const size_t PREP_BITS;
+    // Number of bits we use to encode the commit seq.
+    const size_t COMMIT_BITS;
+    // Filter to encode/decode commit seq
+    const uint64_t COMMIT_FILTER;
+  };
+
+  // Prepare Seq (64 bits) = PAD ... PAD PREP PREP ... PREP INDEX INDEX ... INDEX
+  // Delta Seq   (64 bits) = 0 0 0 0 0 0 0 0 0 0 0 0 DELTA DELTA ... DELTA DELTA
+  // Encoded Value         = PREP PREP .... PREP PREP DELTA DELTA ... DELTA DELTA
+  // PAD: the first bits of a seq that are reserved for tagging and hence ignored
+  // PREP/INDEX: the used bits in a prepare seq number
+  // INDEX: the bits that do not have to be encoded (will be provided externally)
+  // DELTA: commit seq - prepare seq + 1
+  // The number of DELTA bits should be equal to the number of INDEX bits plus
+  // the number of PAD bits.
+  struct CommitEntry64b {
+    constexpr CommitEntry64b() noexcept : rep_(0) {}
+
+    CommitEntry64b(const CommitEntry& entry, const CommitEntry64bFormat& format)
+        : CommitEntry64b(entry.prep_seq, entry.commit_seq, format) {}
+
+    CommitEntry64b(const uint64_t ps, const uint64_t cs,
+                   const CommitEntry64bFormat& format) {
+      assert(ps < static_cast<uint64_t>(
+                      (1ull << (format.PREP_BITS + format.INDEX_BITS))));
+      assert(ps <= cs);
+      uint64_t delta = cs - ps + 1;  // make initialized delta always >= 1
+      // zero is reserved for uninitialized entries
+      assert(0 < delta);
+      assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+      rep_ = (ps << format.PAD_BITS) & ~format.COMMIT_FILTER;
+      rep_ = rep_ | delta;
+    }
+
+    // Return false if the entry is empty
+    bool Parse(const uint64_t indexed_seq, CommitEntry* entry,
+               const CommitEntry64bFormat& format) {
+      uint64_t delta = rep_ & format.COMMIT_FILTER;
+      // zero is reserved for uninitialized entries
+      assert(delta < static_cast<uint64_t>((1ull << format.COMMIT_BITS)));
+      if (delta == 0) {
+        return false;  // initialized entry would have non-zero delta
+      }
+      assert(indexed_seq < static_cast<uint64_t>((1ull << format.INDEX_BITS)));
+      uint64_t prep_up = rep_ & ~format.COMMIT_FILTER;
+      prep_up >>= format.PAD_BITS;
+      const uint64_t& prep_low = indexed_seq;
+      entry->prep_seq = prep_up | prep_low;
+      entry->commit_seq = entry->prep_seq + delta - 1;
+      return true;
+    }
+
+   private:
+    uint64_t rep_;
+  };
+
  private:
   friend class WritePreparedTransactionTest_IsInSnapshotTest_Test;
   friend class WritePreparedTransactionTest_CheckAgainstSnapshotsTest_Test;
@@ -220,8 +297,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
         std::max(SNAPSHOT_CACHE_SIZE / 100, static_cast<size_t>(1));
     snapshot_cache_ = unique_ptr<std::atomic<SequenceNumber>[]>(
         new std::atomic<SequenceNumber>[SNAPSHOT_CACHE_SIZE] {});
-    commit_cache_ =
-        unique_ptr<CommitEntry[]>(new CommitEntry[COMMIT_CACHE_SIZE]{});
+    commit_cache_ = unique_ptr<std::atomic<CommitEntry64b>[]>(
+        new std::atomic<CommitEntry64b>[COMMIT_CACHE_SIZE] {});
   }
 
   // A heap with the amortized O(1) complexity for erase. It uses one extra heap
@@ -263,7 +340,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
   // Get the commit entry with index indexed_seq from the commit table. It
   // returns true if such entry exists.
-  bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry* entry);
+  bool GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b,
+                      CommitEntry* entry);
 
   // Rewrite the entry with the index indexed_seq in the commit table with the
   // commit entry <prep_seq, commit_seq>. If the rewrite results into eviction,
@@ -275,7 +353,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
   // commit entry new_entry only if the existing entry matches the
   // expected_entry. Returns false otherwise.
   bool ExchangeCommitEntry(const uint64_t indexed_seq,
-                           const CommitEntry& expected_entry,
+                           CommitEntry64b& expected_entry,
                            const CommitEntry& new_entry);
 
   // Increase max_evicted_seq_ from the previous value prev_max to the new
@@ -325,7 +403,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
   // with snapshots_mutex_ and concurrent reads are safe due to std::atomic for
   // each entry. In x86_64 architecture such reads are compiled to simple read
   // instructions. 128 entries
-  static const size_t DEF_SNAPSHOT_CACHE_SIZE = static_cast<size_t>(1 << 7);
+  static const size_t DEF_SNAPSHOT_CACHE_BITS = static_cast<size_t>(7);
+  const size_t SNAPSHOT_CACHE_BITS;
   const size_t SNAPSHOT_CACHE_SIZE;
   unique_ptr<std::atomic<SequenceNumber>[]> snapshot_cache_;
   // 2nd list for storing snapshots. The list sorted in ascending order.
@@ -339,11 +418,13 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
   // prepared_mutex_.
   PreparedHeap prepared_txns_;
   // 10m entry, 80MB size
-  static const size_t DEF_COMMIT_CACHE_SIZE = static_cast<size_t>(1 << 21);
+  static const size_t DEF_COMMIT_CACHE_BITS = static_cast<size_t>(21);
+  const size_t COMMIT_CACHE_BITS;
   const size_t COMMIT_CACHE_SIZE;
+  const CommitEntry64bFormat FORMAT;
   // commit_cache_ must be initialized to zero to tell apart an empty index from
   // a filled one. Thread-safety is provided with commit_cache_mutex_.
-  unique_ptr<CommitEntry[]> commit_cache_;
+  unique_ptr<std::atomic<CommitEntry64b>[]> commit_cache_;
   // The largest evicted *commit* sequence number from the commit_cache_
   std::atomic<uint64_t> max_evicted_seq_ = {};
   // Advance max_evicted_seq_ by this value each time it needs an update. The
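To make the CommitEntry64b packing above concrete, here is a standalone round trip with the default COMMIT_CACHE_BITS = 21 (so PAD_BITS = 8, PREP_BITS = 35, COMMIT_BITS = 29); the sequence numbers are illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t kIndexBits = 21;                         // COMMIT_CACHE_BITS
  const uint64_t kPadBits = 8;
  const uint64_t kPrepBits = 64 - kPadBits - kIndexBits;  // 35
  const uint64_t kCommitBits = 64 - kPrepBits;            // 29 = PAD + INDEX
  const uint64_t kCommitFilter = (1ull << kCommitBits) - 1;

  uint64_t ps = 5000000, cs = 5000123;  // example prepare/commit seq pair
  uint64_t delta = cs - ps + 1;         // 124; zero is reserved for "empty"
  // Encode: upper bits carry the prepare seq, lower 29 bits carry the delta.
  uint64_t rep = ((ps << kPadBits) & ~kCommitFilter) | delta;

  // Decode, given the slot index the entry was stored at.
  uint64_t indexed_seq = ps % (1ull << kIndexBits);
  uint64_t prep = ((rep & ~kCommitFilter) >> kPadBits) | indexed_seq;
  uint64_t commit = prep + (rep & kCommitFilter) - 1;
  assert(prep == ps && commit == cs);  // round trip recovers both seqs
  return 0;
}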


@@ -40,7 +40,9 @@ using std::string;
 
 namespace rocksdb {
 
-using CommitEntry = PessimisticTransactionDB::CommitEntry;
+using CommitEntry = WritePreparedTxnDB::CommitEntry;
+using CommitEntry64b = WritePreparedTxnDB::CommitEntry64b;
+using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat;
 
 TEST(PreparedHeap, BasicsTest) {
   WritePreparedTxnDB::PreparedHeap heap;
@@ -106,6 +108,49 @@ TEST(PreparedHeap, BasicsTest) {
   ASSERT_TRUE(heap.empty());
 }
 
+TEST(CommitEntry64b, BasicTest) {
+  const size_t INDEX_BITS = static_cast<size_t>(21);
+  const size_t INDEX_SIZE = static_cast<size_t>(1ull << INDEX_BITS);
+  const CommitEntry64bFormat FORMAT(static_cast<size_t>(INDEX_BITS));
+
+  // A zero-initialized CommitEntry64b should indicate an empty entry
+  CommitEntry64b empty_entry64b;
+  uint64_t empty_index = 11ul;
+  CommitEntry empty_entry;
+  bool ok = empty_entry64b.Parse(empty_index, &empty_entry, FORMAT);
+  ASSERT_FALSE(ok);
+
+  // The zero entry is reserved for uninitialized entries
+  const size_t MAX_COMMIT = (1 << FORMAT.COMMIT_BITS) - 1 - 1;
+  // Samples over the numbers that are covered by that many index bits
+  std::array<uint64_t, 4> is = {{0, 1, INDEX_SIZE / 2 + 1, INDEX_SIZE - 1}};
+  // Samples over the numbers that are covered by that many commit bits
+  std::array<uint64_t, 4> ds = {{0, 1, MAX_COMMIT / 2 + 1, MAX_COMMIT}};
+  // Iterate over prepare numbers that i) cover all bits of a sequence
+  // number, and ii) include some bits that fall into the range of index or
+  // commit bits
+  for (uint64_t base = 1; base < kMaxSequenceNumber; base *= 2) {
+    for (uint64_t i : is) {
+      for (uint64_t d : ds) {
+        uint64_t p = base + i + d;
+        for (uint64_t c : {p, p + d / 2, p + d}) {
+          uint64_t index = p % INDEX_SIZE;
+          CommitEntry before(p, c), after;
+          CommitEntry64b entry64b(before, FORMAT);
+          ok = entry64b.Parse(index, &after, FORMAT);
+          ASSERT_TRUE(ok);
+          if (!(before == after)) {
+            printf("base %" PRIu64 " i %" PRIu64 " d %" PRIu64 " p %" PRIu64
+                   " c %" PRIu64 " index %" PRIu64 "\n",
+                   base, i, d, p, c, index);
+          }
+          ASSERT_EQ(before, after);
+        }
+      }
+    }
+  }
+}
+
 class WritePreparedTxnDBMock : public WritePreparedTxnDB {
  public:
   WritePreparedTxnDBMock(DBImpl* db_impl, TransactionDBOptions& opt)
@@ -255,33 +300,35 @@ TEST_P(WritePreparedTransactionTest, CommitMapTest) {
   ASSERT_FALSE(evicted);
 
   // Should be able to read the same value
-  bool found = wp_db->GetCommitEntry(c.prep_seq % size, &e);
+  CommitEntry64b dont_care;
+  bool found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
   ASSERT_TRUE(found);
   ASSERT_EQ(c, e);
 
   // Should be able to distinguish between overlapping entries
-  found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &e);
+  found = wp_db->GetCommitEntry((c.prep_seq + size) % size, &dont_care, &e);
   ASSERT_TRUE(found);
   ASSERT_NE(c.prep_seq + size, e.prep_seq);
 
   // Should be able to detect non-existent entry
-  found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &e);
-  ASSERT_EQ(e.commit_seq, 0);
+  found = wp_db->GetCommitEntry((c.prep_seq + 1) % size, &dont_care, &e);
   ASSERT_FALSE(found);
 
   // Reject an invalid exchange
-  CommitEntry e2 = {c.prep_seq + size, c.commit_seq};
-  bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2, e);
+  CommitEntry e2 = {c.prep_seq + size, c.commit_seq + size};
+  CommitEntry64b e2_64b(e2, wp_db->FORMAT);
+  bool exchanged = wp_db->ExchangeCommitEntry(e2.prep_seq % size, e2_64b, e);
   ASSERT_FALSE(exchanged);
 
   // check whether it did actually reject that
-  found = wp_db->GetCommitEntry(e2.prep_seq % size, &e);
+  found = wp_db->GetCommitEntry(e2.prep_seq % size, &dont_care, &e);
   ASSERT_TRUE(found);
   ASSERT_EQ(c, e);
 
   // Accept a valid exchange
+  CommitEntry64b c_64b(c, wp_db->FORMAT);
   CommitEntry e3 = {c.prep_seq + size, c.commit_seq + size + 1};
-  exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c, e3);
+  exchanged = wp_db->ExchangeCommitEntry(c.prep_seq % size, c_64b, e3);
   ASSERT_TRUE(exchanged);
 
   // check whether it did actually accept that
-  found = wp_db->GetCommitEntry(c.prep_seq % size, &e);
+  found = wp_db->GetCommitEntry(c.prep_seq % size, &dont_care, &e);
   ASSERT_TRUE(found);
   ASSERT_EQ(e3, e);
@@ -290,7 +337,7 @@ TEST_P(WritePreparedTransactionTest, CommitMapTest) {
   evicted = wp_db->AddCommitEntry(e4.prep_seq % size, e4, &e);
   ASSERT_TRUE(evicted);
   ASSERT_EQ(e3, e);
-  found = wp_db->GetCommitEntry(e4.prep_seq % size, &e);
+  found = wp_db->GetCommitEntry(e4.prep_seq % size, &dont_care, &e);
   ASSERT_TRUE(found);
   ASSERT_EQ(e4, e);
 }
@@ -333,11 +380,15 @@ TEST_P(WritePreparedTransactionTest, MaybeUpdateOldCommitMap) {
 }
 
 TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) {
-  std::vector<SequenceNumber> snapshots = {100l, 200l, 300l, 400l,
-                                           500l, 600l, 700l};
+  std::vector<SequenceNumber> snapshots = {100l, 200l, 300l, 400l, 500l,
+                                           600l, 700l, 800l, 900l};
+  const size_t snapshot_cache_bits = 2;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshots list is changed.
+  assert((1ul << snapshot_cache_bits) * 2 + 1 == snapshots.size());
   DBImpl* mock_db = new DBImpl(options, dbname);
-  std::unique_ptr<WritePreparedTxnDBMock> wp_db(new WritePreparedTxnDBMock(
-      mock_db, txn_db_options, snapshots.size() / 2));
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options, snapshot_cache_bits));
   SequenceNumber version = 1000l;
   ASSERT_EQ(0, wp_db->snapshots_total_);
   wp_db->UpdateSnapshots(snapshots, version);
@@ -345,9 +396,9 @@ TEST_P(WritePreparedTransactionTest, CheckAgainstSnapshotsTest) {
   // seq numbers are chosen so that we have two of them between each two
   // snapshots. If the diff of two consecutive seqs is more than 5, there is a
   // snapshot between them.
-  std::vector<SequenceNumber> seqs = {50l, 55l, 150l, 155l, 250l, 255l,
-                                      350l, 355l, 450l, 455l, 550l, 555l,
-                                      650l, 655l, 750l, 755l};
+  std::vector<SequenceNumber> seqs = {50l, 55l, 150l, 155l, 250l, 255l, 350l,
+                                      355l, 450l, 455l, 550l, 555l, 650l, 655l,
+                                      750l, 755l, 850l, 855l, 950l, 955l};
   assert(seqs.size() > 1);
   for (size_t i = 0; i < seqs.size() - 1; i++) {
     wp_db->old_commit_map_empty_ = true;  // reset
@@ -374,12 +425,16 @@ TEST_P(WritePreparedTransactionTest, SnapshotConcurrentAccessTest) {
   // in the methods must also be added.
   const std::vector<SequenceNumber> snapshots = {10l, 20l, 30l, 40l, 50l,
                                                  60l, 70l, 80l, 90l, 100l};
+  const size_t snapshot_cache_bits = 2;
+  // Safety check to express the intended size in the test. Can be adjusted if
+  // the snapshots list is changed.
+  assert((1ul << snapshot_cache_bits) * 2 + 2 == snapshots.size());
   SequenceNumber version = 1000l;
   // Choose the cache size so that the new snapshot list could replace all the
   // existing items in the cache and also have some overflow.
   DBImpl* mock_db = new DBImpl(options, dbname);
-  std::unique_ptr<WritePreparedTxnDBMock> wp_db(new WritePreparedTxnDBMock(
-      mock_db, txn_db_options, (snapshots.size() - 2) / 2));
+  std::unique_ptr<WritePreparedTxnDBMock> wp_db(
+      new WritePreparedTxnDBMock(mock_db, txn_db_options, snapshot_cache_bits));
   // Add up to 2 items that do not fit into the cache
   for (size_t old_size = 1; old_size <= wp_db->SNAPSHOT_CACHE_SIZE + 2;
        old_size++) {
@@ -435,6 +490,7 @@ TEST_P(WritePreparedTransactionTest, SnapshotConcurrentAccessTest) {
       }
     }
   }
+  printf("\n");
 }
 
 #endif
@@ -502,9 +558,9 @@ TEST_P(WritePreparedTransactionTest, IsInSnapshotTest) {
   WriteOptions wo;
   // Use a small commit cache to trigger lots of eviction and fast advance of
   // max_evicted_seq_
-  const size_t commit_cache_size = 8;
+  const size_t commit_cache_bits = 3;
   // Same for snapshot cache size
-  const size_t snapshot_cache_size = 5;
+  const size_t snapshot_cache_bits = 2;
 
   // Take some preliminary snapshots first. This is to stress the data structure
   // that holds the old snapshots as it will be designed to be efficient when
@@ -538,7 +594,7 @@ TEST_P(WritePreparedTransactionTest, IsInSnapshotTest) {
     std::set<uint64_t> committed_before;
     DBImpl* mock_db = new DBImpl(options, dbname);
     std::unique_ptr<WritePreparedTxnDBMock> wp_db(new WritePreparedTxnDBMock(
        mock_db, txn_db_options, snapshot_cache_bits, commit_cache_bits));
     // We continue until max advances a bit beyond the snapshot.
     while (!snapshot || wp_db->max_evicted_seq_ < snapshot + 100) {
       // do prepare for a transaction


@@ -41,12 +41,6 @@ Status WritePreparedTxn::Get(const ReadOptions& read_options,
                                      pinnable_val, &callback);
 }
 
-Status WritePreparedTxn::CommitBatch(WriteBatch* /* unused */) {
-  // TODO(myabandeh) Implement this
-  throw std::runtime_error("CommitBatch not Implemented");
-  return Status::OK();
-}
-
 Status WritePreparedTxn::PrepareInternal() {
   WriteOptions write_options = write_options_;
   write_options.disableWAL = false;
@@ -97,6 +91,8 @@ Status WritePreparedTxn::CommitInternal() {
   auto s = db_impl_->WriteImpl(write_options_, working_batch, nullptr, nullptr,
                                log_number_, disable_memtable, &seq_used);
   uint64_t& commit_seq = seq_used;
+  // TODO(myabandeh): Reject a commit request if AddCommitted cannot encode
+  // commit_seq. This happens if prep_seq <<< commit_seq.
   wpt_db_->AddCommitted(prepare_seq_, commit_seq);
   return s;
 }
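The TODO above concerns the case where delta = commit_seq - prepare_seq + 1 does not fit in COMMIT_BITS (29 bits with the default format). A hypothetical guard, not part of this patch, might look like:

#include <cstdint>

// Hypothetical pre-check for AddCommitted: with the default format,
// COMMIT_BITS = PAD_BITS + COMMIT_CACHE_BITS = 29, so only deltas below
// 2^29 are encodable in a CommitEntry64b.
bool CanEncode(uint64_t prepare_seq, uint64_t commit_seq,
               size_t commit_bits = 29) {
  uint64_t delta = commit_seq - prepare_seq + 1;
  return delta < (1ull << commit_bits);
}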


@@ -50,8 +50,6 @@ class WritePreparedTxn : public PessimisticTransaction {
                    ColumnFamilyHandle* column_family, const Slice& key,
                    PinnableSlice* value) override;
 
-  Status CommitBatch(WriteBatch* batch) override;
-
   Status Rollback() override;
 
  private: