WritePrepared: switch PreparedHeap from priority_queue to deque (#5436)
Summary: Internally PreparedHeap is currently using a priority_queue. The rationale was the in the initial design PreparedHeap::AddPrepared could be called in arbitrary order. With the recent optimizations, we call ::AddPrepared only from the main write queue, which results into in-order insertion into PreparedHeap. The patch thus replaces the underlying priority_queue with a more efficient deque implementation. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5436 Differential Revision: D15752147 Pulled By: maysamyabandeh fbshipit-source-id: e6960f2b2097e13137dded1ceeff3b10b03b0aeb
This commit is contained in:
parent
ca1aee2a19
commit
773f914a40
@ -48,18 +48,21 @@ using CommitEntry64bFormat = WritePreparedTxnDB::CommitEntry64bFormat;
|
||||
|
||||
TEST(PreparedHeap, BasicsTest) {
|
||||
WritePreparedTxnDB::PreparedHeap heap;
|
||||
heap.push(14l);
|
||||
// Test with one element
|
||||
ASSERT_EQ(14l, heap.top());
|
||||
heap.push(24l);
|
||||
heap.push(34l);
|
||||
// Test that old min is still on top
|
||||
ASSERT_EQ(14l, heap.top());
|
||||
heap.push(44l);
|
||||
heap.push(54l);
|
||||
heap.push(64l);
|
||||
heap.push(74l);
|
||||
heap.push(84l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(14l);
|
||||
// Test with one element
|
||||
ASSERT_EQ(14l, heap.top());
|
||||
heap.push(24l);
|
||||
heap.push(34l);
|
||||
// Test that old min is still on top
|
||||
ASSERT_EQ(14l, heap.top());
|
||||
heap.push(44l);
|
||||
heap.push(54l);
|
||||
heap.push(64l);
|
||||
heap.push(74l);
|
||||
heap.push(84l);
|
||||
}
|
||||
// Test that old min is still on top
|
||||
ASSERT_EQ(14l, heap.top());
|
||||
heap.erase(24l);
|
||||
@ -81,11 +84,14 @@ TEST(PreparedHeap, BasicsTest) {
|
||||
ASSERT_EQ(64l, heap.top());
|
||||
heap.erase(84l);
|
||||
ASSERT_EQ(64l, heap.top());
|
||||
heap.push(85l);
|
||||
heap.push(86l);
|
||||
heap.push(87l);
|
||||
heap.push(88l);
|
||||
heap.push(89l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(85l);
|
||||
heap.push(86l);
|
||||
heap.push(87l);
|
||||
heap.push(88l);
|
||||
heap.push(89l);
|
||||
}
|
||||
heap.erase(87l);
|
||||
heap.erase(85l);
|
||||
heap.erase(89l);
|
||||
@ -106,13 +112,19 @@ TEST(PreparedHeap, BasicsTest) {
|
||||
// not resurface again.
|
||||
TEST(PreparedHeap, EmptyAtTheEnd) {
|
||||
WritePreparedTxnDB::PreparedHeap heap;
|
||||
heap.push(40l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(40l);
|
||||
}
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
// Although not a recommended scenario, we must be resilient against erase
|
||||
// without a prior push.
|
||||
heap.erase(50l);
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
heap.push(60l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(60l);
|
||||
}
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
|
||||
heap.erase(60l);
|
||||
@ -120,11 +132,17 @@ TEST(PreparedHeap, EmptyAtTheEnd) {
|
||||
heap.erase(40l);
|
||||
ASSERT_TRUE(heap.empty());
|
||||
|
||||
heap.push(40l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(40l);
|
||||
}
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
heap.erase(50l);
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
heap.push(60l);
|
||||
{
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
heap.push(60l);
|
||||
}
|
||||
ASSERT_EQ(40l, heap.top());
|
||||
|
||||
heap.erase(40l);
|
||||
@ -139,30 +157,37 @@ TEST(PreparedHeap, EmptyAtTheEnd) {
|
||||
// successfully emptied at the end.
|
||||
TEST(PreparedHeap, Concurrent) {
|
||||
const size_t t_cnt = 10;
|
||||
rocksdb::port::Thread t[t_cnt];
|
||||
Random rnd(1103);
|
||||
rocksdb::port::Thread t[t_cnt + 1];
|
||||
WritePreparedTxnDB::PreparedHeap heap;
|
||||
port::RWMutex prepared_mutex;
|
||||
std::atomic<size_t> last;
|
||||
|
||||
for (size_t n = 0; n < 100; n++) {
|
||||
for (size_t i = 0; i < t_cnt; i++) {
|
||||
// This is not recommended usage but we should be resilient against it.
|
||||
bool skip_push = rnd.OneIn(5);
|
||||
t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, skip_push, i]() {
|
||||
auto seq = i;
|
||||
std::this_thread::yield();
|
||||
last = 0;
|
||||
t[0] = rocksdb::port::Thread([&heap, t_cnt, &last]() {
|
||||
Random rnd(1103);
|
||||
for (size_t seq = 1; seq <= t_cnt; seq++) {
|
||||
// This is not recommended usage but we should be resilient against it.
|
||||
bool skip_push = rnd.OneIn(5);
|
||||
if (!skip_push) {
|
||||
WriteLock wl(&prepared_mutex);
|
||||
MutexLock ml(heap.push_pop_mutex());
|
||||
std::this_thread::yield();
|
||||
heap.push(seq);
|
||||
last.store(seq);
|
||||
}
|
||||
std::this_thread::yield();
|
||||
{
|
||||
WriteLock wl(&prepared_mutex);
|
||||
heap.erase(seq);
|
||||
}
|
||||
}
|
||||
});
|
||||
for (size_t i = 1; i <= t_cnt; i++) {
|
||||
t[i] = rocksdb::port::Thread([&heap, &prepared_mutex, &last, i]() {
|
||||
auto seq = i;
|
||||
do {
|
||||
std::this_thread::yield();
|
||||
} while (last.load() < seq);
|
||||
WriteLock wl(&prepared_mutex);
|
||||
heap.erase(seq);
|
||||
});
|
||||
}
|
||||
for (size_t i = 0; i < t_cnt; i++) {
|
||||
for (size_t i = 0; i <= t_cnt; i++) {
|
||||
t[i].join();
|
||||
}
|
||||
ASSERT_TRUE(heap.empty());
|
||||
@ -3197,7 +3222,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
|
||||
ReOpen();
|
||||
std::atomic<const Snapshot*> snap = {nullptr};
|
||||
std::atomic<SequenceNumber> exp_prepare = {0};
|
||||
std::atomic<bool> snapshot_taken = {false};
|
||||
rocksdb::port::Thread callback_thread;
|
||||
// Value is synchronized via snap
|
||||
PinnableSlice value;
|
||||
// Take a snapshot after publish and before RemovePrepared:Start
|
||||
@ -3208,7 +3233,6 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
|
||||
roptions.snapshot = snap.load();
|
||||
auto s = db->Get(roptions, db->DefaultColumnFamily(), "key", &value);
|
||||
ASSERT_OK(s);
|
||||
snapshot_taken.store(true);
|
||||
};
|
||||
auto callback = [&](void* param) {
|
||||
SequenceNumber prep_seq = *((SequenceNumber*)param);
|
||||
@ -3216,8 +3240,7 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
|
||||
// We need to spawn a thread to avoid deadlock since getting a
|
||||
// snpashot might end up calling AdvanceSeqByOne which needs joining
|
||||
// the write queue.
|
||||
auto t = rocksdb::port::Thread(snap_callback);
|
||||
t.detach();
|
||||
callback_thread = rocksdb::port::Thread(snap_callback);
|
||||
TEST_SYNC_POINT("callback:end");
|
||||
}
|
||||
};
|
||||
@ -3250,15 +3273,12 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
|
||||
// Let an eviction to kick in
|
||||
std::this_thread::yield();
|
||||
|
||||
snapshot_taken.store(false);
|
||||
exp_prepare.store(txn->GetId());
|
||||
ASSERT_OK(txn->Commit());
|
||||
delete txn;
|
||||
// Wait for the snapshot taking that is triggered by
|
||||
// RemovePrepared:Start callback
|
||||
while (!snapshot_taken) {
|
||||
std::this_thread::yield();
|
||||
}
|
||||
callback_thread.join();
|
||||
|
||||
// Read with the snapshot taken before delayed_prepared_ cleanup
|
||||
ReadOptions roptions;
|
||||
@ -3278,9 +3298,9 @@ TEST_P(WritePreparedTransactionTest, CommitOfDelayedPrepared) {
|
||||
});
|
||||
write_thread.join();
|
||||
eviction_thread.join();
|
||||
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
}
|
||||
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
|
||||
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -32,12 +32,19 @@ Status WritePreparedTxnDB::Initialize(
|
||||
auto dbimpl = reinterpret_cast<DBImpl*>(GetRootDB());
|
||||
assert(dbimpl != nullptr);
|
||||
auto rtxns = dbimpl->recovered_transactions();
|
||||
std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
|
||||
for (auto rtxn : rtxns) {
|
||||
// There should only one batch for WritePrepared policy.
|
||||
assert(rtxn.second->batches_.size() == 1);
|
||||
const auto& seq = rtxn.second->batches_.begin()->first;
|
||||
const auto& batch_info = rtxn.second->batches_.begin()->second;
|
||||
auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
|
||||
ordered_seq_cnt[seq] = cnt;
|
||||
}
|
||||
// AddPrepared must be called in order
|
||||
for (auto seq_cnt: ordered_seq_cnt) {
|
||||
auto seq = seq_cnt.first;
|
||||
auto cnt = seq_cnt.second;
|
||||
for (size_t i = 0; i < cnt; i++) {
|
||||
AddPrepared(seq + i);
|
||||
}
|
||||
|
@ -511,9 +511,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
// The mutex is required for push and pop from PreparedHeap. ::erase will
|
||||
// use external synchronization via prepared_mutex_.
|
||||
port::Mutex push_pop_mutex_;
|
||||
// TODO(myabandeh): replace it with deque
|
||||
std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
|
||||
heap_;
|
||||
std::deque<uint64_t> heap_;
|
||||
std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
|
||||
erased_heap_;
|
||||
std::atomic<uint64_t> heap_top_ = {kMaxSequenceNumber};
|
||||
@ -534,21 +532,27 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
// Returns kMaxSequenceNumber if empty() and the smallest otherwise.
|
||||
inline uint64_t top() { return heap_top_.load(std::memory_order_acquire); }
|
||||
inline void push(uint64_t v) {
|
||||
heap_.push(v);
|
||||
heap_top_.store(heap_.top(), std::memory_order_release);
|
||||
push_pop_mutex_.AssertHeld();
|
||||
if (heap_.empty()) {
|
||||
heap_top_.store(v, std::memory_order_release);
|
||||
} else {
|
||||
assert(heap_top_.load() < v);
|
||||
}
|
||||
heap_.push_back(v);
|
||||
}
|
||||
void pop(bool locked = false) {
|
||||
if (!locked) {
|
||||
push_pop_mutex()->Lock();
|
||||
}
|
||||
heap_.pop();
|
||||
push_pop_mutex_.AssertHeld();
|
||||
heap_.pop_front();
|
||||
while (!heap_.empty() && !erased_heap_.empty() &&
|
||||
// heap_.top() > erased_heap_.top() could happen if we have erased
|
||||
// a non-existent entry. Ideally the user should not do that but we
|
||||
// should be resilient against it.
|
||||
heap_.top() >= erased_heap_.top()) {
|
||||
if (heap_.top() == erased_heap_.top()) {
|
||||
heap_.pop();
|
||||
heap_.front() >= erased_heap_.top()) {
|
||||
if (heap_.front() == erased_heap_.top()) {
|
||||
heap_.pop_front();
|
||||
}
|
||||
uint64_t erased __attribute__((__unused__));
|
||||
erased = erased_heap_.top();
|
||||
@ -559,7 +563,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
while (heap_.empty() && !erased_heap_.empty()) {
|
||||
erased_heap_.pop();
|
||||
}
|
||||
heap_top_.store(!heap_.empty() ? heap_.top() : kMaxSequenceNumber,
|
||||
heap_top_.store(!heap_.empty() ? heap_.front() : kMaxSequenceNumber,
|
||||
std::memory_order_release);
|
||||
if (!locked) {
|
||||
push_pop_mutex()->Unlock();
|
||||
@ -568,13 +572,16 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
|
||||
// Concurrrent calls needs external synchronization. It is safe to be called
|
||||
// concurrent to push and pop though.
|
||||
void erase(uint64_t seq) {
|
||||
if (!heap_.empty()) {
|
||||
if (!empty()) {
|
||||
auto top_seq = top();
|
||||
if (seq < top_seq) {
|
||||
// Already popped, ignore it.
|
||||
} else if (top_seq == seq) {
|
||||
pop();
|
||||
assert(heap_.empty() || heap_.top() != seq);
|
||||
#ifndef NDEBUG
|
||||
MutexLock ml(push_pop_mutex());
|
||||
assert(heap_.empty() || heap_.front() != seq);
|
||||
#endif
|
||||
} else { // top() > seq
|
||||
// Down the heap, remember to pop it later
|
||||
erased_heap_.push(seq);
|
||||
|
@ -225,6 +225,7 @@ Status WriteUnpreparedTxnDB::Initialize(
|
||||
|
||||
// create 'real' transactions from recovered shell transactions
|
||||
auto rtxns = dbimpl->recovered_transactions();
|
||||
std::map<SequenceNumber, SequenceNumber> ordered_seq_cnt;
|
||||
for (auto rtxn : rtxns) {
|
||||
auto recovered_trx = rtxn.second;
|
||||
assert(recovered_trx);
|
||||
@ -266,9 +267,7 @@ Status WriteUnpreparedTxnDB::Initialize(
|
||||
auto cnt = batch_info.batch_cnt_ ? batch_info.batch_cnt_ : 1;
|
||||
assert(batch_info.log_number_);
|
||||
|
||||
for (size_t i = 0; i < cnt; i++) {
|
||||
AddPrepared(seq + i);
|
||||
}
|
||||
ordered_seq_cnt[seq] = cnt;
|
||||
assert(wupt->unprep_seqs_.count(seq) == 0);
|
||||
wupt->unprep_seqs_[seq] = cnt;
|
||||
KeySetBuilder keyset_handler(wupt,
|
||||
@ -288,6 +287,14 @@ Status WriteUnpreparedTxnDB::Initialize(
|
||||
break;
|
||||
}
|
||||
}
|
||||
// AddPrepared must be called in order
|
||||
for (auto seq_cnt: ordered_seq_cnt) {
|
||||
auto seq = seq_cnt.first;
|
||||
auto cnt = seq_cnt.second;
|
||||
for (size_t i = 0; i < cnt; i++) {
|
||||
AddPrepared(seq + i);
|
||||
}
|
||||
}
|
||||
|
||||
SequenceNumber prev_max = max_evicted_seq_;
|
||||
SequenceNumber last_seq = db_impl_->GetLatestSequenceNumber();
|
||||
|
Loading…
Reference in New Issue
Block a user