WritePrepared Txn: add stats

Summary: Adding some stats that would be helpful to monitor if the DB has gone to unlikely stats that would hurt the performance. These are mostly when we end up needing to acquire a mutex. Closes https://github.com/facebook/rocksdb/pull/3683 Differential Revision: D7529393 Pulled By: maysamyabandeh fbshipit-source-id: f7d36279a8f39bd84d8ddbf64b5c97f670c5d6d9
2018-04-07 21:55:42 -07:00 · 2018-04-07 21:55:42 -07:00 · bde1c1a72a
commit bde1c1a72a
parent eb5a295440
6 changed files with 51 additions and 26 deletions
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -691,14 +691,20 @@ class DBImpl : public DB {
  void EraseThreadStatusDbInfo() const;

  // If disable_memtable is set the application logic must guarantee that the
-  // batch will still be skipped from memtable during the recovery. In
-  // WriteCommitted it is guarnateed since disable_memtable is used for prepare
-  // batch which will be written to memtable later during the commit, and in
-  // WritePrepared it is guaranteed since it will be used only for WAL markers
-  // which will never be written to memtable.
-  // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates
-  // the number of sub-patches. A sub-patch is a subset of the write batch that
-  // does not have duplicate keys.
+  // batch will still be skipped from memtable during the recovery. An excption
+  // to this is seq_per_batch_ mode, in which since each batch already takes one
+  // seq, it is ok for the batch to write to memtable during recovery as long as
+  // it only takes one sequence number: i.e., no duplicate keys.
+  // In WriteCommitted it is guarnateed since disable_memtable is used for
+  // prepare batch which will be written to memtable later during the commit,
+  // and in WritePrepared it is guaranteed since it will be used only for WAL
+  // markers which will never be written to memtable. If the commit marker is
+  // accompanied with CommitTimeWriteBatch that is not written to memtable as
+  // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+  // policy.
+  // batch_cnt is expected to be non-zero in seq_per_batch mode and
+  // indicates the number of sub-patches. A sub-patch is a subset of the write
+  // batch that does not have duplicate keys.
  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
                   WriteCallback* callback = nullptr,
                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -479,12 +479,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
        insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
      Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
      bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
-      if (!res) {
+      if (UNLIKELY(!res)) {
        return res;
      }
    } else {
      bool res = table->InsertKey(handle);
-      if (!res) {
+      if (UNLIKELY(!res)) {
        return res;
      }
    }
@ -520,7 +520,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type,
    UpdateFlushState();
  } else {
    bool res = table->InsertKeyConcurrently(handle);
-    if (!res) {
+    if (UNLIKELY(!res)) {
      return res;
    }

--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@ -309,6 +309,17 @@ enum Tickers : uint32_t {
  // # of bytes in the blob files evicted because of BlobDB is full.
  BLOB_DB_FIFO_BYTES_EVICTED,

+  // These coutners indicate a performance issue in WritePrepared transactions.
+  // We should not seem them ticking them much.
+  // # of times prepare_mutex_ is acquired in the fast path.
+  TXN_PREPARE_MUTEX_OVERHEAD,
+  // # of times old_commit_map_mutex_ is acquired in the fast path.
+  TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+  // # of times we checked a batch for duplicate keys.
+  TXN_DUPLICATE_KEY_OVERHEAD,
+  // # of times snapshot_mutex_ is acquired in the fast path.
+  TXN_SNAPSHOT_MUTEX_OVERHEAD,
+
  TICKER_ENUM_MAX
 };

@ -455,6 +466,11 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
    {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"},
    {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"},
    {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"},
+    {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"},
+    {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD,
+     "rocksdb.txn.overhead.mutex.old.commit.map"},
+    {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"},
+    {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"},
 };

 /**
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@ -98,9 +98,10 @@ struct TransactionOptions {
  bool deadlock_detect = false;

  // If set, it states that the CommitTimeWriteBatch represents the latest state
-  // of the application and meant to be used later during recovery. It enables
-  // an optimization to postpone updating the memtable with CommitTimeWriteBatch
-  // to only SwitchMemtable or recovery.
+  // of the application, has only one sub-batch, i.e., no duplicate keys,  and
+  // meant to be used later during recovery. It enables an optimization to
+  // postpone updating the memtable with CommitTimeWriteBatch to only
+  // SwitchMemtable or recovery.
  bool use_only_the_last_commit_time_batch_for_recovery = false;

  // TODO(agiardullo): TransactionDB does not yet support comparators that allow
--- a/utilities/transactions/write_prepared_txn_db.cc
+++ b/utilities/transactions/write_prepared_txn_db.cc
@ -131,9 +131,9 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
    auto s = batch->Iterate(&counter);
    assert(s.ok());
    batch_cnt = counter.BatchCount();
-    // TODO(myabandeh): replace me with a stat
-    ROCKS_LOG_WARN(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
-                   static_cast<uint64_t>(batch_cnt));
+    WPRecordTick(TXN_DUPLICATE_KEY_OVERHEAD);
+    ROCKS_LOG_DETAILS(info_log_, "Duplicate key overhead: %" PRIu64 " batches",
+                      static_cast<uint64_t>(batch_cnt));
  }
  assert(batch_cnt);

@ -506,7 +506,6 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
    while (!prepared_txns_.empty() && prepared_txns_.top() <= new_max) {
      auto to_be_popped = prepared_txns_.top();
      delayed_prepared_.insert(to_be_popped);
-      // TODO(myabandeh): also add a stat
      ROCKS_LOG_WARN(info_log_,
                     "prepared_mutex_ overhead %" PRIu64 " (prep=%" PRIu64
                     " new_max=%" PRIu64 " oldmax=%" PRIu64,
@ -574,14 +573,14 @@ void WritePreparedTxnDB::ReleaseSnapshotInternal(
    // old_commit_map_. Check and do garbage collection if that is the case.
    bool need_gc = false;
    {
-      // TODO(myabandeh): also add a stat
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
      ReadLock rl(&old_commit_map_mutex_);
      auto prep_set_entry = old_commit_map_.find(snap_seq);
      need_gc = prep_set_entry != old_commit_map_.end();
    }
    if (need_gc) {
-      // TODO(myabandeh): also add a stat
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
      WriteLock wl(&old_commit_map_mutex_);
      old_commit_map_.erase(snap_seq);
@ -601,8 +600,7 @@ void WritePreparedTxnDB::UpdateSnapshots(
 #ifndef NDEBUG
  size_t sync_i = 0;
 #endif
-  // TODO(myabandeh): replace me with a stat
-  ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
+  ROCKS_LOG_DETAILS(info_log_, "snapshots_mutex_ overhead");
  WriteLock wl(&snapshots_mutex_);
  snapshots_version_ = version;
  // We update the list concurrently with the readers.
@ -682,7 +680,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
  if (UNLIKELY(SNAPSHOT_CACHE_SIZE < cnt && ip1 == SNAPSHOT_CACHE_SIZE &&
               snapshot_seq < evicted.prep_seq)) {
    // Then access the less efficient list of snapshots_
-    // TODO(myabandeh): also add a stat
+    WPRecordTick(TXN_SNAPSHOT_MUTEX_OVERHEAD);
    ROCKS_LOG_WARN(info_log_, "snapshots_mutex_ overhead");
    ReadLock rl(&snapshots_mutex_);
    // Items could have moved from the snapshots_ to snapshot_cache_ before
@ -716,8 +714,8 @@ bool WritePreparedTxnDB::MaybeUpdateOldCommitMap(
  }
  // then snapshot_seq < commit_seq
  if (prep_seq <= snapshot_seq) {  // overlapping range
+    WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
    ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
-    // TODO(myabandeh): also add a stat
    WriteLock wl(&old_commit_map_mutex_);
    old_commit_map_empty_.store(false, std::memory_order_release);
    auto& vec = old_commit_map_[snapshot_seq];
--- a/utilities/transactions/write_prepared_txn_db.h
+++ b/utilities/transactions/write_prepared_txn_db.h
@ -144,8 +144,8 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
    }
    if (!delayed_prepared_empty_.load(std::memory_order_acquire)) {
      // We should not normally reach here
+      WPRecordTick(TXN_PREPARE_MUTEX_OVERHEAD);
      ReadLock rl(&prepared_mutex_);
-      // TODO(myabandeh): also add a stat
      ROCKS_LOG_WARN(info_log_, "prepared_mutex_ overhead %" PRIu64,
                     static_cast<uint64_t>(delayed_prepared_.size()));
      if (delayed_prepared_.find(prep_seq) != delayed_prepared_.end()) {
@ -216,7 +216,7 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {
      // We should not normally reach here unless sapshot_seq is old. This is a
      // rare case and it is ok to pay the cost of mutex ReadLock for such old,
      // reading transactions.
-      // TODO(myabandeh): also add a stat
+      WPRecordTick(TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD);
      ROCKS_LOG_WARN(info_log_, "old_commit_map_mutex_ overhead");
      ReadLock rl(&old_commit_map_mutex_);
      auto prep_set_entry = old_commit_map_.find(snapshot_seq);
@ -381,6 +381,10 @@ class WritePreparedTxnDB : public PessimisticTransactionDB {

  void Init(const TransactionDBOptions& /* unused */);

+  void WPRecordTick(uint32_t ticker_type) const {
+    RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type);
+  }
+
  // A heap with the amortized O(1) complexity for erase. It uses one extra heap
  // to keep track of erased entries that are not yet on top of the main heap.
  class PreparedHeap {