From 812dbfb483e0014ccd2f2e7e593e1a9ee0ed73af Mon Sep 17 00:00:00 2001 From: Islam AbdelRahman Date: Tue, 14 Jun 2016 12:27:46 -0700 Subject: [PATCH] Optimize BlockIter::Prev() by caching decoded entries Summary: Right now the way we do BlockIter::Prev() is like this - Go to the beginning of the restart interval - Keep moving forward (and decoding keys using ParseNextKey()) until we reach the desired key This can be optimized by caching the decoded entries in the first pass and reusing them in consecutive BlockIter::Prev() calls Before caching ``` DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readreverse" --db="/dev/shm/bench_prev_opt/" --use_existing_db --disable_auto_compactions DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.413 micros/op 2423972 ops/sec; 268.2 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.414 micros/op 2413867 ops/sec; 267.0 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.410 micros/op 2440881 ops/sec; 270.0 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.414 micros/op 2417298 ops/sec; 267.4 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.413 micros/op 2421682 ops/sec; 267.9 MB/s ``` After caching ``` DEBUG_LEVEL=0 make db_bench -j64 && ./db_bench --benchmarks="readreverse" --db="/dev/shm/bench_prev_opt/" --use_existing_db --disable_auto_compactions DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.324 micros/op 3088955 ops/sec; 341.7 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.335 micros/op 2980999 ops/sec; 329.8 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.341 micros/op 2929681 ops/sec; 324.1 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.344 micros/op 2908490 ops/sec; 321.8 MB/s DB path: [/dev/shm/bench_prev_opt/] readreverse : 0.338 micros/op 2958404 ops/sec; 327.3 MB/s ``` Test Plan: COMPILE_WITH_ASAN=1 make check -j64 Reviewers: andrewkr, yiwu, sdong Reviewed By: sdong Subscribers: andrewkr, dhruba, yoshinorim Differential Revision: 
https://reviews.facebook.net/D59463 --- table/block.cc | 57 +++++++++++++++++++++++++++++++++++++++++++++++++- table/block.h | 32 ++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/table/block.cc b/table/block.cc index a6672d3ca..8b08ebbc6 100644 --- a/table/block.cc +++ b/table/block.cc @@ -63,6 +63,40 @@ void BlockIter::Next() { void BlockIter::Prev() { assert(Valid()); + assert(prev_entries_idx_ == -1 || + static_cast<size_t>(prev_entries_idx_) < prev_entries_.size()); + // Check if we can use cached prev_entries_ + if (prev_entries_idx_ > 0 && + prev_entries_[prev_entries_idx_].offset == current_) { + // Read cached CachedPrevEntry + prev_entries_idx_--; + const CachedPrevEntry& current_prev_entry = + prev_entries_[prev_entries_idx_]; + + const char* key_ptr = current_prev_entry.key_ptr; + if (current_prev_entry.key_ptr != nullptr) { + // The key is not delta encoded and stored in the data block + key_ptr = current_prev_entry.key_ptr; + key_pinned_ = true; + } else { + // The key is delta encoded and stored in prev_entries_keys_buff_ + key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; + key_pinned_ = false; + } + const Slice current_key(key_ptr, current_prev_entry.key_size); + + current_ = current_prev_entry.offset; + key_.SetKey(current_key, false /* copy */); + value_ = current_prev_entry.value; + + return; + } + + // Clear prev entries cache + prev_entries_idx_ = -1; + prev_entries_.clear(); + prev_entries_keys_buff_.clear(); + // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { @@ -76,9 +110,28 @@ } SeekToRestartPoint(restart_index_); + do { + if (!ParseNextKey()) { + break; + } + Slice current_key = key(); + + if (key_.IsKeyPinned()) { + // The key is not delta encoded + prev_entries_.emplace_back(current_, current_key.data(), 0, + current_key.size(), value()); + } else { + // The key 
is delta encoded, cache decoded key in buffer + size_t new_key_offset = prev_entries_keys_buff_.size(); + prev_entries_keys_buff_.append(current_key.data(), current_key.size()); + + prev_entries_.emplace_back(current_, nullptr, new_key_offset, + current_key.size(), value()); + } // Loop until end of current entry hits the start of original entry - } while (ParseNextKey() && NextEntryOffset() < original); + } while (NextEntryOffset() < original); + prev_entries_idx_ = prev_entries_.size() - 1; } void BlockIter::Seek(const Slice& target) { @@ -155,9 +208,11 @@ bool BlockIter::ParseNextKey() { // If this key dont share any bytes with prev key then we dont need // to decode it and can use it's address in the block directly. key_.SetKey(Slice(p, non_shared), false /* copy */); + key_pinned_ = true; } else { // This key share `shared` bytes with prev key, we need to decode it key_.TrimAppend(shared, p, non_shared); + key_pinned_ = false; } value_ = Slice(p + non_shared, value_length); while (restart_index_ + 1 < num_restarts_ && diff --git a/table/block.h b/table/block.h index 200be753c..033b27ba8 100644 --- a/table/block.h +++ b/table/block.h @@ -10,6 +10,8 @@ #pragma once #include <stddef.h> #include <stdint.h> +#include <string> +#include <vector> #ifdef ROCKSDB_MALLOC_USABLE_SIZE #include <malloc.h> #endif @@ -96,7 +98,8 @@ class BlockIter : public InternalIterator { current_(0), restart_index_(0), status_(Status::OK()), - prefix_index_(nullptr) {} + prefix_index_(nullptr), + key_pinned_(false) {} BlockIter(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts, BlockPrefixIndex* prefix_index) @@ -157,7 +160,7 @@ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; #endif - virtual bool IsKeyPinned() const override { return key_.IsKeyPinned(); } + virtual bool IsKeyPinned() const override { return key_pinned_; } private: const Comparator* comparator_; @@ -172,6 +175,31 @@ Slice value_; Status 
status_; BlockPrefixIndex* prefix_index_; + bool key_pinned_; + + struct CachedPrevEntry { + explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, + size_t _key_offset, size_t _key_size, Slice _value) + : offset(_offset), + key_ptr(_key_ptr), + key_offset(_key_offset), + key_size(_key_size), + value(_value) {} + + // offset of entry in block + uint32_t offset; + // Pointer to key data in block (nullptr if key is delta-encoded) + const char* key_ptr; + // offset of key in prev_entries_keys_buff_ (0 if key_ptr is not nullptr) + size_t key_offset; + // size of key + size_t key_size; + // value slice pointing to data in block + Slice value; + }; + std::string prev_entries_keys_buff_; + std::vector<CachedPrevEntry> prev_entries_; + int32_t prev_entries_idx_ = -1; inline int Compare(const Slice& a, const Slice& b) const { return comparator_->Compare(a, b);