From 88edfd90ae296785a6ba158e0d3f1e742d6b76b7 Mon Sep 17 00:00:00 2001
From: Tomislav Novak
Date: Tue, 23 Sep 2014 15:52:28 -0700
Subject: [PATCH] SkipListRep::LookaheadIterator

Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:

   Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...

If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer comparisons than the ~log(n) needed by a
regular skip list search.

Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.

   $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
      -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
      -seekseq_next 2 -skip_list_lookahead=0
   [...]
   DB path: [/dev/shm/rocksdbtest/dbbench]
   fillseekseq : 0.389 micros/op 2569047 ops/sec;

   real 0m21.806s
   user 0m12.106s
   sys 0m9.672s

   $ time ./db_bench [...] -skip_list_lookahead=2
   [...]
DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997 --- db/db_bench.cc | 46 ++++++++++++- include/rocksdb/memtablerep.h | 11 ++++ util/skiplistrep.cc | 120 ++++++++++++++++++++++++++++++++-- 3 files changed, 168 insertions(+), 9 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index bbd807c2c..f04ab8144 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -86,7 +86,8 @@ DEFINE_string(benchmarks, "xxhash," "compress," "uncompress," - "acquireload,", + "acquireload," + "fillseekseq,", "Comma-separated list of operations to run in the specified order" "Actual benchmarks:\n" @@ -129,6 +130,8 @@ DEFINE_string(benchmarks, "\tcrc32c -- repeated crc32c of 4K of data\n" "\txxhash -- repeated xxHash of 4K of data\n" "\tacquireload -- load N*1000 times\n" + "\tfillseekseq -- write N values in sequential key, then read " + "them by seeking to each key\n" "Meta operations:\n" "\tcompact -- Compact the entire DB\n" "\tstats -- Print DB stats\n" @@ -165,6 +168,9 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." DEFINE_int32(value_size, 100, "Size of each value"); +DEFINE_int32(seekseq_next, 0, "How many times to call Next() after Seek() in " + "fillseekseq"); + DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); DEFINE_int64(batch_size, 1, "Batch size"); @@ -565,6 +571,9 @@ DEFINE_string(merge_operator, "", "The merge operator to use with the database." 
"If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" " utilities/merge_operators.h"); +DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try " + "linear search first for this many steps from the previous " + "position"); static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit); @@ -1326,6 +1335,8 @@ class Benchmark { method = &Benchmark::MergeRandom; } else if (name == Slice("randomwithverify")) { method = &Benchmark::RandomWithVerify; + } else if (name == Slice("fillseekseq")) { + method = &Benchmark::WriteSeqSeekSeq; } else if (name == Slice("compact")) { method = &Benchmark::Compact; } else if (name == Slice("crc32c")) { @@ -1717,7 +1728,8 @@ class Benchmark { FLAGS_hash_bucket_count)); break; case kSkipList: - // no need to do anything + options.memtable_factory.reset(new SkipListFactory( + FLAGS_skip_list_lookahead)); break; case kHashLinkedList: options.memtable_factory.reset(NewHashLinkListRepFactory( @@ -2791,6 +2803,36 @@ class Benchmark { thread->stats.AddMessage(msg); } + void WriteSeqSeekSeq(ThreadState* thread) { + writes_ = FLAGS_num; + DoWrite(thread, SEQUENTIAL); + // exclude writes from the ops/sec calculation + thread->stats.Start(thread->tid); + + DB* db = SelectDB(thread); + std::unique_ptr iter( + db->NewIterator(ReadOptions(FLAGS_verify_checksum, true))); + + Slice key = AllocateKey(); + for (int64_t i = 0; i < FLAGS_num; ++i) { + GenerateKeyFromInt(i, FLAGS_num, &key); + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + + for (int j = 0; j < FLAGS_seekseq_next && i+1 < FLAGS_num; ++j) { + iter->Next(); + GenerateKeyFromInt(++i, FLAGS_num, &key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + } + + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + 
thread->stats.FinishedOps(nullptr, db, 1); + } + } + void Compact(ThreadState* thread) { DB* db = SelectDB(thread); db->CompactRange(nullptr, nullptr); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index b7fc39c81..8c2d7201b 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -186,12 +186,23 @@ class MemTableRepFactory { }; // This uses a skip list to store keys. It is the default. +// +// Parameters: +// lookahead: If non-zero, each iterator's seek operation will start the +// search from the previously visited record (doing at most 'lookahead' +// steps). This is an optimization for the access pattern including many +// seeks with consecutive keys. class SkipListFactory : public MemTableRepFactory { public: + explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Arena*, const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } + + private: + const size_t lookahead_; }; #ifndef ROCKSDB_LITE diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index a3c940d0e..1322f6c9a 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -12,9 +12,16 @@ namespace rocksdb { namespace { class SkipListRep : public MemTableRep { SkipList skip_list_; + const MemTableRep::KeyComparator& cmp_; + const SliceTransform* transform_; + const size_t lookahead_; + + friend class LookaheadIterator; public: - explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) - : MemTableRep(arena), skip_list_(compare, arena) { + explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, + const SliceTransform* transform, const size_t lookahead) + : MemTableRep(arena), skip_list_(compare, arena), cmp_(compare), + transform_(transform), lookahead_(lookahead) { } // Insert key into the list. 
@@ -106,11 +113,110 @@ public:
     std::string tmp_; // For passing to EncodeKey
   };
 
+  // Iterator over the contents of a skip list which also keeps track of the
+  // previously visited node. In Seek(), it examines a few nodes after it
+  // first, falling back to O(log n) search from the head of the list only if
+  // the target key hasn't been found.
+  class LookaheadIterator : public MemTableRep::Iterator {
+   public:
+    explicit LookaheadIterator(const SkipListRep& rep) :
+        rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
+
+    virtual ~LookaheadIterator() override {}
+
+    virtual bool Valid() const override {
+      return iter_.Valid();
+    }
+
+    virtual const char *key() const override {
+      assert(Valid());
+      return iter_.key();
+    }
+
+    virtual void Next() override {
+      assert(Valid());
+
+      bool advance_prev = true;
+      if (prev_.Valid()) {
+        auto k1 = rep_.UserKey(prev_.key());
+        auto k2 = rep_.UserKey(iter_.key());
+
+        if (k1.compare(k2) == 0) {
+          // same user key, don't move prev_
+          advance_prev = false;
+        } else if (rep_.transform_) {
+          // only advance prev_ if it has the same prefix as iter_
+          auto t1 = rep_.transform_->Transform(k1);
+          auto t2 = rep_.transform_->Transform(k2);
+          advance_prev = t1.compare(t2) == 0;
+        }
+      }
+
+      if (advance_prev) {
+        prev_ = iter_;  // remember the node iter_ is about to leave
+      }
+      iter_.Next();
+    }
+
+    virtual void Prev() override {
+      assert(Valid());
+      iter_.Prev();
+      prev_ = iter_;  // moving backwards: restart tracking from the new node
+    }
+
+    virtual void Seek(const Slice& internal_key, const char *memtable_key)
+        override {
+      const char *encoded_key =
+        (memtable_key != nullptr) ?
+            memtable_key : EncodeKey(&tmp_, internal_key);
+
+      if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
+        // prev_.key() is smaller or equal to our target key; do a quick
+        // linear search (at most lookahead_ steps) starting from prev_
+        iter_ = prev_;
+
+        size_t cur = 0;
+        while (cur++ <= rep_.lookahead_ && iter_.Valid()) {  // prev_ + lookahead_ nodes
+          if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
+            return;  // iter_ is on the first key >= target
+          }
+          Next();  // also moves prev_ according to the rules in Next()
+        }
+      }
+
+      iter_.Seek(encoded_key);  // fall back to the regular skip list search
+      prev_ = iter_;
+    }
+
+    virtual void SeekToFirst() override {
+      iter_.SeekToFirst();
+      prev_ = iter_;
+    }
+
+    virtual void SeekToLast() override {
+      iter_.SeekToLast();
+      prev_ = iter_;
+    }
+
+   protected:
+    std::string tmp_; // For passing to EncodeKey
+
+   private:
+    const SkipListRep& rep_;
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator iter_;
+    SkipList<const char*, const MemTableRep::KeyComparator&>::Iterator prev_;
+  };
+
   virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
-    if (arena == nullptr) {
-      return new SkipListRep::Iterator(&skip_list_);
+    if (lookahead_ > 0) {  // lookahead enabled: hand out a LookaheadIterator
+      void *mem =
+        arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
+              : operator new(sizeof(SkipListRep::LookaheadIterator));
+      return new (mem) SkipListRep::LookaheadIterator(*this);
     } else {
-      auto mem = arena->AllocateAligned(sizeof(SkipListRep::Iterator));
+      void *mem =
+        arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
+              : operator new(sizeof(SkipListRep::Iterator));
       return new (mem) SkipListRep::Iterator(&skip_list_);
     }
   }
@@ -119,8 +225,8 @@ public:
 
 MemTableRep* SkipListFactory::CreateMemTableRep(
     const MemTableRep::KeyComparator& compare, Arena* arena,
-    const SliceTransform*, Logger* logger) {
-  return new SkipListRep(compare, arena);
+    const SliceTransform* transform, Logger* logger) {
+  return new SkipListRep(compare, arena, transform, lookahead_);
 }
 
 } // namespace rocksdb