From dfb6fe6755bfce3b73d572f15ab9fa081bc15270 Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Tue, 22 Nov 2016 14:06:54 -0800 Subject: [PATCH] Unified InlineSkipList::Insert algorithm with hinting Summary: This PR is based on nbronson's diff with small modifications to wire it up with existing interface. Comparing to previous version, this approach works better for inserting keys in decreasing order or updating the same key, and impose less restriction to the prefix extractor. ---- Summary from original diff ---- This diff introduces a single InlineSkipList::Insert that unifies the existing sequential insert optimization (prev_), concurrent insertion, and insertion using externally-managed insertion point hints. There's a deep symmetry between insertion hints (cursors) and the concurrent algorithm. In both cases we have partial information from the recent past that is likely but not certain to be accurate. This diff introduces the struct InlineSkipList::Splice, which encodes predecessor and successor information in the same form that was previously only used within a single call to InsertConcurrently. Splice holds information about an insertion point that can be used to levera Closes https://github.com/facebook/rocksdb/pull/1561 Differential Revision: D4217283 Pulled By: yiwu-arbug fbshipit-source-id: 33ee437 --- db/column_family.cc | 5 - db/db_properties_test.cc | 4 +- db/inlineskiplist.h | 552 ++++++++++++++++------------------ db/inlineskiplist_test.cc | 11 +- db/memtable.cc | 14 +- include/rocksdb/memtablerep.h | 9 +- include/rocksdb/options.h | 26 +- memtable/skiplistrep.cc | 4 +- 8 files changed, 294 insertions(+), 331 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 80417b71b..b4fe6c181 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -275,11 +275,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.max_compaction_bytes = result.target_file_size_base * 25; } - // Insert into memtable with hint is incompatible with concurrent inserts. - if (db_options.allow_concurrent_memtable_write) { - result.memtable_insert_with_hint_prefix_extractor = nullptr; - } - return result; } diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index ff6b8c320..4604e4991 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -559,9 +559,9 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) { ASSERT_EQ(num, "3"); ASSERT_TRUE(dbfull()->GetIntProperty( handles_[1], "rocksdb.cur-size-active-mem-table", &value)); - // "384" is the size of the metadata of two empty skiplists, this would + // "192" is the size of the metadata of two empty skiplists, this would // break if we change the default skiplist implementation - ASSERT_GE(value, 384); + ASSERT_GE(value, 192); uint64_t int_num; uint64_t base_total_size; diff --git a/db/inlineskiplist.h b/db/inlineskiplist.h index c31aa4aa5..43bb09ac8 100644 --- a/db/inlineskiplist.h +++ b/db/inlineskiplist.h @@ -54,13 +54,13 @@ namespace rocksdb { template class InlineSkipList { - public: - struct InsertHint; - private: struct Node; + struct Splice; public: + static const uint16_t kMaxPossibleHeight = 32; + // Create a new InlineSkipList object that will use "cmp" for comparing // keys, and will allocate memory using "*allocator". Objects allocated // in the allocator must remain allocated for the lifetime of the @@ -74,29 +74,49 @@ class InlineSkipList { // is thread-safe. char* AllocateKey(size_t key_size); + // Allocate a splice using allocator. + Splice* AllocateSplice(); + // Inserts a key allocated by AllocateKey, after the actual key value // has been filled in. // // REQUIRES: nothing that compares equal to key is currently in the list. - // REQUIRES: no concurrent calls to INSERT + // REQUIRES: no concurrent calls to any of inserts. void Insert(const char* key); - // Inserts a key allocated by AllocateKey with a hint. It can be used to - // optimize sequential inserts, or inserting a key close to the largest - // key inserted previously with the same hint. + // Inserts a key allocated by AllocateKey with a hint of last insert + // position in the skip-list. If hint points to nullptr, a new hint will be + // populated, which can be used in subsequent calls. // - // If hint points to nullptr, a new hint will be populated, which can be - // used in subsequent calls. + // It can be used to optimize the workload where there are multiple groups + // of keys, and each key is likely to insert to a location close to the last + // inserted key in the same group. One example is sequential inserts. // - // REQUIRES: All keys inserted with the same hint must be consecutive in the - // skip-list, i.e. let [k1..k2] be the range of keys inserted with hint h, - // there shouldn't be a key k in the skip-list with k1 < k < k2, unless k is - // also inserted with the same hint. - void InsertWithHint(const char* key, InsertHint** hint); + // REQUIRES: nothing that compares equal to key is currently in the list. + // REQUIRES: no concurrent calls to any of inserts. + void InsertWithHint(const char* key, void** hint); // Like Insert, but external synchronization is not required. void InsertConcurrently(const char* key); + // Inserts a node into the skip list. key must have been allocated by + // AllocateKey and then filled in by the caller. If UseCAS is true, + // then external synchronization is not required, otherwise this method + // may not be called concurrently with any other insertions. + // + // Regardless of whether UseCAS is true, the splice must be owned + // exclusively by the current thread. If allow_partial_splice_fix is + // true, then the cost of insertion is amortized O(log D), where D is + // the distance from the splice to the inserted key (measured as the + // number of intervening nodes). Note that this bound is very good for + // sequential insertions! If allow_partial_splice_fix is false then + // the existing splice will be ignored unless the current key is being + // inserted immediately after the splice. allow_partial_splice_fix == + // false has worse running time for the non-sequential case O(log N), + // but a better constant factor. + template + void Insert(const char* key, Splice* splice, bool allow_partial_splice_fix); + // Returns true iff an entry that compares equal to key is in the list. bool Contains(const char* key) const; @@ -154,8 +174,6 @@ class InlineSkipList { }; private: - static const uint16_t kMaxPossibleHeight = 32; - const uint16_t kMaxHeight_; const uint16_t kBranching_; const uint32_t kScaledInverseBranching_; @@ -170,13 +188,10 @@ class InlineSkipList { // values are ok. std::atomic max_height_; // Height of the entire list - // Used for optimizing sequential insert patterns. Tricky. prev_height_ - // of zero means prev_ is undefined. Otherwise: prev_[i] for i up - // to max_height_ - 1 (inclusive) is the predecessor of prev_[0], and - // prev_height_ is the height of prev_[0]. prev_[0] can only be equal - // to head when max_height_ and prev_height_ are both 1. - Node** prev_; - std::atomic prev_height_; + // seq_splice_ is a Splice used for insertions in the non-concurrent + // case. It caches the prev and next found during the most recent + // non-concurrent insertion. + Splice* seq_splice_; inline int GetMaxHeight() const { return max_height_.load(std::memory_order_relaxed); @@ -186,13 +201,6 @@ class InlineSkipList { Node* AllocateNode(size_t key_size, int height); - // Allocate a hint used by InsertWithHint(). - InsertHint* AllocateInsertHint(); - - // Extract the node from a key allocated by AllocateKey(), and populate - // height of the node. - Node* GetNodeForInsert(const char* key, int* height); - bool Equal(const char* a, const char* b) const { return (compare_(a, b) == 0); } @@ -202,7 +210,7 @@ class InlineSkipList { } // Return true if key is greater than the data stored in "n". Null n - // is considered infinite. + // is considered infinite. n should not be head_. bool KeyIsAfterNode(const char* key, Node* n) const; // Returns the earliest node with a key >= key. @@ -232,12 +240,13 @@ class InlineSkipList { // point to a node that is before the key, and after should point to // a node that is after the key. after should be nullptr if a good after // node isn't conveniently available. - void FindLevelSplice(const char* key, Node* before, Node* after, int level, - Node** out_prev, Node** out_next); + void FindSpliceForLevel(const char* key, Node* before, Node* after, int level, + Node** out_prev, Node** out_next); - // Check if we need to invalidate prev_ cache after inserting a node of - // given height. - void MaybeInvalidatePrev(int height); + // Recomputes Splice levels from highest_level (inclusive) down to + // lowest_level (inclusive). + void RecomputeSpliceLevels(const char* key, Splice* splice, + int recompute_level); // No copying allowed InlineSkipList(const InlineSkipList&); @@ -246,6 +255,19 @@ class InlineSkipList { // Implementation details follow +template +struct InlineSkipList::Splice { + // The invariant of a Splice is that prev_[i+1].key <= prev_[i].key < + // next_[i].key <= next_[i+1].key for all i. That means that if a + // key is bracketed by prev_[i] and next_[i] then it is bracketed by + // all higher levels. It is _not_ required that prev_[i]->Next(i) == + // next_[i] (it probably did at some point in the past, but intervening + // or concurrent operations might have inserted nodes in between). + int height_ = 0; + Node** prev_; + Node** next_; +}; + // The Node data type is more of a pointer into custom-managed memory than // a traditional C++ struct. The key is stored in the bytes immediately // after the struct, and the next_ pointers for nodes with height > 1 are @@ -317,17 +339,6 @@ struct InlineSkipList::Node { std::atomic next_[1]; }; -// -// -// Hint to insert position to speed-up inserts. See implementation of -// InsertWithHint() for more details. -template -struct InlineSkipList::InsertHint { - Node** prev; - uint8_t* prev_height; - int num_levels; -}; - template inline InlineSkipList::Iterator::Iterator( const InlineSkipList* list) { @@ -419,6 +430,7 @@ template bool InlineSkipList::KeyIsAfterNode(const char* key, Node* n) const { // nullptr n is considered infinite + assert(n != head_); return (n != nullptr) && (compare_(n->Key(), key) < 0); } @@ -549,19 +561,14 @@ InlineSkipList::InlineSkipList(const Comparator cmp, allocator_(allocator), head_(AllocateNode(0, max_height)), max_height_(1), - prev_height_(1) { + seq_splice_(AllocateSplice()) { assert(max_height > 0 && kMaxHeight_ == static_cast(max_height)); assert(branching_factor > 1 && kBranching_ == static_cast(branching_factor)); assert(kScaledInverseBranching_ > 0); - // Allocate the prev_ Node* array, directly from the passed-in allocator. - // prev_ does not need to be freed, as its life cycle is tied up with - // the allocator as a whole. - prev_ = reinterpret_cast( - allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); - for (int i = 0; i < kMaxHeight_; i++) { + + for (int i = 0; i < kMaxHeight_; ++i) { head_->SetNext(i, nullptr); - prev_[i] = head_; } } @@ -595,226 +602,49 @@ InlineSkipList::AllocateNode(size_t key_size, int height) { } template -typename InlineSkipList::InsertHint* -InlineSkipList::AllocateInsertHint() { - InsertHint* hint = reinterpret_cast( - allocator_->AllocateAligned(sizeof(InsertHint))); - // Allocate an extra level on kMaxHeight_, to make boundary cases easier to - // handle. - hint->prev = reinterpret_cast( - allocator_->AllocateAligned(sizeof(Node*) * (kMaxHeight_ + 1))); - hint->prev_height = reinterpret_cast( - allocator_->AllocateAligned(sizeof(uint8_t*) * kMaxHeight_)); - for (int i = 0; i <= kMaxHeight_; i++) { - hint->prev[i] = head_; - } - hint->num_levels = 0; - return hint; -} - -template -typename InlineSkipList::Node* -InlineSkipList::GetNodeForInsert(const char* key, int* height) { - // Find the Node that we placed before the key in AllocateKey - Node* x = reinterpret_cast(const_cast(key)) - 1; - assert(height != nullptr); - *height = x->UnstashHeight(); - assert(*height >= 1 && *height <= kMaxHeight_); - - if (*height > GetMaxHeight()) { - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (nullptr), or a new value set in - // the loop below. In the former case the reader will - // immediately drop to the next level since nullptr sorts after all - // keys. In the latter case the reader will use the new node. - max_height_.store(*height, std::memory_order_relaxed); - } - - return x; -} - -template -void InlineSkipList::MaybeInvalidatePrev(int height) { - // We don't have a lock-free algorithm for updating prev_, but we do have - // the option of invalidating the entire sequential-insertion cache. - // prev_'s invariant is that prev_[i] (i > 0) is the predecessor of - // prev_[0] at that level. We're only going to violate that if height - // > 1 and key lands after prev_[height - 1] but before prev_[0]. - // Comparisons are pretty expensive, so an easier version is to just - // clear the cache if height > 1. We only write to prev_height_ if the - // nobody else has, to avoid invalidating the root of the skip list in - // all of the other CPU caches. - if (height > 1 && prev_height_.load(std::memory_order_relaxed) != 0) { - prev_height_.store(0, std::memory_order_relaxed); - } +typename InlineSkipList::Splice* +InlineSkipList::AllocateSplice() { + // size of prev_ and next_ + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = allocator_->AllocateAligned(sizeof(Splice) + array_size * 2); + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; } template void InlineSkipList::Insert(const char* key) { - // InsertConcurrently often can't maintain the prev_ invariants, so - // it just sets prev_height_ to zero, letting us know that we should - // ignore it. A relaxed load suffices here because write thread - // synchronization separates Insert calls from InsertConcurrently calls. - auto prev_height = prev_height_.load(std::memory_order_relaxed); - - // fast path for sequential insertion - if (prev_height > 0 && !KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) && - (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) { - assert(prev_[0] != head_ || (prev_height == 1 && GetMaxHeight() == 1)); - - // Outside of this method prev_[1..max_height_] is the predecessor - // of prev_[0], and prev_height_ refers to prev_[0]. Inside Insert - // prev_[0..max_height - 1] is the predecessor of key. Switch from - // the external state to the internal - for (int i = 1; i < prev_height; i++) { - prev_[i] = prev_[0]; - } - } else { - // TODO(opt): we could use a NoBarrier predecessor search as an - // optimization for architectures where memory_order_acquire needs - // a synchronization instruction. Doesn't matter on x86 - FindLessThan(key, prev_); - } - - // Our data structure does not allow duplicate insertion - assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->Key())); - - int height = 0; - Node* x = GetNodeForInsert(key, &height); - - for (int i = 0; i < height; i++) { - x->InsertAfter(prev_[i], i); - } - prev_[0] = x; - prev_height_.store(height, std::memory_order_relaxed); -} - -// The goal here is to reduce the number of key comparisons, as it can be -// expensive. We maintain a hint which help us to find a insert position -// between or next to previously inserted keys with the same hint. -// Note that we require all keys inserted with the same hint are consecutive -// in the skip-list. -// -// The hint keeps a list of nodes previous inserted with the same hint: -// * The first level, prev[0], points to the largest key of them. -// * For 0 < i < num_levels, prev[i] is the previous node of prev[i-1] -// on level i, i.e. -// prev[i] < prev[i-1] <= prev[i]->Next(i) -// (prev[i-1] and prev[i]->Next(i) could be the same node.) -// In addition prev_height keeps the height of prev[i]. -// -// When inserting a new key, we look for the lowest level L where -// prev[L] < key < prev[L-1]. Let -// M = max(prev_height[i]..prev_height[num_levels-1]) -// For each level between in [L, M), the previous node of -// the new key must be one of prev[i]. For levels below L and above M -// we do normal skip-list search if needed. -// -// The optimization is suitable for stream of keys where new inserts are next -// to or close to the largest key ever inserted, e.g. sequential inserts. -template -void InlineSkipList::InsertWithHint(const char* key, - InsertHint** hint_ptr) { - int height = 0; - Node* x = GetNodeForInsert(key, &height); - - // InsertWithHint() is not compatible with prev_ optimization used by - // Insert(). - MaybeInvalidatePrev(height); - - assert(hint_ptr != nullptr); - InsertHint* hint = *hint_ptr; - if (hint == nullptr) { - // AllocateInsertHint will initialize hint with num_levels = 0 and - // prev[i] = head_ for all i. - hint = AllocateInsertHint(); - *hint_ptr = hint; - } - - // Look for the first level i < num_levels with prev[i] < key. - int level = 0; - for (; level < hint->num_levels; level++) { - if (KeyIsAfterNode(key, hint->prev[level])) { - assert(!KeyIsAfterNode(key, hint->prev[level]->Next(level))); - break; - } - } - Node* tmp_prev[kMaxPossibleHeight]; - if (level >= hint->num_levels) { - // The hint is not useful in this case. Fallback to full search. - FindLessThan(key, tmp_prev); - for (int i = 0; i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } else { - // Search on levels below "level", using prev[level] as root. - if (level > 0) { - FindLessThan(key, tmp_prev, hint->prev[level], level, 0); - for (int i = 0; i < level && i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } - // The current level where the new node is to insert into skip-list. - int current_level = level; - for (int i = level; i < hint->num_levels; i++) { - while (current_level < height && current_level < hint->prev_height[i]) { - // In this case, prev[i] is the previous node of key on current_level, - // since: - // * prev[i] < key; - // * no other nodes less than prev[level-1] has height greater than - // current_level, and prev[level-1] > key. - assert(KeyIsAfterNode(key, hint->prev[i])); - assert(!KeyIsAfterNode(key, hint->prev[i]->Next(current_level))); - x->InsertAfter(hint->prev[i], current_level); - current_level++; - } - } - // Full search on levels above current_level if needed. - if (current_level < height) { - FindLessThan(key, tmp_prev, head_, GetMaxHeight(), current_level); - for (int i = current_level; i < height; i++) { - assert(tmp_prev[i] == head_ || KeyIsAfterNode(key, tmp_prev[i])); - assert(!KeyIsAfterNode(key, tmp_prev[i]->Next(i))); - x->InsertAfter(tmp_prev[i], i); - } - } - } - // The last step is update the new node into the hint. - // * If "height" <= "level", prev[level] is still the previous node of - // prev[level-1] on level "level". Stop. - // * Otherwise, the new node becomes the new previous node of - // prev[level-1], or if level=0, the new node becomes the largest node - // inserted with the same hint. Replace prev[level] with the new node. - // * If prev[i] is replaced by another node, check if it can replace - // prev[i+1] using a similar rule, up till "num_levels" level. - Node* p = x; - uint8_t h = static_cast(height); - for (int i = level; i < hint->num_levels; i++) { - if (h <= i) { - p = nullptr; - break; - } - std::swap(p, hint->prev[i]); - std::swap(h, hint->prev_height[i]); - } - if (p != nullptr && h > hint->num_levels) { - hint->prev[hint->num_levels] = p; - hint->prev_height[hint->num_levels] = h; - hint->num_levels++; - } + Insert(key, seq_splice_, false); } template -void InlineSkipList::FindLevelSplice(const char* key, Node* before, - Node* after, int level, - Node** out_prev, - Node** out_next) { +void InlineSkipList::InsertConcurrently(const char* key) { + Node* prev[kMaxPossibleHeight]; + Node* next[kMaxPossibleHeight]; + Splice splice; + splice.prev_ = prev; + splice.next_ = next; + Insert(key, &splice, false); +} + +template +void InlineSkipList::InsertWithHint(const char* key, void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSplice(); + *hint = reinterpret_cast(splice); + } + Insert(key, splice, true); +} + +template +void InlineSkipList::FindSpliceForLevel(const char* key, + Node* before, Node* after, + int level, Node** out_prev, + Node** out_next) { while (true) { Node* next = before->Next(level); assert(before == head_ || next == nullptr || @@ -831,15 +661,28 @@ void InlineSkipList::FindLevelSplice(const char* key, Node* before, } template -void InlineSkipList::InsertConcurrently(const char* key) { +void InlineSkipList::RecomputeSpliceLevels(const char* key, + Splice* splice, + int recompute_level) { + assert(recompute_level > 0); + assert(recompute_level <= splice->height_); + for (int i = recompute_level - 1; i >= 0; --i) { + FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, + &splice->prev_[i], &splice->next_[i]); + } +} + +template +template +void InlineSkipList::Insert(const char* key, Splice* splice, + bool allow_partial_splice_fix) { Node* x = reinterpret_cast(const_cast(key)) - 1; int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); - MaybeInvalidatePrev(height); int max_height = max_height_.load(std::memory_order_relaxed); while (height > max_height) { - if (max_height_.compare_exchange_strong(max_height, height)) { + if (max_height_.compare_exchange_weak(max_height, height)) { // successfully updated it max_height = height; break; @@ -849,28 +692,159 @@ void InlineSkipList::InsertConcurrently(const char* key) { } assert(max_height <= kMaxPossibleHeight); - Node* prev[kMaxPossibleHeight + 1]; - Node* next[kMaxPossibleHeight + 1]; - prev[max_height] = head_; - next[max_height] = nullptr; - for (int i = max_height - 1; i >= 0; --i) { - FindLevelSplice(key, prev[i + 1], next[i + 1], i, &prev[i], &next[i]); - } - for (int i = 0; i < height; ++i) { - while (true) { - x->NoBarrier_SetNext(i, next[i]); - if (prev[i]->CASNext(i, next[i], x)) { - // success + int recompute_height = 0; + if (splice->height_ < max_height) { + // Either splice has never been used or max_height has grown since + // last use. We could potentially fix it in the latter case, but + // that is tricky. + splice->prev_[max_height] = head_; + splice->next_[max_height] = nullptr; + splice->height_ = max_height; + recompute_height = max_height; + } else { + // Splice is a valid proper-height splice that brackets some + // key, but does it bracket this one? We need to validate it and + // recompute a portion of the splice (levels 0..recompute_height-1) + // that is a superset of all levels that don't bracket the new key. + // Several choices are reasonable, because we have to balance the work + // saved against the extra comparisons required to validate the Splice. + // + // One strategy is just to recompute all of orig_splice_height if the + // bottom level isn't bracketing. This pessimistically assumes that + // we will either get a perfect Splice hit (increasing sequential + // inserts) or have no locality. + // + // Another strategy is to walk up the Splice's levels until we find + // a level that brackets the key. This strategy lets the Splice + // hint help for other cases: it turns insertion from O(log N) into + // O(log D), where D is the number of nodes in between the key that + // produced the Splice and the current insert (insertion is aided + // whether the new key is before or after the splice). If you have + // a way of using a prefix of the key to map directly to the closest + // Splice out of O(sqrt(N)) Splices and we make it so that splices + // can also be used as hints during read, then we end up with Oshman's + // and Shavit's SkipTrie, which has O(log log N) lookup and insertion + // (compare to O(log N) for skip list). + // + // We control the pessimistic strategy with allow_partial_splice_fix. + // A good strategy is probably to be pessimistic for seq_splice_, + // optimistic if the caller actually went to the work of providing + // a Splice. + while (recompute_height < max_height) { + if (splice->prev_[recompute_height]->Next(recompute_height) != + splice->next_[recompute_height]) { + // splice isn't tight at this level, there must have been some inserts + // to this + // location that didn't update the splice. We might only be a little + // stale, but if + // the splice is very stale it would be O(N) to fix it. We haven't used + // up any of + // our budget of comparisons, so always move up even if we are + // pessimistic about + // our chances of success. + ++recompute_height; + } else if (splice->prev_[recompute_height] != head_ && + !KeyIsAfterNode(key, splice->prev_[recompute_height])) { + // key is from before splice + if (allow_partial_splice_fix) { + // skip all levels with the same node without more comparisons + Node* bad = splice->prev_[recompute_height]; + while (splice->prev_[recompute_height] == bad) { + ++recompute_height; + } + } else { + // we're pessimistic, recompute everything + recompute_height = max_height; + } + } else if (KeyIsAfterNode(key, splice->next_[recompute_height])) { + // key is from after splice + if (allow_partial_splice_fix) { + Node* bad = splice->next_[recompute_height]; + while (splice->next_[recompute_height] == bad) { + ++recompute_height; + } + } else { + recompute_height = max_height; + } + } else { + // this level brackets the key, we won! break; } - // CAS failed, we need to recompute prev and next. It is unlikely - // to be helpful to try to use a different level as we redo the - // search, because it should be unlikely that lots of nodes have - // been inserted between prev[i] and next[i]. No point in using - // next[i] as the after hint, because we know it is stale. - FindLevelSplice(key, prev[i], nullptr, i, &prev[i], &next[i]); } } + assert(recompute_height <= max_height); + if (recompute_height > 0) { + RecomputeSpliceLevels(key, splice, recompute_height); + } + + bool splice_is_valid = true; + if (UseCAS) { + for (int i = 0; i < height; ++i) { + while (true) { + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + x->NoBarrier_SetNext(i, splice->next_[i]); + if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) { + // success + break; + } + // CAS failed, we need to recompute prev and next. It is unlikely + // to be helpful to try to use a different level as we redo the + // search, because it should be unlikely that lots of nodes have + // been inserted between prev[i] and next[i]. No point in using + // next[i] as the after hint, because we know it is stale. + FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], + &splice->next_[i]); + + // Since we've narrowed the bracket for level i, we might have + // violated the Splice constraint between i and i-1. Make sure + // we recompute the whole thing next time. + if (i > 0) { + splice_is_valid = false; + } + } + } + } else { + for (int i = 0; i < height; ++i) { + if (i >= recompute_height && + splice->prev_[i]->Next(i) != splice->next_[i]) { + FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], + &splice->next_[i]); + } + assert(splice->next_[i] == nullptr || + compare_(x->Key(), splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), x->Key()) < 0); + assert(splice->prev_[i]->Next(i) == splice->next_[i]); + x->NoBarrier_SetNext(i, splice->next_[i]); + splice->prev_[i]->SetNext(i, x); + } + } + if (splice_is_valid) { + for (int i = 0; i < height; ++i) { + splice->prev_[i] = x; + } + assert(splice->prev_[splice->height_] == head_); + assert(splice->next_[splice->height_] == nullptr); + for (int i = 0; i < splice->height_; ++i) { + assert(splice->next_[i] == nullptr || + compare_(key, splice->next_[i]->Key()) < 0); + assert(splice->prev_[i] == head_ || + compare_(splice->prev_[i]->Key(), key) <= 0); + assert(splice->prev_[i + 1] == splice->prev_[i] || + splice->prev_[i + 1] == head_ || + compare_(splice->prev_[i + 1]->Key(), splice->prev_[i]->Key()) < + 0); + assert(splice->next_[i + 1] == splice->next_[i] || + splice->next_[i + 1] == nullptr || + compare_(splice->next_[i]->Key(), splice->next_[i + 1]->Key()) < + 0); + } + } else { + splice->height_ = 0; + } } template diff --git a/db/inlineskiplist_test.cc b/db/inlineskiplist_test.cc index 658ca4364..a06c49898 100644 --- a/db/inlineskiplist_test.cc +++ b/db/inlineskiplist_test.cc @@ -54,8 +54,7 @@ class InlineSkipTest : public testing::Test { keys_.insert(key); } - void InsertWithHint(TestInlineSkipList* list, Key key, - TestInlineSkipList::InsertHint** hint) { + void InsertWithHint(TestInlineSkipList* list, Key key, void** hint) { char* buf = list->AllocateKey(sizeof(Key)); memcpy(buf, &key, sizeof(Key)); list->InsertWithHint(buf, hint); @@ -201,7 +200,7 @@ TEST_F(InlineSkipTest, InsertWithHint_Sequential) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hint = nullptr; + void* hint = nullptr; for (int i = 0; i < N; i++) { Key key = i; InsertWithHint(&list, key, &hint); @@ -216,7 +215,7 @@ TEST_F(InlineSkipTest, InsertWithHint_MultipleHints) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hints[S]; + void* hints[S]; Key last_key[S]; for (int i = 0; i < S; i++) { hints[i] = nullptr; @@ -237,7 +236,7 @@ TEST_F(InlineSkipTest, InsertWithHint_MultipleHintsRandom) { Arena arena; TestComparator cmp; TestInlineSkipList list(cmp, &arena); - TestInlineSkipList::InsertHint* hints[S]; + void* hints[S]; for (int i = 0; i < S; i++) { hints[i] = nullptr; } @@ -260,7 +259,7 @@ TEST_F(InlineSkipTest, InsertWithHint_CompatibleWithInsertWithoutHint) { std::unordered_set used; Key with_hint[S1]; Key without_hint[S2]; - TestInlineSkipList::InsertHint* hints[S1]; + void* hints[S1]; for (int i = 0; i < S1; i++) { hints[i] = nullptr; while (true) { diff --git a/db/memtable.cc b/db/memtable.cc index 6de1bfff9..ef6356d3b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -441,16 +441,12 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); if (!allow_concurrent) { // Extract prefix for insert with hint. - Slice prefix; - if (insert_with_hint_prefix_extractor_ != nullptr) { - if (insert_with_hint_prefix_extractor_->InDomain(key_slice)) { - prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); - } - } - if (prefix.empty()) { - table->Insert(handle); - } else { + if (insert_with_hint_prefix_extractor_ != nullptr && + insert_with_hint_prefix_extractor_->InDomain(key_slice)) { + Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); table->InsertWithHint(handle, &insert_hints_[prefix]); + } else { + table->Insert(handle); } // this is a bit ugly, but is the way to avoid locked instructions diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 31280bbf2..3ae89a980 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -83,9 +83,12 @@ class MemTableRep { // collection, and no concurrent modifications to the table in progress virtual void Insert(KeyHandle handle) = 0; - // Same as Insert(), but in additional pass a hint to optimize sequential - // inserts. A new hint will be return from the hint pointer. Caller can get - // an initial hint by passing hint pointing to nullptr. + // Same as Insert(), but in additional pass a hint to insert location for + // the key. If hint points to nullptr, a new hint will be populated. + // otherwise the hint will be updated to reflect the last insert location. + // + // Currently only skip-list based memtable implement the interface. Other + // implementations will fallback to Insert() by default. virtual void InsertWithHint(KeyHandle handle, void** hint) { // Ignore the hint by default. Insert(handle); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 8906c5fbd..7e6657793 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -747,24 +747,22 @@ struct ColumnFamilyOptions { size_t memtable_huge_page_size; // If non-nullptr, memtable will use the specified function to extract - // prefixes for keys, and for each non-empty prefix maintain a hint to - // reduce CPU usage for inserting keys with the prefix. Keys with empty - // prefix will be insert without using a hint. + // prefixes for keys, and for each prefix maintain a hint of insert location + // to reduce CPU usage for inserting keys with the prefix. Keys out of + // domain of the prefix extractor will be insert without using hints. // // Currently only the default skiplist based memtable implements the feature. - // All other memtable implementation will ignore the option. It incurs ~150 + // All other memtable implementation will ignore the option. It incurs ~250 // additional bytes of memory overhead to store a hint for each prefix. - // If allow_concurrent_memtable_write is true, the option will also be - // ignored. + // Also concurrent writes (when allow_concurrent_memtable_write is true) will + // ignore the option. // - // The option is best suited for sequential inserts, or inserts that's - // almost sequential. One scenario could be inserting keys of the form - // (prefix + timestamp), and keys of the same prefix always comes in - // with time order, or in some cases a key with a smaller timestamp comes - // in later due to network latency. - // - // REQUIRES: If custom comparator is provided, it has to make sure keys - // with the same prefix appear in consecutive range. + // The option is best suited for workloads where keys will likely to insert + // to a location close the the last inserted key with the same prefix. + // One example could be inserting keys of the form (prefix + timestamp), + // and keys of the same prefix always comes in with time order. Another + // example would be updating the same key over and over again, in which case + // the prefix can be the key itself. // // Default: nullptr (disable) std::shared_ptr diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 59ca2af7c..d07fa1218 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -37,9 +37,7 @@ public: } virtual void InsertWithHint(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHint( - static_cast(handle), - reinterpret_cast(hint)); + skip_list_.InsertWithHint(static_cast(handle), hint); } virtual void InsertConcurrently(KeyHandle handle) override {