From 906f3dca72ac0b101eae2554efb3ee5bdb6e4191 Mon Sep 17 00:00:00 2001 From: kailiu Date: Tue, 18 Feb 2014 14:58:55 -0800 Subject: [PATCH] Add a hash-index component for block Summary: this is the key component extracted from diff: https://reviews.facebook.net/D14271 I separate it to a dedicated patch to make the review easier. Test Plan: added a unit test and passed it. Reviewers: haobo, sdong, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16245 --- Makefile | 4 ++ db/memtable.cc | 12 +--- table/block_hash_index.cc | 112 +++++++++++++++++++++++++++++++ table/block_hash_index.h | 72 ++++++++++++++++++++ table/block_hash_index_test.cc | 117 +++++++++++++++++++++++++++++++++ util/arena.h | 4 +- util/hash.cc | 1 - util/murmurhash.h | 11 +++- 8 files changed, 319 insertions(+), 14 deletions(-) create mode 100644 table/block_hash_index.cc create mode 100644 table/block_hash_index.h create mode 100644 table/block_hash_index_test.cc diff --git a/Makefile b/Makefile index dddd48d49..7a8e16777 100644 --- a/Makefile +++ b/Makefile @@ -55,6 +55,7 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ db_test \ + block_hash_index_test \ autovector_test \ table_properties_collector_test \ arena_test \ @@ -227,6 +228,9 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/db/memtable.cc b/db/memtable.cc index e9f528725..c229c0a0b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -26,15 +26,6 @@ #include "util/statistics.h" #include 
"util/stop_watch.h" -namespace std { -template <> -struct hash { - size_t operator()(const rocksdb::Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0); - } -}; -} - namespace rocksdb { MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) @@ -167,7 +158,8 @@ Iterator* MemTable::NewIterator(const ReadOptions& options) { } port::RWMutex* MemTable::GetLock(const Slice& key) { - return &locks_[std::hash()(key) % locks_.size()]; + static murmur_hash hash; + return &locks_[hash(key) % locks_.size()]; } void MemTable::Add(SequenceNumber s, ValueType type, diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc new file mode 100644 index 000000000..0c9674c95 --- /dev/null +++ b/table/block_hash_index.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "table/block_hash_index.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" + +namespace rocksdb { + +BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter, + const uint32_t num_restarts, + const Comparator* comparator, + const SliceTransform* hash_key_extractor) { + assert(hash_key_extractor); + auto hash_index = new BlockHashIndex(hash_key_extractor); + uint64_t current_restart_index = 0; + + std::string pending_entry_prefix; + // pending_block_num == 0 also implies there is no entry inserted at all. + uint32_t pending_block_num = 0; + uint32_t pending_entry_index = 0; + + // scan all the entries and create a hash index based on their prefixes. 
+ data_iter->SeekToFirst(); + for (index_iter->SeekToFirst(); + index_iter->Valid() && current_restart_index < num_restarts; + index_iter->Next()) { + Slice last_key_in_block = index_iter->key(); + assert(data_iter->Valid() && data_iter->status().ok()); + + // scan through all entries within a data block. + while (data_iter->Valid() && + comparator->Compare(data_iter->key(), last_key_in_block) <= 0) { + auto key_prefix = hash_key_extractor->Transform(data_iter->key()); + bool is_first_entry = pending_block_num == 0; + + // Keys may share the prefix + if (is_first_entry || pending_entry_prefix != key_prefix) { + if (!is_first_entry) { + bool succeeded = hash_index->Add( + pending_entry_prefix, pending_entry_index, pending_block_num); + if (!succeeded) { + delete hash_index; + return nullptr; + } + } + + // update the status. + // needs a hard copy otherwise the underlying data changes all the time. + pending_entry_prefix = key_prefix.ToString(); + pending_block_num = 1; + pending_entry_index = current_restart_index; + } else { + // the block count increments when keys sharing the prefix reside in + // different data blocks. + auto last_restart_index = pending_entry_index + pending_block_num - 1; + assert(last_restart_index <= current_restart_index); + if (last_restart_index != current_restart_index) { + ++pending_block_num; + } + } + data_iter->Next(); + } + + ++current_restart_index; + } + + // make sure all entries have been scanned. 
+ assert(!index_iter->Valid()); + assert(!data_iter->Valid()); + + if (pending_block_num > 0) { + auto succeeded = hash_index->Add(pending_entry_prefix, pending_entry_index, + pending_block_num); + if (!succeeded) { + delete hash_index; + return nullptr; + } + } + + return hash_index; +} + +bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index, + uint32_t num_blocks) { + auto prefix_ptr = arena_.Allocate(prefix.size()); + std::copy(prefix.data() /* begin */, prefix.data() + prefix.size() /* end */, + prefix_ptr /* destination */); + auto result = + restart_indices_.insert({Slice(prefix_ptr, prefix.size()), + RestartIndex(restart_index, num_blocks)}); + return result.second; +} + +const BlockHashIndex::RestartIndex* BlockHashIndex::GetRestartIndex( + const Slice& key) { + auto key_prefix = hash_key_extractor_->Transform(key); + + auto pos = restart_indices_.find(key_prefix); + if (pos == restart_indices_.end()) { + return nullptr; + } + + return &pos->second; +} + +} // namespace rocksdb diff --git a/table/block_hash_index.h b/table/block_hash_index.h new file mode 100644 index 000000000..0ff65b418 --- /dev/null +++ b/table/block_hash_index.h @@ -0,0 +1,72 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include +#include + +#include "util/arena.h" +#include "util/murmurhash.h" + +namespace rocksdb { + +class Comparator; +class Iterator; +class Slice; +class SliceTransform; + +// Build a hash-based index to speed up the lookup for "index block". +// BlockHashIndex accepts a key and, if found, returns its restart index within +// that index block. +class BlockHashIndex { + public: + // Represents a restart index in the index block's restart array. 
+ struct RestartIndex { + explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1) + : first_index(first_index), num_blocks(num_blocks) {} + + // For a given prefix, what is the restart index for the first data block + // that contains it. + uint32_t first_index = 0; + + // How many data blocks contain this prefix? + uint32_t num_blocks = 1; + }; + + explicit BlockHashIndex(const SliceTransform* hash_key_extractor) + : hash_key_extractor_(hash_key_extractor) {} + + // Maps a key to its restart first_index. + // Returns nullptr if the restart first_index is not found + const RestartIndex* GetRestartIndex(const Slice& key); + + bool Add(const Slice& key_prefix, uint32_t restart_index, + uint32_t num_blocks); + + size_t ApproximateMemoryUsage() const { + return arena_.ApproximateMemoryUsage(); + } + + private: + const SliceTransform* hash_key_extractor_; + std::unordered_map restart_indices_; + Arena arena_; +}; + +// Create hash index by scanning the entries in index as well as the whole +// dataset. +// @params index_iter: an iterator with the pointer to the first entry in a +// block. +// @params data_iter: an iterator that can scan all the entries residing in a +// table. +// @params num_restarts: used for correctness verification. +// @params hash_key_extractor: extract the hashable part of a given key. +// On error, nullptr will be returned. +BlockHashIndex* CreateBlockHashIndex(Iterator* index_iter, Iterator* data_iter, + const uint32_t num_restarts, + const Comparator* comparator, + const SliceTransform* hash_key_extractor); + +} // namespace rocksdb diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc new file mode 100644 index 000000000..f4c0ac4a4 --- /dev/null +++ b/table/block_hash_index_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "table/block_hash_index.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +typedef std::map Data; + +class MapIterator : public Iterator { + public: + explicit MapIterator(const Data& data) : data_(data), pos_(data_.end()) {} + + virtual bool Valid() const { return pos_ != data_.end(); } + + virtual void SeekToFirst() { pos_ = data_.begin(); } + + virtual void SeekToLast() { + pos_ = data_.end(); + --pos_; + } + + virtual void Seek(const Slice& target) { + pos_ = data_.find(target.ToString()); + } + + virtual void Next() { ++pos_; } + + virtual void Prev() { --pos_; } + + virtual Slice key() const { return pos_->first; } + + virtual Slice value() const { return pos_->second; } + + virtual Status status() const { return Status::OK(); } + + private: + const Data& data_; + Data::const_iterator pos_; +}; + +class BlockTest {}; + +TEST(BlockTest, BasicTest) { + const size_t keys_per_block = 4; + const size_t prefix_size = 2; + std::vector keys = {/* block 1 */ + "0101", "0102", "0103", "0201", + /* block 2 */ + "0202", "0203", "0301", "0401", + /* block 3 */ + "0501", "0601", "0701", "0801", + /* block 4 */ + "0802", "0803", "0804", "0805", + /* block 5 */ + "0806", "0807", "0808", "0809", }; + + Data data_entries; + for (const auto key : keys) { + data_entries.insert({key, key}); + } + + Data index_entries; + for (size_t i = 3; i < keys.size(); i += keys_per_block) { + // simply ignore the value part + index_entries.insert({keys[i], ""}); + } + + MapIterator data_iter(data_entries); + MapIterator index_iter(index_entries); + + auto prefix_extractor = NewFixedPrefixTransform(prefix_size); + std::unique_ptr block_hash_index( + CreateBlockHashIndex(&index_iter, &data_iter, index_entries.size(), + 
+ BytewiseComparator(), prefix_extractor)); + + std::map expected = { + {"01xx", BlockHashIndex::RestartIndex(0, 1)}, + {"02yy", BlockHashIndex::RestartIndex(0, 2)}, + {"03zz", BlockHashIndex::RestartIndex(1, 1)}, + {"04pp", BlockHashIndex::RestartIndex(1, 1)}, + {"05ww", BlockHashIndex::RestartIndex(2, 1)}, + {"06xx", BlockHashIndex::RestartIndex(2, 1)}, + {"07pp", BlockHashIndex::RestartIndex(2, 1)}, + {"08xz", BlockHashIndex::RestartIndex(2, 3)}, }; + + const BlockHashIndex::RestartIndex* index = nullptr; + // search for existing prefixes + for (const auto& item : expected) { + index = block_hash_index->GetRestartIndex(item.first); + ASSERT_TRUE(index != nullptr); + ASSERT_EQ(item.second.first_index, index->first_index); + ASSERT_EQ(item.second.num_blocks, index->num_blocks); + } + + // search for non-existent prefixes + ASSERT_TRUE(!block_hash_index->GetRestartIndex("00xx")); + ASSERT_TRUE(!block_hash_index->GetRestartIndex("10yy")); + ASSERT_TRUE(!block_hash_index->GetRestartIndex("20zz")); + + delete prefix_extractor; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/arena.h b/util/arena.h index 4c45417f4..bfa7fe4d8 100644 --- a/util/arena.h +++ b/util/arena.h @@ -39,12 +39,12 @@ class Arena { // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future // allocations). 
- const size_t ApproximateMemoryUsage() { + size_t ApproximateMemoryUsage() const { return blocks_memory_ + blocks_.capacity() * sizeof(char*) - alloc_bytes_remaining_; } - const size_t MemoryAllocatedBytes() { return blocks_memory_; } + size_t MemoryAllocatedBytes() const { return blocks_memory_; } private: // Number of bytes allocated in one block diff --git a/util/hash.cc b/util/hash.cc index 6f0e9cc92..e38c186c3 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -46,5 +46,4 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { return h; } - } // namespace rocksdb diff --git a/util/murmurhash.h b/util/murmurhash.h index 9707e5635..faa86556d 100644 --- a/util/murmurhash.h +++ b/util/murmurhash.h @@ -11,6 +11,7 @@ */ #pragma once #include +#include "rocksdb/slice.h" #if defined(__x86_64__) #define MURMUR_HASH MurmurHash64A @@ -29,5 +30,13 @@ typedef unsigned int murmur_t; unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); #define MurmurHash MurmurHashNeutral2 typedef unsigned int murmur_t; - #endif + +// Allow slice to be hashable by murmur hash. +namespace rocksdb { +struct murmur_hash { + size_t operator()(const Slice& slice) const { + return MurmurHash(slice.data(), slice.size(), 0); + } +}; +} // rocksdb