From 75b59d5146b078d67968b30d28bf0657574ec968 Mon Sep 17 00:00:00 2001 From: Kai Liu Date: Thu, 10 Apr 2014 14:19:43 -0700 Subject: [PATCH] Enable hash index for block-based table Summary: Based on previous patches, this diff eventually provides the end-to-end mechanism for users to specify the hash-index. Test Plan: Wrote several new unit tests. Reviewers: sdong, haobo, dhruba Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D16539 --- db/db_test.cc | 42 +++++++- db/dbformat.h | 31 ++++++ include/rocksdb/table.h | 6 ++ table/block.cc | 113 +++++++++++++------- table/block.h | 18 +++- table/block_based_table_builder.cc | 36 ++++++- table/block_based_table_reader.cc | 76 +++++++++++-- table/block_based_table_reader.h | 4 +- table/block_test.cc | 164 ++++++++++++++++++++++++++--- table/table_test.cc | 110 +++++++++++++++++++ 10 files changed, 521 insertions(+), 79 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index 0285905f2..1ec56d712 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -266,6 +266,8 @@ class DBTest { // Sequence of option configurations to try enum OptionConfig { kDefault, + kBlockBasedTableWithPrefixHashIndex, + kBlockBasedTableWithWholeKeyHashIndex, kPlainTableFirstBytePrefix, kPlainTableAllBytesPrefix, kVectorRep, @@ -303,7 +305,8 @@ class DBTest { kSkipDeletesFilterFirst = 1, kSkipUniversalCompaction = 2, kSkipMergePut = 4, - kSkipPlainTable = 8 + kSkipPlainTable = 8, + kSkipHashIndex = 16 }; DBTest() : option_config_(kDefault), @@ -343,6 +346,12 @@ class DBTest { || option_config_ == kPlainTableFirstBytePrefix)) { continue; } + if ((skip_mask & kSkipPlainTable) && + (option_config_ == kBlockBasedTableWithPrefixHashIndex || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) { + continue; + } + break; } @@ -439,6 +448,20 @@ class DBTest { case kInfiniteMaxOpenFiles: options.max_open_files = -1; break; + case kBlockBasedTableWithPrefixHashIndex: { + BlockBasedTableOptions table_options; + 
table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + break; + } + case kBlockBasedTableWithWholeKeyHashIndex: { + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewNoopTransform()); + break; + } default: break; } @@ -1363,7 +1386,7 @@ TEST(DBTest, KeyMayExist) { // KeyMayExist function only checks data in block caches, which is not used // by plain table format. - } while (ChangeOptions(kSkipPlainTable)); + } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex)); } TEST(DBTest, NonBlockingIteration) { @@ -6184,7 +6207,9 @@ TEST(DBTest, Randomized) { int minimum = 0; if (option_config_ == kHashSkipList || option_config_ == kHashLinkList || - option_config_ == kPlainTableFirstBytePrefix) { + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kBlockBasedTableWithWholeKeyHashIndex || + option_config_ == kBlockBasedTableWithPrefixHashIndex) { minimum = 1; } if (p < 45) { // Put @@ -6224,8 +6249,15 @@ TEST(DBTest, Randomized) { } if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + // For DB instances that use the hash index + block-based table, the + // iterator will be invalid right away when seeking a non-existent key, rather + // than return a key that is close to it. 
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex && + option_config_ != kBlockBasedTableWithPrefixHashIndex) { + ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); + ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); + } + // Save a snapshot from each DB this time that we'll use next // time we compare things, to make sure the current state is // preserved with the snapshot diff --git a/db/dbformat.h b/db/dbformat.h index 99925d284..27a082284 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -13,6 +13,7 @@ #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" @@ -304,4 +305,34 @@ class IterKey { void operator=(const IterKey&) = delete; }; +class InternalKeySliceTransform : public SliceTransform { + public: + explicit InternalKeySliceTransform(const SliceTransform* transform) + : transform_(transform) {} + + virtual const char* Name() const { return transform_->Name(); } + + virtual Slice Transform(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->Transform(user_key); + } + + virtual bool InDomain(const Slice& src) const { + auto user_key = ExtractUserKey(src); + return transform_->InDomain(user_key); + } + + virtual bool InRange(const Slice& dst) const { + auto user_key = ExtractUserKey(dst); + return transform_->InRange(user_key); + } + + const SliceTransform* user_prefix_extractor() const { return transform_; } + + private: + // Like comparator, InternalKeySliceTransform will not take care of the + // deletion of transform_ + const SliceTransform* const transform_; +}; + } // namespace rocksdb diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index e350c7780..1016bcf14 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -60,6 +60,12 @@ struct BlockBasedTableOptions { // A space efficient index block 
that is optimized for // binary-search-based index. kBinarySearch, + + // The hash index, if enabled, will do the hash lookup when + // `ReadOption.prefix_seek == true`. User should also specify + // `Options.prefix_extractor` to allow the index block to correctly + // extract the prefix of the given key and perform hash table lookup. + kHashSearch, }; IndexType index_type = kBinarySearch; diff --git a/table/block.cc b/table/block.cc index 3f969fe2a..6a6751ca7 100644 --- a/table/block.cc +++ b/table/block.cc @@ -11,16 +11,20 @@ #include "table/block.h" -#include #include +#include +#include +#include + #include "rocksdb/comparator.h" +#include "table/block_hash_index.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" namespace rocksdb { -inline uint32_t Block::NumRestarts() const { +uint32_t Block::NumRestarts() const { assert(size_ >= 2*sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } @@ -92,6 +96,7 @@ class Block::Iter : public Iterator { std::string key_; Slice value_; Status status_; + BlockHashIndex* hash_index_; inline int Compare(const Slice& a, const Slice& b) const { return comparator_->Compare(a, b); @@ -118,16 +123,15 @@ class Block::Iter : public Iterator { } public: - Iter(const Comparator* comparator, - const char* data, - uint32_t restarts, - uint32_t num_restarts) + Iter(const Comparator* comparator, const char* data, uint32_t restarts, + uint32_t num_restarts, BlockHashIndex* hash_index) : comparator_(comparator), data_(data), restarts_(restarts), num_restarts_(num_restarts), current_(restarts_), - restart_index_(num_restarts_) { + restart_index_(num_restarts_), + hash_index_(hash_index) { assert(num_restarts_ > 0); } @@ -169,45 +173,22 @@ class Block::Iter : public Iterator { } virtual void Seek(const Slice& target) { - // Binary search in restart array to find the first restart point - // with a key >= target - uint32_t left = 0; - uint32_t right = num_restarts_ - 1; - while (left < right) 
{ - uint32_t mid = (left + right + 1) / 2; - uint32_t region_offset = GetRestartPoint(mid); - uint32_t shared, non_shared, value_length; - const char* key_ptr = DecodeEntry(data_ + region_offset, - data_ + restarts_, - &shared, &non_shared, &value_length); - if (key_ptr == nullptr || (shared != 0)) { - CorruptionError(); - return; - } - Slice mid_key(key_ptr, non_shared); - if (Compare(mid_key, target) < 0) { - // Key at "mid" is smaller than "target". Therefore all - // blocks before "mid" are uninteresting. - left = mid; - } else { - // Key at "mid" is >= "target". Therefore all blocks at or - // after "mid" are uninteresting. - right = mid - 1; - } - } + uint32_t index = 0; + bool ok = hash_index_ ? HashSeek(target, &index) + : BinarySeek(target, 0, num_restarts_ - 1, &index); + if (!ok) { + return; + } + SeekToRestartPoint(index); // Linear search (within restart block) for first key >= target - SeekToRestartPoint(left); + while (true) { - if (!ParseNextKey()) { - return; - } - if (Compare(key_, target) >= 0) { + if (!ParseNextKey() || Compare(key_, target) >= 0) { return; } } } - virtual void SeekToFirst() { SeekToRestartPoint(0); ParseNextKey(); @@ -257,6 +238,53 @@ class Block::Iter : public Iterator { return true; } } + // Binary search in restart array to find the first restart point + // with a key >= target + bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index) { + assert(left <= right); + + while (left < right) { + uint32_t mid = (left + right + 1) / 2; + uint32_t region_offset = GetRestartPoint(mid); + uint32_t shared, non_shared, value_length; + const char* key_ptr = + DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, + &non_shared, &value_length); + if (key_ptr == nullptr || (shared != 0)) { + CorruptionError(); + return false; + } + Slice mid_key(key_ptr, non_shared); + if (Compare(mid_key, target) < 0) { + // Key at "mid" is smaller than "target". 
Therefore all + // blocks before "mid" are uninteresting. + left = mid; + } else { + // Key at "mid" is >= "target". Therefore all blocks at or + // after "mid" are uninteresting. + right = mid - 1; + } + } + + *index = left; + return true; + } + + bool HashSeek(const Slice& target, uint32_t* index) { + assert(hash_index_); + auto restart_index = hash_index_->GetRestartIndex(target); + if (restart_index == nullptr) { + current_ = restarts_; + return 0; + } + + // the elements in restart_array[index : index + num_blocks] + // are all with same prefix. We'll do binary search in that small range. + auto left = restart_index->first_index; + auto right = restart_index->first_index + restart_index->num_blocks - 1; + return BinarySeek(target, left, right, index); + } }; Iterator* Block::NewIterator(const Comparator* cmp) { @@ -267,8 +295,13 @@ Iterator* Block::NewIterator(const Comparator* cmp) { if (num_restarts == 0) { return NewEmptyIterator(); } else { - return new Iter(cmp, data_, restart_offset_, num_restarts); + return new Iter(cmp, data_, restart_offset_, num_restarts, + hash_index_.get()); } } +void Block::SetBlockHashIndex(BlockHashIndex* hash_index) { + hash_index_.reset(hash_index); +} + } // namespace rocksdb diff --git a/table/block.h b/table/block.h index 6d74bb417..b363d62fe 100644 --- a/table/block.h +++ b/table/block.h @@ -10,6 +10,7 @@ #pragma once #include #include + #include "rocksdb/iterator.h" #include "rocksdb/options.h" @@ -17,6 +18,7 @@ namespace rocksdb { struct BlockContents; class Comparator; +class BlockHashIndex; class Block { public: @@ -26,20 +28,28 @@ class Block { ~Block(); size_t size() const { return size_; } - bool cachable() const { return cachable_; } + const char* data() const { return data_; } + bool cachable() const { return cachable_; } + uint32_t NumRestarts() const; CompressionType compression_type() const { return compression_type_; } + + // If hash index lookup is enabled and `use_hash_index` is true. 
This block + // will do hash lookup for the key prefix. + // + // NOTE: for the hash based lookup, if a key prefix doesn't match any key, + // the iterator will simply be set as "invalid", rather than returning + // the key that is just past the target key. Iterator* NewIterator(const Comparator* comparator); - const char* data() { return data_; } + void SetBlockHashIndex(BlockHashIndex* hash_index); private: - uint32_t NumRestarts() const; - const char* data_; size_t size_; uint32_t restart_offset_; // Offset in data_ of restart array bool owned_; // Block owns data_[] bool cachable_; CompressionType compression_type_; + std::unique_ptr hash_index_; // No copying allowed Block(const Block&); diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index de2466605..6b48bf0e6 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -97,9 +97,9 @@ class IndexBuilder { // 2. Shorten the key length for index block. Other than honestly using the // last key in the data block as the index key, we instead find a shortest // substitute key that serves the same function. -class BinarySearchIndexBuilder : public IndexBuilder { +class ShortenedIndexBuilder : public IndexBuilder { public: - explicit BinarySearchIndexBuilder(const Comparator* comparator) + explicit ShortenedIndexBuilder(const Comparator* comparator) : IndexBuilder(comparator), index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} @@ -128,11 +128,41 @@ class BinarySearchIndexBuilder : public IndexBuilder { BlockBuilder index_block_builder_; }; +// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like +// ShortenedIndexBuilder, but preserves the full key instead of the substitute key, +// with the reason being that hash index is based on "prefix". 
+class FullKeyIndexBuilder : public IndexBuilder { + public: + explicit FullKeyIndexBuilder(const Comparator* comparator) + : IndexBuilder(comparator), + index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + + virtual void AddEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + index_block_builder_.Add(*last_key_in_current_block, handle_encoding); + } + + virtual Slice Finish() override { return index_block_builder_.Finish(); } + + virtual size_t EstimatedSize() const { + return index_block_builder_.CurrentSizeEstimate(); + } + + private: + BlockBuilder index_block_builder_; +}; + // Create a index builder based on its type. IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) { switch (type) { case BlockBasedTableOptions::kBinarySearch: { - return new BinarySearchIndexBuilder(comparator); + return new ShortenedIndexBuilder(comparator); + } + case BlockBasedTableOptions::kHashSearch: { + return new FullKeyIndexBuilder(comparator); } default: { assert(!"Do not recognize the index type "); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index c1555747a..f686239cb 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -25,6 +25,7 @@ #include "table/block.h" #include "table/filter_block.h" +#include "table/block_hash_index.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" @@ -180,19 +181,51 @@ class BinarySearchIndexReader : public IndexReader { std::unique_ptr index_block_; }; -// TODO(kailiu) This class is only a stub for now. And the comment below is also -// not completed. // Index that leverages an internal hash table to quicken the lookup for a given // key. +// @param data_iter_gen, equavalent to BlockBasedTable::NewIterator(). 
But that +// functions requires index to be initalized. To avoid this problem external +// caller will pass a function that can create the iterator over the entries +// without the table to be fully initialized. class HashIndexReader : public IndexReader { public: static Status Create(RandomAccessFile* file, const BlockHandle& index_handle, Env* env, const Comparator* comparator, - BlockBasedTable* table, + std::function data_iter_gen, const SliceTransform* prefix_extractor, IndexReader** index_reader) { - return Status::NotSupported("not implemented yet!"); + assert(prefix_extractor); + Block* index_block = nullptr; + auto s = + ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env); + + if (!s.ok()) { + return s; + } + + *index_reader = new HashIndexReader(comparator, index_block); + std::unique_ptr index_iter(index_block->NewIterator(nullptr)); + std::unique_ptr data_iter( + data_iter_gen(index_block->NewIterator(nullptr))); + auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(), + index_block->NumRestarts(), + comparator, prefix_extractor); + index_block->SetBlockHashIndex(hash_index); + return s; } + + virtual Iterator* NewIterator() override { + return index_block_->NewIterator(comparator_); + } + + virtual size_t size() const override { return index_block_->size(); } + + private: + HashIndexReader(const Comparator* comparator, Block* index_block) + : IndexReader(comparator), index_block_(index_block) { + assert(index_block_ != nullptr); + } + std::unique_ptr index_block_; }; @@ -223,6 +256,11 @@ struct BlockBasedTable::Rep { std::shared_ptr table_properties; BlockBasedTableOptions::IndexType index_type; + // TODO(kailiu) It is very ugly to use internal key in table, since table + // module should not be relying on db module. However to make things easier + // and compatible with existing code, we introduce a wrapper that allows + // block to extract prefix without knowing if a key is internal or not. 
+ unique_ptr internal_prefix_transform; }; BlockBasedTable::~BlockBasedTable() { @@ -747,8 +785,7 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( return { filter, cache_handle }; } -Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) - const { +Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) { // index reader has already been pre-populated. if (rep_->index_reader) { return rep_->index_reader->NewIterator(); @@ -978,7 +1015,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, // 3. options // 4. internal_comparator // 5. index_type -Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) const { +Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) { // Some old version of block-based tables don't have index type present in // table properties. If that's the case we can safely use the kBinarySearch. auto index_type = BlockBasedTableOptions::kBinarySearch; @@ -989,11 +1026,30 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) const { DecodeFixed32(pos->second.c_str())); } + auto file = rep_->file.get(); + const auto& index_handle = rep_->index_handle; + auto env = rep_->options.env; + auto comparator = &rep_->internal_comparator; + switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - return BinarySearchIndexReader::Create( - rep_->file.get(), rep_->index_handle, rep_->options.env, - &rep_->internal_comparator, index_reader); + return BinarySearchIndexReader::Create(file, index_handle, env, + comparator, index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. 
+ rep_->internal_prefix_transform.reset( + new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + return HashIndexReader::Create( + file, index_handle, env, comparator, + [&](Iterator* index_iter) { + return NewTwoLevelIterator( + index_iter, &BlockBasedTable::DataBlockReader, + const_cast(this), ReadOptions(), + rep_->soptions, rep_->internal_comparator); + }, + rep_->internal_prefix_transform.get(), index_reader); } default: { std::string error_message = diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 8b8f09bd3..613460634 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -131,7 +131,7 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - Iterator* NewIndexIterator(const ReadOptions& read_options) const; + Iterator* NewIndexIterator(const ReadOptions& read_options); // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -164,7 +164,7 @@ class BlockBasedTable : public TableReader { void ReadMeta(const Footer& footer); void ReadFilter(const Slice& filter_handle_value); - Status CreateIndexReader(IndexReader** index_reader) const; + Status CreateIndexReader(IndexReader** index_reader); // Read the meta block from sst. static Status ReadMetaBlock( diff --git a/table/block_test.cc b/table/block_test.cc index 588ce6729..fdba8e99b 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -3,7 +3,10 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// +#include #include + #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" @@ -11,9 +14,11 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" +#include "table/block_hash_index.h" #include "util/random.h" #include "util/testharness.h" #include "util/testutil.h" @@ -25,6 +30,40 @@ static std::string RandomString(Random* rnd, int len) { test::RandomString(rnd, len, &r); return r; } +std::string GenerateKey(int primary_key, int secondary_key, int padding_size, + Random *rnd) { + char buf[50]; + char *p = &buf[0]; + snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); + std::string k(p); + if (padding_size) { + k += RandomString(rnd, padding_size); + } + + return k; +} + +// Generate random key value pairs. +// The generated key will be sorted. You can tune the parameters to generate +// different kinds of test key/value pairs for different scenarios. 
+void GenerateRandomKVs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generating keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + // 100 bytes values + values->emplace_back(RandomString(&rnd, 100)); + } + } +} class BlockTest {}; @@ -39,24 +78,11 @@ TEST(BlockTest, SimpleTest) { std::vector values; BlockBuilder builder(options, ic.get()); int num_records = 100000; - char buf[10]; - char* p = &buf[0]; + GenerateRandomKVs(&keys, &values, 0, num_records); // add a bunch of records to a block for (int i = 0; i < num_records; i++) { - // generate random kvs - sprintf(p, "%6d", i); - std::string k(p); - std::string v = RandomString(&rnd, 100); // 100 byte values - - // write kvs to the block - Slice key(k); - Slice value(v); - builder.Add(key, value); - - // remember kvs in a lookaside array - keys.push_back(k); - values.push_back(v); + builder.Add(keys[i], values[i]); } // read serialized contents of the block @@ -101,6 +127,114 @@ TEST(BlockTest, SimpleTest) { delete iter; } +// return the block contents +BlockContents GetBlockContents(std::unique_ptr *builder, + const std::vector &keys, + const std::vector &values, + const int prefix_group_size = 1) { + builder->reset( + new BlockBuilder(1 /* restart interval */, BytewiseComparator())); + + // Add only half of the keys + for (size_t i = 0; i < keys.size(); ++i) { + (*builder)->Add(keys[i], values[i]); + } + Slice rawblock = (*builder)->Finish(); + + BlockContents contents; + contents.data = rawblock; + contents.cachable = false; + contents.heap_allocated = false; + + return contents; +} + +void CheckBlockContents(BlockContents contents, const int max_key, + const std::vector &keys, + const std::vector 
&values) { + const size_t prefix_size = 6; + // create block reader + Block reader1(contents); + Block reader2(contents); + + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(prefix_size)); + + { + auto iter1 = reader1.NewIterator(nullptr); + auto iter2 = reader1.NewIterator(nullptr); + reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(), + BytewiseComparator(), + prefix_extractor.get())); + + delete iter1; + delete iter2; + } + + std::unique_ptr hash_iter( + reader1.NewIterator(BytewiseComparator())); + + std::unique_ptr regular_iter( + reader2.NewIterator(BytewiseComparator())); + + // Seek existent keys + for (size_t i = 0; i < keys.size(); i++) { + hash_iter->Seek(keys[i]); + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(hash_iter->Valid()); + + Slice v = hash_iter->value(); + ASSERT_EQ(v.ToString().compare(values[i]), 0); + } + + // Seek non-existent keys. + // For hash index, if a key with a given prefix is not found, iterator will + // simply be set as invalid; whereas the binary search based iterator will + // return the one that is closest. + for (int i = 1; i < max_key - 1; i += 2) { + auto key = GenerateKey(i, 0, 0, nullptr); + hash_iter->Seek(key); + ASSERT_TRUE(!hash_iter->Valid()); + + regular_iter->Seek(key); + ASSERT_TRUE(regular_iter->Valid()); + } +} + +// In this test case, no two keys share the same prefix. +TEST(BlockTest, SimpleIndexHash) { + const int kMaxKey = 100000; + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0 /* first key id */, + kMaxKey /* last key id */, 2 /* step */, + 8 /* padding size (8 bytes randomly generated suffix) */); + + std::unique_ptr builder; + auto contents = GetBlockContents(&builder, keys, values); + + CheckBlockContents(contents, kMaxKey, keys, values); +} + +TEST(BlockTest, IndexHashWithSharedPrefix) { + const int kMaxKey = 100000; + // for each prefix, there will be 5 keys that start with it. 
+ const int kPrefixGroup = 5; + std::vector keys; + std::vector values; + // Generate keys with same prefix. + GenerateRandomKVs(&keys, &values, 0, // first key id + kMaxKey, // last key id + 2, // step + 10, // padding size, + kPrefixGroup); + + std::unique_ptr builder; + auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); + + CheckBlockContents(contents, kMaxKey, keys, values); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/table_test.cc b/table/table_test.cc index 18ae2a3aa..0426122ff 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1055,6 +1055,116 @@ static std::string RandomString(Random* rnd, int len) { return r; } +void AddInternalKey(TableConstructor* c, const std::string prefix, + int suffix_len = 800) { + static Random rnd(1023); + InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); + c->Add(k.Encode().ToString(), "v"); +} + +TEST(TableTest, HashIndexTest) { + TableConstructor c(BytewiseComparator()); + + // keys with prefix length 3, make sure the key/value is big enough to fill + // one block + AddInternalKey(&c, "0015"); + AddInternalKey(&c, "0035"); + + AddInternalKey(&c, "0054"); + AddInternalKey(&c, "0055"); + + AddInternalKey(&c, "0056"); + AddInternalKey(&c, "0057"); + + AddInternalKey(&c, "0058"); + AddInternalKey(&c, "0075"); + + AddInternalKey(&c, "0076"); + AddInternalKey(&c, "0095"); + + std::vector keys; + KVMap kvmap; + Options options; + BlockBasedTableOptions table_options; + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.block_cache = NewLRUCache(1024); + options.block_size = 1700; + + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + c.Finish(options, *comparator, &keys, &kvmap); + auto reader = c.table_reader(); + + auto props = 
c.table_reader()->GetTableProperties(); + ASSERT_EQ(5u, props->num_data_blocks); + + std::unique_ptr hash_iter(reader->NewIterator(ReadOptions())); + + // -- Find keys do not exist, but have common prefix. + std::vector prefixes = {"001", "003", "005", "007", "009"}; + std::vector lower_bound = {keys[0], keys[1], keys[2], + keys[7], keys[9], }; + + // find the lower bound of the prefix + for (size_t i = 0; i < prefixes.size(); ++i) { + hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode()); + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(hash_iter->Valid()); + + // seek the first element in the block + ASSERT_EQ(lower_bound[i], hash_iter->key().ToString()); + ASSERT_EQ("v", hash_iter->value().ToString()); + } + + // find the upper bound of prefixes + std::vector upper_bound = {keys[1], keys[2], keys[7], keys[9], }; + + // find existing keys + for (const auto& item : kvmap) { + auto ukey = ExtractUserKey(item.first).ToString(); + hash_iter->Seek(ukey); + + // ASSERT_OK(regular_iter->status()); + ASSERT_OK(hash_iter->status()); + + // ASSERT_TRUE(regular_iter->Valid()); + ASSERT_TRUE(hash_iter->Valid()); + + ASSERT_EQ(item.first, hash_iter->key().ToString()); + ASSERT_EQ(item.second, hash_iter->value().ToString()); + } + + for (size_t i = 0; i < prefixes.size(); ++i) { + // the key is greater than any existing keys. + auto key = prefixes[i] + "9"; + hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode()); + + ASSERT_OK(hash_iter->status()); + if (i == prefixes.size() - 1) { + // last key + ASSERT_TRUE(!hash_iter->Valid()); + } else { + ASSERT_TRUE(hash_iter->Valid()); + // seek the first element in the block + ASSERT_EQ(upper_bound[i], hash_iter->key().ToString()); + ASSERT_EQ("v", hash_iter->value().ToString()); + } + } + + // find keys with prefix that don't match any of the existing prefixes. 
+ std::vector non_exist_prefixes = {"002", "004", "006", "008"}; + for (const auto& prefix : non_exist_prefixes) { + hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode()); + // regular_iter->Seek(prefix); + + ASSERT_OK(hash_iter->status()); + ASSERT_TRUE(!hash_iter->Valid()); + } +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows.