Enable hash index for block-based table

Summary: Based on previous patches, this diff finally provides the end-to-end mechanism for users to specify the hash index. Test Plan: Wrote several new unit tests. Reviewers: sdong, haobo, dhruba Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D16539 Use shortened index key for hash-index Summary: I was wrong about the "index builder". Since we now create the index by scanning both the whole table and the index, there is no need to preserve the whole key as the index key. I switched back to the original index format, which is both space efficient and able to support on-the-fly construction of the hash index. In this patch I made minimal changes, since I'm not sure whether we still need the "pluggable index builder"; under the current circumstances it is of no use and somewhat over-engineered. But I'm not sure if we can still exploit its usefulness in the future; otherwise I think I can just burn them with great vengeance. Test Plan: unit tests Reviewers: sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D17745 Conflicts: table/block_based_table_reader.cc table/block_based_table_reader.h
2014-04-10 14:19:43 -07:00 · 2014-04-10 14:19:43 -07:00 · 22f396798e
commit 22f396798e
parent 258eac1772
10 changed files with 545 additions and 107 deletions
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -266,6 +266,8 @@ class DBTest {
  // Sequence of option configurations to try
  enum OptionConfig {
    kDefault,
+    kBlockBasedTableWithPrefixHashIndex,
+    kBlockBasedTableWithWholeKeyHashIndex,
    kPlainTableFirstBytePrefix,
    kPlainTableAllBytesPrefix,
    kVectorRep,
@ -302,7 +304,8 @@ class DBTest {
    kSkipDeletesFilterFirst = 1,
    kSkipUniversalCompaction = 2,
    kSkipMergePut = 4,
-    kSkipPlainTable = 8
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16
  };

  DBTest() : option_config_(kDefault),
@ -343,6 +346,12 @@ class DBTest {
              || option_config_ == kPlainTableFirstBytePrefix)) {
        continue;
      }
+      // Skip the hash-index configurations when the caller passes
+      // kSkipHashIndex in skip_mask.
+      if ((skip_mask & kSkipHashIndex) &&
+          (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
+           option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
+        continue;
+      }
+
      break;
    }

@ -441,6 +450,20 @@ class DBTest {
      case kInfiniteMaxOpenFiles:
        options.max_open_files = -1;
        break;
+      case kBlockBasedTableWithPrefixHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        break;
+      }
+      case kBlockBasedTableWithWholeKeyHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewNoopTransform());
+        break;
+      }
      default:
        break;
    }
@ -1251,7 +1274,7 @@ TEST(DBTest, KeyMayExist) {

    // KeyMayExist function only checks data in block caches, which is not used
    // by plain table format.
-  } while (ChangeOptions(kSkipPlainTable));
+  } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
 }

 TEST(DBTest, NonBlockingIteration) {
@ -5882,7 +5905,9 @@ TEST(DBTest, Randomized) {
      int minimum = 0;
      if (option_config_ == kHashSkipList ||
          option_config_ == kHashLinkList ||
-          option_config_ == kPlainTableFirstBytePrefix) {
+          option_config_ == kPlainTableFirstBytePrefix ||
+          option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
        minimum = 1;
      }
      if (p < 45) {                               // Put
@ -5922,8 +5947,15 @@ TEST(DBTest, Randomized) {
      }

      if ((step % 100) == 0) {
-        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        // For DB instances that use the hash index + block-based table, the
+        // iterator will be invalid right after seeking a non-existent key,
+        // rather than returning a key that is close to it.
+        if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+            option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+          ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+          ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        }
+
        // Save a snapshot from each DB this time that we'll use next
        // time we compare things, to make sure the current state is
        // preserved with the snapshot
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -13,6 +13,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "rocksdb/types.h"
 #include "util/coding.h"
@ -301,4 +302,34 @@ class IterKey {
  void operator=(const IterKey&) = delete;
 };

+// Adapter that lets a SliceTransform written for user keys be applied to
+// internal keys: every method passes its argument through ExtractUserKey()
+// and then forwards the result to the wrapped transform.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  // `transform` is borrowed, not owned (see the note on transform_ below).
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  // Reports the wrapped transform's name.
+  virtual const char* Name() const { return transform_->Name(); }
+
+  // Applies the wrapped transform to the user-key part of `src`.
+  virtual Slice Transform(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  // Asks the wrapped transform whether the user-key part of `src` is in its
+  // domain.
+  virtual bool InDomain(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  // NOTE(review): `dst` is also run through ExtractUserKey() here, i.e. it is
+  // treated the same way as `src` in the methods above -- confirm that
+  // callers pass a slice for which that is appropriate.
+  virtual bool InRange(const Slice& dst) const {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  // Returns the wrapped transform.
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
 }  // namespace rocksdb
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@ -60,6 +60,12 @@ struct BlockBasedTableOptions {
    // A space efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch,
+
+    // The hash index, if enabled, will do the hash lookup when
+    // `ReadOption.prefix_seek == true`. User should also specify
+    // `Options.prefix_extractor` to allow the index block to correctly
+    // extract the prefix of the given key and perform hash table lookup.
+    kHashSearch,
  };

  IndexType index_type = kBinarySearch;
--- a/table/block.cc
+++ b/table/block.cc
@ -11,16 +11,20 @@

 #include "table/block.h"

-#include <vector>
 #include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
 #include "rocksdb/comparator.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"

 namespace rocksdb {

-inline uint32_t Block::NumRestarts() const {
+uint32_t Block::NumRestarts() const {
  assert(size_ >= 2*sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }
@ -92,6 +96,7 @@ class Block::Iter : public Iterator {
  std::string key_;
  Slice value_;
  Status status_;
+  BlockHashIndex* hash_index_;

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
@ -118,16 +123,15 @@ class Block::Iter : public Iterator {
  }

 public:
-  Iter(const Comparator* comparator,
-       const char* data,
-       uint32_t restarts,
-       uint32_t num_restarts)
+  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index)
      : comparator_(comparator),
        data_(data),
        restarts_(restarts),
        num_restarts_(num_restarts),
        current_(restarts_),
-        restart_index_(num_restarts_) {
+        restart_index_(num_restarts_),
+        hash_index_(hash_index) {
    assert(num_restarts_ > 0);
  }

@ -169,45 +173,22 @@ class Block::Iter : public Iterator {
  }

  virtual void Seek(const Slice& target) {
-    // Binary search in restart array to find the first restart point
-    // with a key >= target
-    uint32_t left = 0;
-    uint32_t right = num_restarts_ - 1;
-    while (left < right) {
-      uint32_t mid = (left + right + 1) / 2;
-      uint32_t region_offset = GetRestartPoint(mid);
-      uint32_t shared, non_shared, value_length;
-      const char* key_ptr = DecodeEntry(data_ + region_offset,
-                                        data_ + restarts_,
-                                        &shared, &non_shared, &value_length);
-      if (key_ptr == nullptr || (shared != 0)) {
-        CorruptionError();
-        return;
-      }
-      Slice mid_key(key_ptr, non_shared);
-      if (Compare(mid_key, target) < 0) {
-        // Key at "mid" is smaller than "target".  Therefore all
-        // blocks before "mid" are uninteresting.
-        left = mid;
-      } else {
-        // Key at "mid" is >= "target".  Therefore all blocks at or
-        // after "mid" are uninteresting.
-        right = mid - 1;
-      }
-    }
+    uint32_t index = 0;
+    bool ok = hash_index_ ? HashSeek(target, &index)
+                          : BinarySeek(target, 0, num_restarts_ - 1, &index);

+    if (!ok) {
+      return;
+    }
+    SeekToRestartPoint(index);
    // Linear search (within restart block) for first key >= target
-    SeekToRestartPoint(left);
+
    while (true) {
-      if (!ParseNextKey()) {
-        return;
-      }
-      if (Compare(key_, target) >= 0) {
+      if (!ParseNextKey() || Compare(key_, target) >= 0) {
        return;
      }
    }
  }
-
  virtual void SeekToFirst() {
    SeekToRestartPoint(0);
    ParseNextKey();
@ -257,6 +238,53 @@ class Block::Iter : public Iterator {
      return true;
    }
  }
+  // Binary search over the restart points [left, right] (both inclusive).
+  //
+  // On success returns true and stores in *index the last restart point in
+  // (left, right] whose key is < target, or `left` when there is none (the
+  // key at `left` itself is never compared). Seek() then scans forward from
+  // that restart point to reach the first key >= target.
+  //
+  // Returns false, after calling CorruptionError(), if the entry at a probed
+  // restart point cannot be decoded or has a non-zero shared length; *index
+  // is not written in that case.
+  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+                  uint32_t* index) {
+    assert(left <= right);
+
+    while (left < right) {
+      // Round up so that mid > left and the range always shrinks.
+      uint32_t mid = (left + right + 1) / 2;
+      uint32_t region_offset = GetRestartPoint(mid);
+      uint32_t shared, non_shared, value_length;
+      const char* key_ptr =
+          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
+                      &non_shared, &value_length);
+      // The non-shared bytes are used as the complete key below, so a
+      // non-zero shared length is reported as corruption.
+      if (key_ptr == nullptr || (shared != 0)) {
+        CorruptionError();
+        return false;
+      }
+      Slice mid_key(key_ptr, non_shared);
+      if (Compare(mid_key, target) < 0) {
+        // Key at "mid" is smaller than "target". Therefore all
+        // blocks before "mid" are uninteresting.
+        left = mid;
+      } else {
+        // Key at "mid" is >= "target". Therefore all blocks at or
+        // after "mid" are uninteresting.
+        right = mid - 1;
+      }
+    }
+
+    *index = left;
+    return true;
+  }
+
+  // Uses the hash index to narrow the search to the restart points recorded
+  // for target, then binary-searches within that range.
+  //
+  // Returns true and stores the chosen restart point in *index on success.
+  // Returns false if the hash index has no entry for target, or if
+  // BinarySeek() fails; *index is not written in either case.
+  bool HashSeek(const Slice& target, uint32_t* index) {
+    assert(hash_index_);
+    auto restart_index = hash_index_->GetRestartIndex(target);
+    if (restart_index == nullptr) {
+      // No entry for this target: put current_ back to restarts_ (the value
+      // the constructor starts with) and report failure to Seek().
+      current_ = restarts_;
+      return false;
+    }
+
+    // the elements in restart_array[index : index + num_blocks]
+    // are all with same prefix. We'll do binary search in that small range.
+    auto left = restart_index->first_index;
+    auto right = restart_index->first_index + restart_index->num_blocks - 1;
+    return BinarySeek(target, left, right, index);
+  }
 };

 Iterator* Block::NewIterator(const Comparator* cmp) {
@ -267,8 +295,13 @@ Iterator* Block::NewIterator(const Comparator* cmp) {
  if (num_restarts == 0) {
    return NewEmptyIterator();
  } else {
-    return new Iter(cmp, data_, restart_offset_, num_restarts);
+    return new Iter(cmp, data_, restart_offset_, num_restarts,
+                    hash_index_.get());
  }
 }

+// Installs `hash_index` as this block's hash index. The pointer is handed to
+// hash_index_.reset(), so the block takes it over and any index that was set
+// earlier is released. Iterators created afterwards by NewIterator() are
+// given the new index.
+void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
+  hash_index_.reset(hash_index);
+}
+
 }  // namespace rocksdb
--- a/table/block.h
+++ b/table/block.h
@ -10,6 +10,7 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"

@ -17,6 +18,7 @@ namespace rocksdb {

 struct BlockContents;
 class Comparator;
+class BlockHashIndex;

 class Block {
 public:
@ -26,20 +28,28 @@ class Block {
  ~Block();

  size_t size() const { return size_; }
-  bool   cachable() const { return cachable_; }
+  const char* data() const { return data_; }
+  bool cachable() const { return cachable_; }
+  uint32_t NumRestarts() const;
  CompressionType compression_type() const { return compression_type_; }
+
+  // If hash index lookup is enabled (a hash index has been set through
+  // SetBlockHashIndex), the iterator will do hash lookup for the key prefix.
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
  Iterator* NewIterator(const Comparator* comparator);
-  const char* data() { return data_; }
+  void SetBlockHashIndex(BlockHashIndex* hash_index);

 private:
-  uint32_t NumRestarts() const;
-
  const char* data_;
  size_t size_;
  uint32_t restart_offset_;     // Offset in data_ of restart array
  bool owned_;                  // Block owns data_[]
  bool cachable_;
  CompressionType compression_type_;
+  std::unique_ptr<BlockHashIndex> hash_index_;

  // No copying allowed
  Block(const Block&);
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@ -88,8 +88,7 @@ class IndexBuilder {
  const Comparator* comparator_;
 };

-// This index builder builds space-efficient index block for binary-search-based
-// index.
+// This index builder builds space-efficient index block.
 //
 // Optimizations:
 //  1. Made block's `block_restart_interval` to be 1, which will avoid linear
@ -97,9 +96,9 @@ class IndexBuilder {
 //  2. Shorten the key length for index block. Other than honestly using the
 //     last key in the data block as the index key, we instead find a shortest
 //     substitute key that serves the same function.
-class BinarySearchIndexBuilder : public IndexBuilder {
+class ShortenedIndexBuilder : public IndexBuilder {
 public:
-  explicit BinarySearchIndexBuilder(const Comparator* comparator)
+  explicit ShortenedIndexBuilder(const Comparator* comparator)
      : IndexBuilder(comparator),
        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}

@ -128,11 +127,40 @@ class BinarySearchIndexBuilder : public IndexBuilder {
  BlockBuilder index_block_builder_;
 };

+// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
+// ShortenedIndexBuilder, but preserves the full key instead of the substitute
+// key.
+class FullKeyIndexBuilder : public IndexBuilder {
+ public:
+  explicit FullKeyIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  // Adds the last key of the current block, unmodified, together with the
+  // encoded block handle. `first_key_in_next_block` is not used.
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle) override {
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  // Finishes the underlying block builder and returns its result.
+  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+
+  // Current size estimate reported by the underlying block builder.
+  virtual size_t EstimatedSize() const {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
 // Create a index builder based on its type.
 IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
  switch (type) {
    case BlockBasedTableOptions::kBinarySearch: {
-      return new BinarySearchIndexBuilder(comparator);
+      return new ShortenedIndexBuilder(comparator);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      return new ShortenedIndexBuilder(comparator);
    }
    default: {
      assert(!"Do not recognize the index type ");
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@ -25,6 +25,7 @@

 #include "table/block.h"
 #include "table/filter_block.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"
@ -45,7 +46,9 @@ namespace {
 // The longest the prefix of the cache key used to identify blocks can be.
 // We are using the fact that we know for Posix files the unique ID is three
 // varints.
-const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1;
+// For some reason, compiling for iOS complains that this variable is unused
+const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
+    kMaxVarint64Length * 3 + 1;

 // Read the block identified by "handle" from "file".
 // The only relevant option is options.verify_checksums for now.
@ -105,7 +108,7 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                 Statistics* statistics) {
  auto cache_handle = block_cache->Lookup(key);
  if (cache_handle != nullptr) {
-    BumpPerfCount(&perf_context.block_cache_hit_count);
+    PERF_COUNTER_ADD(block_cache_hit_count, 1);
    // overall cache hit
    RecordTick(statistics, BLOCK_CACHE_HIT);
    // block-type specific cache hit
@ -148,26 +151,20 @@ class BinarySearchIndexReader : public IndexReader {
 public:
  // Read index from the file and create an intance for
  // `BinarySearchIndexReader`.
-  // The return value is a pair, where
-  //  * first element is the status indicating if the operation succeeded.
-  //  * second element is the index reader to be created. On failure, this
-  // element will be nullptr
-  static std::pair<Status, IndexReader*> Create(RandomAccessFile* file,
-                                                const BlockHandle& index_handle,
-                                                Env* env,
-                                                const Comparator* comparator) {
+  // On success, index_reader will be populated; otherwise it will remain
+  // unmodified.
+  static Status Create(RandomAccessFile* file, const BlockHandle& index_handle,
+                       Env* env, const Comparator* comparator,
+                       IndexReader** index_reader) {
    Block* index_block = nullptr;
-    auto s =
-        ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env);
+    auto s = ReadBlockFromFile(file, ReadOptions(), index_handle,
+                               &index_block, env);

-    if (!s.ok()) {
-      // Logically, index_block shouldn't have been populated if any error
-      // occurred.
-      assert(index_block == nullptr);
-      return {s, nullptr};
+    if (s.ok()) {
+      *index_reader = new BinarySearchIndexReader(comparator, index_block);
    }

-    return {s, new BinarySearchIndexReader(comparator, index_block)};
+    return s;
  }

  virtual Iterator* NewIterator() override {
@ -184,19 +181,51 @@ class BinarySearchIndexReader : public IndexReader {
  std::unique_ptr<Block> index_block_;
 };

-// TODO(kailiu) This class is only a stub for now. And the comment below is also
-// not completed.
 // Index that leverages an internal hash table to quicken the lookup for a given
 // key.
+// @param data_iter_gen, equivalent to BlockBasedTable::NewIterator(). But that
+// function requires the index to be initialized. To avoid this problem the
+// external caller will pass a function that can create the iterator over the
+// entries without the table being fully initialized.
 class HashIndexReader : public IndexReader {
 public:
-  static std::pair<Status, IndexReader*> Create(
-      RandomAccessFile* file, const BlockHandle& index_handle, Env* env,
-      const Comparator* comparator, BlockBasedTable* table,
-      const SliceTransform* prefix_extractor) {
-    return {Status::NotSupported("not implemented yet!"),
-            nullptr};  // not finished
+  static Status Create(RandomAccessFile* file, const BlockHandle& index_handle,
+                       Env* env, const Comparator* comparator,
+                       std::function<Iterator*(Iterator*)> data_iter_gen,
+                       const SliceTransform* prefix_extractor,
+                       IndexReader** index_reader) {
+    assert(prefix_extractor);
+    Block* index_block = nullptr;
+    auto s =
+        ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    *index_reader = new HashIndexReader(comparator, index_block);
+    std::unique_ptr<Iterator> index_iter(index_block->NewIterator(nullptr));
+    std::unique_ptr<Iterator> data_iter(
+        data_iter_gen(index_block->NewIterator(nullptr)));
+    auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(),
+                                           index_block->NumRestarts(),
+                                           comparator, prefix_extractor);
+    index_block->SetBlockHashIndex(hash_index);
+    return s;
  }
+
+  virtual Iterator* NewIterator() override {
+    return index_block_->NewIterator(comparator_);
+  }
+
+  virtual size_t size() const override { return index_block_->size(); }
+
+ private:
+  HashIndexReader(const Comparator* comparator, Block* index_block)
+      : IndexReader(comparator), index_block_(index_block) {
+    assert(index_block_ != nullptr);
+  }
+  std::unique_ptr<Block> index_block_;
 };


@ -227,6 +256,11 @@ struct BlockBasedTable::Rep {

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
+  // TODO(kailiu) It is very ugly to use internal key in table, since table
+  // module should not be relying on db module. However to make things easier
+  // and compatible with existing code, we introduce a wrapper that allows
+  // block to extract prefix without knowing if a key is internal or not.
+  unique_ptr<SliceTransform> internal_prefix_transform;
 };

 BlockBasedTable::~BlockBasedTable() {
@ -367,7 +401,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
    // and with a same life-time as this table object.
    IndexReader* index_reader = nullptr;
    // TODO: we never really verify check sum for index block
-    std::tie(s, index_reader) = new_table->CreateIndexReader();
+    s = new_table->CreateIndexReader(&index_reader);

    if (s.ok()) {
      rep->index_reader.reset(index_reader);
@ -751,8 +785,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
  return { filter, cache_handle };
 }

-Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options)
-    const {
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
  // index reader has already been pre-populated.
  if (rep_->index_reader) {
    return rep_->index_reader->NewIterator();
@ -779,7 +812,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options)
  } else {
    // Create index reader and put it in the cache.
    Status s;
-    std::tie(s, index_reader) = CreateIndexReader();
+    s = CreateIndexReader(&index_reader);

    if (!s.ok()) {
      // make sure if something goes wrong, index_reader shall remain intact.
@ -958,11 +991,14 @@ Status BlockBasedTable::Get(
  return s;
 }

+namespace {
 bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
               bool didIO) {
  *reinterpret_cast<bool*>(arg) = didIO;
  return false;
 }
+}  // namespace
+
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
  // We use Get() as it has logic that checks whether we read the
@ -979,7 +1015,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
 //  3. options
 //  4. internal_comparator
 //  5. index_type
-std::pair<Status, IndexReader*> BlockBasedTable::CreateIndexReader() const {
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
  // Some old version of block-based tables don't have index type present in
  // table properties. If that's the case we can safely use the kBinarySearch.
  auto index_type = BlockBasedTableOptions::kBinarySearch;
@ -990,19 +1026,37 @@ std::pair<Status, IndexReader*> BlockBasedTable::CreateIndexReader() const {
        DecodeFixed32(pos->second.c_str()));
  }

+  auto file = rep_->file.get();
+  const auto& index_handle = rep_->index_handle;
+  auto env = rep_->options.env;
+  auto comparator = &rep_->internal_comparator;
+
  switch (index_type) {
    case BlockBasedTableOptions::kBinarySearch: {
-      return BinarySearchIndexReader::Create(
-          rep_->file.get(), rep_->index_handle, rep_->options.env,
-          &rep_->internal_comparator);
+      return BinarySearchIndexReader::Create(file, index_handle, env,
+                                             comparator, index_reader);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      // We need to wrap data with internal_prefix_transform to make sure it can
+      // handle prefix correctly.
+      rep_->internal_prefix_transform.reset(
+          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+      return HashIndexReader::Create(
+          file, index_handle, env, comparator,
+          [&](Iterator* index_iter) {
+            return NewTwoLevelIterator(
+                index_iter, &BlockBasedTable::DataBlockReader,
+                const_cast<BlockBasedTable*>(this), ReadOptions(),
+                rep_->soptions, rep_->internal_comparator);
+          },
+          rep_->internal_prefix_transform.get(), index_reader);
    }
    default: {
      std::string error_message =
          "Unrecognized index type: " + std::to_string(rep_->index_type);
      // equivalent to assert(false), but more informative.
      assert(!error_message.c_str());
-      return {Status::InvalidArgument(error_message.c_str()),
-              nullptr};  // cannot reach here
+      return Status::InvalidArgument(error_message.c_str());
    }
  }
 }
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@ -131,7 +131,7 @@ class BlockBasedTable : public TableReader {
  //  2. index is not present in block cache.
  //  3. We disallowed any io to be performed, that is, read_options ==
  //     kBlockCacheTier
-  Iterator* NewIndexIterator(const ReadOptions& read_options) const;
+  Iterator* NewIndexIterator(const ReadOptions& read_options);

  // Read block cache from block caches (if set): block_cache and
  // block_cache_compressed.
@ -164,7 +164,7 @@ class BlockBasedTable : public TableReader {

  void ReadMeta(const Footer& footer);
  void ReadFilter(const Slice& filter_handle_value);
-  std::pair<Status, IndexReader*> CreateIndexReader() const;
+  Status CreateIndexReader(IndexReader** index_reader);

  // Read the meta block from sst.
  static Status ReadMetaBlock(
--- a/table/block_test.cc
+++ b/table/block_test.cc
@ -3,7 +3,10 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+#include <stdio.h>
 #include <string>
+#include <vector>
+
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
@ -11,9 +14,11 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/format.h"
+#include "table/block_hash_index.h"
 #include "util/random.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@ -25,6 +30,40 @@ static std::string RandomString(Random* rnd, int len) {
  test::RandomString(rnd, len, &r);
  return r;
 }
+// Builds a test key: primary_key and secondary_key formatted with "%6d%4d",
+// followed by padding_size bytes from RandomString() when padding_size is
+// non-zero. `rnd` is only used (passed to RandomString) in that case.
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char buf[50];
+  char *p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+//
+// Primary key ids run over from, from + step, ... for as long as the id is
+// below from + len. For each id, keys_share_prefix keys are appended (with
+// secondary ids 0 .. keys_share_prefix - 1), each generated with padding_size
+// as the padding size. Keys and values are appended to *keys and *values in
+// matching order.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefix
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that shares the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}

 class BlockTest {};

@ -39,24 +78,11 @@ TEST(BlockTest, SimpleTest) {
  std::vector<std::string> values;
  BlockBuilder builder(options, ic.get());
  int num_records = 100000;
-  char buf[10];
-  char* p = &buf[0];

+  GenerateRandomKVs(&keys, &values, 0, num_records);
  // add a bunch of records to a block
  for (int i = 0; i < num_records; i++) {
-    // generate random kvs
-    sprintf(p, "%6d", i);
-    std::string k(p);
-    std::string v = RandomString(&rnd, 100); // 100 byte values
-
-    // write kvs to the block
-    Slice key(k);
-    Slice value(v);
-    builder.Add(key, value);
-
-    // remember kvs in a lookaside array
-    keys.push_back(k);
-    values.push_back(v);
+    builder.Add(keys[i], values[i]);
  }

  // read serialized contents of the block
@ -101,6 +127,114 @@ TEST(BlockTest, SimpleTest) {
  delete iter;
 }

+// Builds a block (restart interval 1, bytewise comparator) from all of the
+// given keys/values and returns its contents. The newly created builder is
+// stored in *builder.
+// NOTE(review): the returned contents hold the Slice returned by Finish() and
+// are marked as not heap allocated, so presumably *builder must outlive
+// them -- confirm. `prefix_group_size` is not used.
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int prefix_group_size = 1) {
+  builder->reset(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+
+  // Add every key/value pair
+  for (size_t i = 0; i < keys.size(); ++i) {
+    (*builder)->Add(keys[i], values[i]);
+  }
+  Slice rawblock = (*builder)->Finish();
+
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  contents.heap_allocated = false;
+
+  return contents;
+}
+
+// Checks hash-index seeks against a block built from keys/values:
+//  * reader1 gets a hash index (fixed prefix of 6 bytes) and every key in
+//    `keys` must be found through it with the matching value;
+//  * for the odd primary ids 1, 3, ... below max_key - 1, the iterator on
+//    reader1 must end up invalid while the iterator on reader2 (no hash
+//    index) must stay valid.
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block reader
+  Block reader1(contents);
+  Block reader2(contents);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  {
+    // Both iterators come from reader1; they are only needed while the hash
+    // index is being created and are deleted right afterwards.
+    auto iter1 = reader1.NewIterator(nullptr);
+    auto iter2 = reader1.NewIterator(nullptr);
+    reader1.SetBlockHashIndex(CreateBlockHashIndex(iter1, iter2, keys.size(),
+                                                   BytewiseComparator(),
+                                                   prefix_extractor.get()));
+
+    delete iter1;
+    delete iter2;
+  }
+
+  std::unique_ptr<Iterator> hash_iter(
+      reader1.NewIterator(BytewiseComparator()));
+
+  std::unique_ptr<Iterator> regular_iter(
+      reader2.NewIterator(BytewiseComparator()));
+
+  // Seek existent keys
+  for (size_t i = 0; i < keys.size(); i++) {
+    hash_iter->Seek(keys[i]);
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    Slice v = hash_iter->value();
+    ASSERT_EQ(v.ToString().compare(values[i]), 0);
+  }
+
+  // Seek non-existent keys.
+  // For hash index, if no key with a given prefix is found, the iterator will
+  // simply be set as invalid; whereas the binary search based iterator will
+  // return the one that is closest.
+  for (int i = 1; i < max_key - 1; i += 2) {
+    auto key = GenerateKey(i, 0, 0, nullptr);
+    hash_iter->Seek(key);
+    ASSERT_TRUE(!hash_iter->Valid());
+
+    regular_iter->Seek(key);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// In this test case, no two keys share the same prefix.
+TEST(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* first key id */,
+                    kMaxKey /* last key id */, 2 /* step */,
+                    8 /* padding size (8 bytes randomly generated suffix) */);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
+// Hash-index seeks when several keys share each prefix.
+TEST(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // for each prefix, there will be 5 keys that start with it.
+  const int kPrefixGroup = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate keys with same prefix.
+  GenerateRandomKVs(&keys, &values, 0,  // first key id
+                    kMaxKey,            // last key id
+                    2,                  // step
+                    10,                 // padding size,
+                    kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(contents, kMaxKey, keys, values);
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -1055,6 +1055,116 @@ static std::string RandomString(Random* rnd, int len) {
  return r;
 }

+// Adds one entry to the table constructor `c`.
+//
+// The entry's user key is `prefix` followed by `suffix_len` characters
+// produced by RandomString(); it is wrapped in an InternalKey with sequence
+// number 0 and type kTypeValue. The stored value is always "v".
+//
+// @param c           constructor that receives the encoded key/value pair.
+// @param prefix      leading bytes of the user key.
+// @param suffix_len  length passed to RandomString() for the key suffix
+//                    (defaults to 800).
+void AddInternalKey(TableConstructor* c, const std::string prefix,
+                    int suffix_len = 800) {
+  // Function-local static: one generator, seeded with 1023, is shared by all
+  // calls.
+  static Random rnd(1023);
+  // Use the caller-supplied suffix length rather than a fixed constant.
+  InternalKey k(prefix + RandomString(&rnd, suffix_len), 0, kTypeValue);
+  c->Add(k.Encode().ToString(), "v");
+}
+
+// Builds a block-based table that uses the hash index (kHashSearch) with a
+// 3-byte fixed prefix extractor, then checks Seek() behavior for:
+//   * prefixes that exist (target sorts before every key with that prefix),
+//   * every key actually stored in the table,
+//   * targets that sort after every key with an existing prefix,
+//   * prefixes that no stored key has.
+TEST(TableTest, HashIndexTest) {
+  TableConstructor c(BytewiseComparator());
+
+  // Keys are identified by their first 4 characters; the prefix extractor
+  // below only looks at the first 3. The key/value pairs are made big enough
+  // to fill blocks; the test expects these 10 keys to end up in 5 data blocks.
+  const char* const kKeyHeads[] = {
+      "0015", "0035",
+      "0054", "0055",
+      "0056", "0057",
+      "0058", "0075",
+      "0076", "0095",
+  };
+  for (const char* key_head : kKeyHeads) {
+    AddInternalKey(&c, key_head);
+  }
+
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_options;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.block_cache = NewLRUCache(1024);
+  options.block_size = 1700;
+
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  c.Finish(options, *comparator, &keys, &kvmap);
+  auto reader = c.table_reader();
+
+  auto props = c.table_reader()->GetTableProperties();
+  ASSERT_EQ(5u, props->num_data_blocks);
+
+  std::unique_ptr<Iterator> iter(reader->NewIterator(ReadOptions()));
+
+  // -- Targets that do not exist themselves but share a prefix with stored
+  // keys. Seeking to the bare prefix must land on the first key carrying it.
+  std::vector<std::string> existing_prefixes = {"001", "003", "005", "007",
+                                                "009"};
+  std::vector<std::string> first_key_with_prefix = {keys[0], keys[1], keys[2],
+                                                    keys[7], keys[9], };
+
+  for (size_t idx = 0; idx < existing_prefixes.size(); ++idx) {
+    InternalKey target(existing_prefixes[idx], 0, kTypeValue);
+    iter->Seek(target.Encode());
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+
+    // The iterator must sit on the lowest key with this prefix.
+    ASSERT_EQ(first_key_with_prefix[idx], iter->key().ToString());
+    ASSERT_EQ("v", iter->value().ToString());
+  }
+
+  // -- Every stored key must be found, together with its value.
+  for (const auto& entry : kvmap) {
+    const std::string user_key = ExtractUserKey(entry.first).ToString();
+    iter->Seek(user_key);
+
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+
+    ASSERT_EQ(entry.first, iter->key().ToString());
+    ASSERT_EQ(entry.second, iter->value().ToString());
+  }
+
+  // -- Targets greater than every stored key with the same prefix. The
+  // iterator is expected to land on the next stored key, except for the last
+  // prefix, where nothing follows and the iterator becomes invalid.
+  std::vector<std::string> next_key_after_prefix = {keys[1], keys[2], keys[7],
+                                                    keys[9], };
+  const size_t prefix_count = existing_prefixes.size();
+  for (size_t idx = 0; idx < prefix_count; ++idx) {
+    const std::string beyond_prefix = existing_prefixes[idx] + "9";
+    InternalKey target(beyond_prefix, 0, kTypeValue);
+    iter->Seek(target.Encode());
+
+    ASSERT_OK(iter->status());
+    if (idx + 1 < prefix_count) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(next_key_after_prefix[idx], iter->key().ToString());
+      ASSERT_EQ("v", iter->value().ToString());
+    } else {
+      // Past the last key in the table.
+      ASSERT_TRUE(!iter->Valid());
+    }
+  }
+
+  // -- Prefixes that match none of the stored keys leave the iterator invalid.
+  std::vector<std::string> missing_prefixes = {"002", "004", "006", "008"};
+  for (const auto& missing : missing_prefixes) {
+    InternalKey target(missing, 0, kTypeValue);
+    iter->Seek(target.Encode());
+
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!iter->Valid());
+  }
+}
+
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.