Improve point-lookup performance using a data block hash index (#4174)

Summary: Add hash index support to data blocks, which helps to reduce the CPU utilization of point-lookup operations. This feature is backward compatible with data blocks created without the hash index. It is disabled by default unless `BlockBasedTableOptions::data_block_index_type` is set to `kDataBlockBinaryAndHash`. The DB size will be bigger with the hash index option, as a hash table is added at the end of each data block. If the hash utilization ratio is 1:1, the space overhead is one byte per key. The hash table utilization ratio is adjustable using `BlockBasedTableOptions::data_block_hash_table_util_ratio`. A lower utilization ratio improves point-lookup efficiency more, but also takes more space. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4174 Differential Revision: D8965914 Pulled By: fgwu fbshipit-source-id: 1c6bae5d1fc39c80282d8890a72e9e67bc247198
2018-08-15 14:27:47 -07:00 · 2018-08-15 14:27:47 -07:00 · 19ec44fd39
commit 19ec44fd39
parent 8ae2bf5331
24 changed files with 1248 additions and 258 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -550,6 +550,7 @@ set(SOURCES
        table/cuckoo_table_factory.cc
        table/cuckoo_table_reader.cc
        table/data_block_hash_index.cc
+        table/data_block_footer.cc
        table/flush_block_policy.cc
        table/format.cc
        table/full_filter_block.cc
--- a/HISTORY.md
+++ b/HISTORY.md
@ -4,6 +4,7 @@
 ### New Features
 * Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used.
 * Add a new tool: trace_analyzer. Trace_analyzer analyzes the trace file generated by using trace_replay API. It can convert the binary format trace file to a human readable txt file, output the statistics of the analyzed query types such as access statistics and size statistics, combining the dumped whole key space file to analyze, support query correlation analyzing, and etc. Current supported query types are: Get, Put, Delete, SingleDelete, DeleteRange, Merge, Iterator (Seek, SeekForPrev only).
+* Add hash index support to data blocks, which helps reduce the CPU utilization of point-lookup operations. This feature is backward compatible with data blocks created without the hash index. It is disabled by default unless BlockBasedTableOptions::data_block_index_type is set to kDataBlockBinaryAndHash.
 ### Bug Fixes
 * Fix a bug in misreporting the estimated partition index size in properties block.

--- a/3
+++ b/3
@ -171,6 +171,7 @@ cpp_library(
        "table/cuckoo_table_builder.cc",
        "table/cuckoo_table_factory.cc",
        "table/cuckoo_table_reader.cc",
+        "table/data_block_footer.cc",
        "table/data_block_hash_index.cc",
        "table/flush_block_policy.cc",
        "table/format.cc",
@ -270,8 +271,8 @@ cpp_library(
        "utilities/redis/redis_lists.cc",
        "utilities/simulator_cache/sim_cache.cc",
        "utilities/spatialdb/spatial_db.cc",
-        "utilities/trace/file_trace_reader_writer.cc",
        "utilities/table_properties_collectors/compact_on_deletion_collector.cc",
+        "utilities/trace/file_trace_reader_writer.cc",
        "utilities/transactions/optimistic_transaction.cc",
        "utilities/transactions/optimistic_transaction_db_impl.cc",
        "utilities/transactions/pessimistic_transaction.cc",
--- a/include/rocksdb/comparator.h
+++ b/include/rocksdb/comparator.h
@ -74,6 +74,12 @@ class Comparator {
                                              const Slice& /*t*/) const {
    return false;
  }
+
+  // return true if two keys with different byte sequences can be regarded
+  // as equal by this comparator.
+  // The major use case is to determine if DataBlockHashIndex is compatible
+  // with the customized comparator.
+  virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
 };

 // Return a builtin comparator that uses lexicographic byte-wise
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@ -101,14 +101,18 @@ struct BlockBasedTableOptions {
  IndexType index_type = kBinarySearch;

  // The index type that will be used for the data block.
-  // The kDataBlockHashSearch index type is not yet implemented.
  enum DataBlockIndexType : char {
    kDataBlockBinarySearch = 0,  // traditional block type
-    kDataBlockHashSearch = 1,     // additional hash index appended to the end.
+    kDataBlockBinaryAndHash = 1, // additional hash index
  };

  DataBlockIndexType data_block_index_type = kDataBlockBinarySearch;

+  // #entries/#buckets. It is valid only when data_block_index_type is
+  // kDataBlockBinaryAndHash.
+  double data_block_hash_table_util_ratio = 0.75;
+
+
  // This option is now deprecated. No matter what value it is set to,
  // it will behave as if hash_index_allow_collision=true.
  bool hash_index_allow_collision = true;
--- a/options/options.cc
+++ b/options/options.cc
@ -480,7 +480,8 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
  BlockBasedTableOptions block_based_options;
  block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
  block_based_options.data_block_index_type =
-      BlockBasedTableOptions::kDataBlockBinarySearch;
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  block_based_options.data_block_hash_table_util_ratio = 0.75;
  block_based_options.filter_policy.reset(NewBloomFilterPolicy(10));
  block_based_options.block_cache =
      NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@ -1567,8 +1567,8 @@ std::unordered_map<std::string, BlockBasedTableOptions::DataBlockIndexType>
    OptionsHelper::block_base_table_data_block_index_type_string_map = {
        {"kDataBlockBinarySearch",
         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch},
-        {"kDataBlockHashSearch",
-         BlockBasedTableOptions::DataBlockIndexType::kDataBlockHashSearch}};
+        {"kDataBlockBinaryAndHash",
+         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}};

 std::unordered_map<std::string, EncodingType>
    OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain},
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@ -142,7 +142,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
      "pin_l0_filter_and_index_blocks_in_cache=1;"
      "pin_top_level_index_and_filter=1;"
      "index_type=kHashSearch;"
-      "data_block_index_type=kDataBlockHashSearch;"
+      "data_block_index_type=kDataBlockBinaryAndHash;"
+      "data_block_hash_table_util_ratio=0.75;"
      "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
      "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
      "block_size_deviation=8;block_restart_interval=4; "
--- a/src.mk
+++ b/src.mk
@ -104,6 +104,7 @@ LIB_SOURCES =                                                   \
  table/cuckoo_table_factory.cc                                 \
  table/cuckoo_table_reader.cc                                  \
  table/data_block_hash_index.cc                                \
+  table/data_block_footer.cc                                    \
  table/flush_block_policy.cc                                   \
  table/format.cc                                               \
  table/full_filter_block.cc                                    \
--- a/table/block.cc
+++ b/table/block.cc
@ -20,6 +20,7 @@
 #include "port/stack_trace.h"
 #include "rocksdb/comparator.h"
 #include "table/block_prefix_index.h"
+#include "table/data_block_footer.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"
@ -223,6 +224,116 @@ void DataBlockIter::Seek(const Slice& target) {
  }
 }

+// Optimized Seek for point lookup for an internal key `target`
+// target = "seek_user_key @ type | seqno".
+//
+// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex, this function behaves identically to Seek().
+//
+// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex:
+//
+// If the return value is FALSE, iter location is undefined, and it means:
+// 1) there is no key in this block falling into the range:
+//    ["seek_user_key @ type | seqno", "seek_user_key @ type |  0"],
+//    inclusive; AND
+// 2) the last key of this block has a greater user_key than seek_user_key
+//
+// If the return value is TRUE, iter location has two possibilities:
+// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
+//    this case, it points to the first key_ with a larger user_key or a
+//    matching user_key with a seqno no greater than the seeking seqno.
+// 2) If the iter is invalid, it means either the block has no such user_key,
+//    or the block ends with a matching user_key but with a larger seqno.
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+  Slice user_key = ExtractUserKey(target);
+  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+  uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key);
+
+  if (entry == kNoEntry) {
+    // Even if we cannot find the user_key in this block, the result may
+    // exist in the next block. Consider this example:
+    //
+    // Block N:    [aab@100, ... , app@120]
+    // boundary key: axy@50 (we make minimal assumption about a boundary key)
+    // Block N+1:  [axy@10, ...   ]
+    //
+    // If seek_key = axy@60, the search will start from Block N.
+    // Even if the user_key is not found in the hash map, the caller still
+    // has to continue searching the next block. So we invalidate the
+    // iterator to tell the caller to go on.
+    current_ = restarts_; // Invalidate the iter
+    return true;
+  }
+
+  if (entry == kCollision) {
+    // HashSeek not effective, falling back
+    Seek(target);
+    return true;
+  }
+
+  uint32_t restart_index = entry;
+
+  // check if the key is in the restart_interval
+  assert(restart_index < num_restarts_);
+  SeekToRestartPoint(restart_index);
+
+  const char* limit = nullptr;
+  if (restart_index_ + 1 < num_restarts_) {
+    limit = data_ + GetRestartPoint(restart_index_ + 1);
+  } else {
+    limit = data_ + restarts_;
+  }
+
+  while (true) {
+    // Here we only linear seek the target key inside the restart interval.
+    // If a key does not exist inside a restart interval, we avoid
+    // further searching the block content across restart interval boundary.
+    //
+    // TODO(fwu): check the left and right boundary of the restart interval
+    // to avoid linear seek a target key that is out of range.
+    if (!ParseNextDataKey(limit) || Compare(key_, target) >= 0) {
+      // we stop at the first potential matching user key.
+      break;
+    }
+  }
+
+  if (current_ == restarts_) {
+    // Search reaches to the end of the block. There are two possibilities:
+    // 1) there is only one user_key match in the block (otherwise collision).
+    //    the matching user_key resides in the last restart interval.
+    //    it is the last key of the restart interval and of the block too.
+    //    ParseNextDataKey() skipped it as its seqno is newer.
+    //
+    // 2) The seek_key is a false positive and got hashed to the last restart
+    //    interval.
+    //    All existing keys in the restart interval are less than seek_key.
+    //
+    // The result may exist in the next block in either case, so may_exist is
+    // returned as true.
+    return true;
+  }
+
+  if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) {
+    // the key is not in this block and cannot be at the next block either.
+    // return false to tell the caller to break from the top-level for-loop
+    return false;
+  }
+
+  // Here we are conservative and only support a limited set of cases
+  ValueType value_type = ExtractValueType(key_.GetKey());
+  if (value_type != ValueType::kTypeValue &&
+      value_type != ValueType::kTypeDeletion &&
+      value_type != ValueType::kTypeSingleDeletion &&
+      value_type != ValueType::kTypeBlobIndex) {
+    Seek(target);
+    return true;
+  }
+
+  // Result found, and the iter is correctly set.
+  return true;
+}
+
 void IndexBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  if (!key_includes_seq_) {
@ -329,10 +440,13 @@ void BlockIter<TValue>::CorruptionError() {
  value_.clear();
 }

-bool DataBlockIter::ParseNextDataKey() {
+bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
-  const char* limit = data_ + restarts_;  // Restarts come right after data
+  if (!limit) {
+    limit = data_ + restarts_;  // Restarts come right after data
+  }
+
  if (p >= limit) {
    // No more entries to return.  Mark as invalid.
    current_ = restarts_;
@ -608,7 +722,34 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) {

 uint32_t Block::NumRestarts() const {
   assert(size_ >= 2*sizeof(uint32_t));
-  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // We ensure a block with HashIndex is less than 64KiB in BlockBuilder.
+    // Therefore the footer cannot be encoded as a packed index type and
+    // num_restarts.
+    // This check ensures a legacy block with a very large num_restarts,
+    // i.e. >= 0x80000000, is interpreted correctly as having no HashIndex.
+    // If a legacy block has a num_restarts >= 0x80000000, size_ will be
+    // much larger than 64KiB.
+    return num_restarts;
+  }
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return num_restarts;
+}
+
+BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
+  assert(size_ >= 2 * sizeof(uint32_t));
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // The check is for the same reason as that in NumRestarts()
+    return BlockBasedTableOptions::kDataBlockBinarySearch;
+  }
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return index_type;
 }

 Block::~Block() { TEST_SYNC_POINT("Block::~Block"); }
@ -628,12 +769,42 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
    // Should only decode restart points for uncompressed blocks
    if (compression_type() == kNoCompression) {
      num_restarts_ = NumRestarts();
-      restart_offset_ =
-          static_cast<uint32_t>(size_) - (1 + num_restarts_) * sizeof(uint32_t);
-      if (restart_offset_ > size_ - sizeof(uint32_t)) {
-        // The size is too small for NumRestarts() and therefore
-        // restart_offset_ wrapped around.
-        size_ = 0;
+      switch (IndexType()) {
+        case BlockBasedTableOptions::kDataBlockBinarySearch:
+          restart_offset_ = static_cast<uint32_t>(size_) -
+                            (1 + num_restarts_) * sizeof(uint32_t);
+          if (restart_offset_ > size_ - sizeof(uint32_t)) {
+            // The size is too small for NumRestarts() and therefore
+            // restart_offset_ wrapped around.
+            size_ = 0;
+          }
+          break;
+        case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+          if (size_ < sizeof(uint32_t) /* block footer */ +
+                          sizeof(uint16_t)  /* NUM_BUCK */) {
+            size_ = 0;
+            break;
+          }
+
+          uint16_t map_offset;
+          data_block_hash_index_.Initialize(
+              contents.data.data(),
+              static_cast<uint16_t>(
+                  contents.data.size() - sizeof(uint32_t)),     /*chop off
+                                                            NUM_RESTARTS*/
+              &map_offset);
+
+          restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
+
+          if (restart_offset_ > map_offset) {
+            // map_offset is too small for NumRestarts() and
+            // therefore restart_offset_ wrapped around.
+            size_ = 0;
+            break;
+          }
+          break;
+        default:
+          size_ = 0;  // Error marker
      }
    }
  }
@ -665,8 +836,10 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
-    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
-                         global_seqno_, read_amp_bitmap_.get(), cachable());
+    ret_iter->Initialize(
+        cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
+        read_amp_bitmap_.get(), cachable(),
+        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
@ -703,7 +876,7 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp,
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
                         prefix_index_ptr, key_includes_seq, value_is_full,
-                         cachable());
+                         cachable(), nullptr /* data_block_hash_index */);
  }

  return ret_iter;
--- a/table/block.h
+++ b/table/block.h
@ -22,14 +22,16 @@

 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
+#include "format.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
 #include "table/block_prefix_index.h"
+#include "table/data_block_hash_index.h"
 #include "table/internal_iterator.h"
 #include "util/random.h"
 #include "util/sync_point.h"
-#include "format.h"

 namespace rocksdb {

@ -155,6 +157,7 @@ class Block {
  // The additional memory space taken by the block data.
  size_t usable_size() const { return contents_.usable_size(); }
  uint32_t NumRestarts() const;
+  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
  CompressionType compression_type() const {
    return contents_.compression_type;
  }
@ -203,6 +206,8 @@ class Block {
  // the encoded value (kDisableGlobalSequenceNumber means disabled)
  const SequenceNumber global_seqno_;

+  DataBlockHashIndex data_block_hash_index_;
+
  // No copying allowed
  Block(const Block&) = delete;
  void operator=(const Block&) = delete;
@ -330,22 +335,27 @@ class DataBlockIter final : public BlockIter<Slice> {
  DataBlockIter(const Comparator* comparator, const Comparator* user_comparator,
                const char* data, uint32_t restarts, uint32_t num_restarts,
                SequenceNumber global_seqno,
-                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned)
+                BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned,
+                DataBlockHashIndex* data_block_hash_index)
      : DataBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
-               global_seqno, read_amp_bitmap, block_contents_pinned);
+               global_seqno, read_amp_bitmap, block_contents_pinned,
+               data_block_hash_index);
  }
  void Initialize(const Comparator* comparator,
-                  const Comparator* /*user_comparator*/, const char* data,
+                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  SequenceNumber global_seqno,
                  BlockReadAmpBitmap* read_amp_bitmap,
-                  bool block_contents_pinned) {
+                  bool block_contents_pinned,
+                  DataBlockHashIndex* data_block_hash_index) {
    InitializeBase(comparator, data, restarts, num_restarts, global_seqno,
                   block_contents_pinned);
+    user_comparator_ = user_comparator;
    key_.SetIsUserKey(false);
    read_amp_bitmap_ = read_amp_bitmap;
    last_bitmap_offset_ = current_ + 1;
+    data_block_hash_index_ = data_block_hash_index;
  }

  virtual Slice value() const override {
@ -361,6 +371,15 @@ class DataBlockIter final : public BlockIter<Slice> {

  virtual void Seek(const Slice& target) override;

+  inline bool SeekForGet(const Slice& target) {
+    if (!data_block_hash_index_) {
+      Seek(target);
+      return true;
+    }
+
+    return SeekForGetImpl(target);
+  }
+
  virtual void SeekForPrev(const Slice& target) override;

  virtual void Prev() override;
@ -408,11 +427,16 @@ class DataBlockIter final : public BlockIter<Slice> {
  std::vector<CachedPrevEntry> prev_entries_;
  int32_t prev_entries_idx_ = -1;

-  inline bool ParseNextDataKey();
+  DataBlockHashIndex* data_block_hash_index_;
+  const Comparator* user_comparator_;
+
+  inline bool ParseNextDataKey(const char* limit = nullptr);

  inline int Compare(const IterKey& ikey, const Slice& b) const {
    return comparator_->Compare(ikey.GetInternalKey(), b);
  }
+
+  bool SeekForGetImpl(const Slice& target);
 };

 class IndexBlockIter final : public BlockIter<BlockHandle> {
@ -435,14 +459,15 @@ class IndexBlockIter final : public BlockIter<BlockHandle> {
      : IndexBlockIter() {
    Initialize(comparator, user_comparator, data, restarts, num_restarts,
               prefix_index, key_includes_seq, block_contents_pinned,
-               value_is_full);
+               value_is_full, nullptr /* data_block_hash_index */);
  }

  void Initialize(const Comparator* comparator,
                  const Comparator* user_comparator, const char* data,
                  uint32_t restarts, uint32_t num_restarts,
                  BlockPrefixIndex* prefix_index, bool key_includes_seq,
-                  bool value_is_full, bool block_contents_pinned) {
+                  bool value_is_full, bool block_contents_pinned,
+                  DataBlockHashIndex* /*data_block_hash_index*/) {
    InitializeBase(comparator, data, restarts, num_restarts,
                   kDisableGlobalSequenceNumber, block_contents_pinned);
    key_includes_seq_ = key_includes_seq;
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@ -303,7 +303,13 @@ struct BlockBasedTableBuilder::Rep {
                      ? std::min(table_options.block_size, kDefaultPageSize)
                      : 0),
        data_block(table_options.block_restart_interval,
-                   table_options.use_delta_encoding),
+                   table_options.use_delta_encoding,
+                   false /* use_value_delta_encoding */,
+                   icomparator.user_comparator()
+                           ->CanKeysWithDifferentByteContentsBeEqual()
+                       ? BlockBasedTableOptions::kDataBlockBinarySearch
+                       : table_options.data_block_index_type,
+                   table_options.data_block_hash_table_util_ratio),
        range_del_block(1 /* block_restart_interval */),
        internal_prefix_transform(_moptions.prefix_extractor.get()),
        compression_dict(_compression_dict),
--- a/table/block_based_table_factory.cc
+++ b/table/block_based_table_factory.cc
@ -259,6 +259,13 @@ Status BlockBasedTableFactory::SanitizeOptions(
    return Status::InvalidArgument(
        "Block alignment requested but block size is not a power of 2");
  }
+  if (table_options_.data_block_index_type ==
+      BlockBasedTableOptions::kDataBlockBinaryAndHash &&
+      table_options_.data_block_hash_table_util_ratio <= 0) {
+    return Status::InvalidArgument(
+        "data_block_hash_table_util_ratio should be greater than 0 when "
+        "data_block_index_type is set to kDataBlockBinaryAndHash");
+  }
  return Status::OK();
 }

--- a/table/block_based_table_factory.h
+++ b/table/block_based_table_factory.h
@ -127,6 +127,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct BlockBasedTableOptions, data_block_index_type),
          OptionType::kBlockBasedTableDataBlockIndexType,
          OptionVerificationType::kNormal, false, 0}},
+        {"data_block_hash_table_util_ratio",
+         {offsetof(struct BlockBasedTableOptions,
+                   data_block_hash_table_util_ratio),
+          OptionType::kDouble, OptionVerificationType::kNormal, false, 0}},
        {"checksum",
         {offsetof(struct BlockBasedTableOptions, checksum),
          OptionType::kChecksumType, OptionVerificationType::kNormal, false,
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@ -2378,8 +2378,17 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
          break;
        }

+        bool may_exist = biter.SeekForGet(key);
+        if (!may_exist) {
+          // HashSeek cannot find the key in this block and the iter is not
+          // at the end of the block, i.e. the key cannot be in the following
+          // blocks either. In this case, the seek_key cannot be found, so we
+          // break from the top level for-loop.
+          break;
+        }
+
        // Call the *saver function on each entry/block until it returns false
-        for (biter.Seek(key); biter.Valid(); biter.Next()) {
+        for (; biter.Valid(); biter.Next()) {
          ParsedInternalKey parsed_key;
          if (!ParseInternalKey(biter.key(), &parsed_key)) {
            s = Status::Corruption(Slice());
--- a/table/block_builder.cc
+++ b/table/block_builder.cc
@ -35,20 +35,34 @@

 #include <algorithm>
 #include <assert.h>
-#include "rocksdb/comparator.h"
 #include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/data_block_footer.h"
 #include "util/coding.h"

 namespace rocksdb {

-BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding,
-                           bool use_value_delta_encoding)
+BlockBuilder::BlockBuilder(
+    int block_restart_interval, bool use_delta_encoding,
+    bool use_value_delta_encoding,
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    double data_block_hash_table_util_ratio)
    : block_restart_interval_(block_restart_interval),
      use_delta_encoding_(use_delta_encoding),
      use_value_delta_encoding_(use_value_delta_encoding),
      restarts_(),
      counter_(0),
      finished_(false) {
+  switch (index_type) {
+    case BlockBasedTableOptions::kDataBlockBinarySearch:
+      break;
+    case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+      data_block_hash_index_builder_.Initialize(
+          data_block_hash_table_util_ratio);
+      break;
+    default:
+      assert(0);
+  }
  assert(block_restart_interval_ >= 1);
  restarts_.push_back(0);       // First restart point is at offset 0
  estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
@ -62,6 +76,9 @@ void BlockBuilder::Reset() {
  counter_ = 0;
  finished_ = false;
  last_key_.clear();
+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Reset();
+  }
 }

 size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
@ -89,6 +106,7 @@ size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value)
    estimate += VarintLength(value.size());  // varint for value length.
  }

+  // TODO(fwu): add the delta of the DataBlockHashIndex
  return estimate;
 }

@ -97,7 +115,21 @@ Slice BlockBuilder::Finish() {
  for (size_t i = 0; i < restarts_.size(); i++) {
    PutFixed32(&buffer_, restarts_[i]);
  }
-  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
+
+  uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
+  BlockBasedTableOptions::DataBlockIndexType index_type =
+    BlockBasedTableOptions::kDataBlockBinarySearch;
+  if (data_block_hash_index_builder_.Valid() &&
+      CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
+    data_block_hash_index_builder_.Finish(buffer_);
+    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  }
+
+  // footer is a packed format of data_block_index_type and num_restarts
+  uint32_t block_footer = PackIndexTypeAndNumRestarts(
+      index_type, num_restarts);
+
+  PutFixed32(&buffer_, block_footer);
  finished_ = true;
  return Slice(buffer_);
 }
@ -154,6 +186,11 @@ void BlockBuilder::Add(const Slice& key, const Slice& value,
    buffer_.append(value.data(), value.size());
  }

+  if (data_block_hash_index_builder_.Valid()) {
+    data_block_hash_index_builder_.Add(ExtractUserKey(key),
+                                       restarts_.size() - 1);
+  }
+
  counter_++;
  estimate_ += buffer_.size() - curr_size;
 }
--- a/table/block_builder.h
+++ b/table/block_builder.h
@ -12,6 +12,8 @@

 #include <stdint.h>
 #include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "table/data_block_hash_index.h"

 namespace rocksdb {

@ -22,7 +24,10 @@ class BlockBuilder {

  explicit BlockBuilder(int block_restart_interval,
                        bool use_delta_encoding = true,
-                        bool use_value_delta_encoding = false);
+                        bool use_value_delta_encoding = false,
+                        BlockBasedTableOptions::DataBlockIndexType index_type =
+                            BlockBasedTableOptions::kDataBlockBinarySearch,
+                        double data_block_hash_table_util_ratio = 0.75);

  // Reset the contents as if the BlockBuilder was just constructed.
  void Reset();
@ -39,7 +44,11 @@ class BlockBuilder {

  // Returns an estimate of the current (uncompressed) size of the block
  // we are building.
-  inline size_t CurrentSizeEstimate() const { return estimate_; }
+  inline size_t CurrentSizeEstimate() const {
+    return estimate_ + (data_block_hash_index_builder_.Valid()
+                            ? data_block_hash_index_builder_.EstimateSize()
+                            : 0);
+  }

  // Returns an estimated block size after appending key and value.
  size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const;
@ -62,6 +71,7 @@ class BlockBuilder {
  int                   counter_;   // Number of entries emitted since restart
  bool                  finished_;  // Has Finish() been called?
  std::string           last_key_;
+  DataBlockHashIndexBuilder data_block_hash_index_builder_;
 };

 }  // namespace rocksdb
--- a/table/data_block_footer.cc
+++ b/table/data_block_footer.cc
@ -0,0 +1,60 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "data_block_footer.h"
+
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+const int kDataBlockIndexTypeBitShift = 31;
+
+// 0x7FFFFFFF
+const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+// 0x7FFFFFFF
+const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts) {
+  if (num_restarts > kMaxNumRestarts) {
+    assert(0);  // mute travis "unused" warning
+  }
+
+  uint32_t block_footer = num_restarts;
+  if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
+    block_footer |= 1u << kDataBlockIndexTypeBitShift;
+  } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
+      assert(0);
+  }
+
+  return block_footer;
+}
+
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts) {
+  if (index_type) {
+    if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
+      *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    } else {
+      *index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
+    }
+  }
+
+  if (num_restarts) {
+    *num_restarts = block_footer & kNumRestartsMask;
+    assert(*num_restarts <= kMaxNumRestarts);
+  }
+}
+
+} // namespace rocksdb
--- a/table/data_block_footer.h
+++ b/table/data_block_footer.h
@ -0,0 +1,26 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+// Builds the 32-bit block footer from the index type and the restart count.
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts);
+
+// Splits a block footer into index type and restart count (null outputs skipped).
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts);
+
+} // namespace rocksdb
--- a/table/data_block_hash_index.cc
+++ b/table/data_block_hash_index.cc
@ -12,115 +12,80 @@

 namespace rocksdb {

-const uint32_t kSeed = 2018;
-const uint32_t kSeed_tag = 214; /* second hash seed */
-
-inline uint16_t HashToBucket(const Slice& s, uint16_t num_buckets) {
-  return static_cast<uint16_t>(
-      rocksdb::Hash(s.data(), s.size(), kSeed) % num_buckets);
-}
-
 void DataBlockHashIndexBuilder::Add(const Slice& key,
-                                    const uint16_t& restart_index) {
-  uint16_t idx = HashToBucket(key, num_buckets_);
-  /* push a TAG to avoid false postive */
-  /* the TAG is the hash function value of another seed */
-  uint16_t tag = static_cast<uint16_t>(
-      rocksdb::Hash(key.data(), key.size(), kSeed_tag));
-  buckets_[idx].push_back(tag);
-  buckets_[idx].push_back(restart_index);
-  estimate_ += 2 * sizeof(uint16_t);
+                                    const size_t restart_index) {
+  assert(Valid());  // builder must be initialized and not yet invalidated
+  if (restart_index > kMaxRestartSupportedByHashIndex) {
+    valid_ = false;  // index larger than the hash index supports
+    return;
+  }
+  // Only record (hash, restart index) here; buckets are filled in Finish().
+  uint32_t hash_value = GetSliceHash(key);
+  hash_and_restart_pairs_.emplace_back(hash_value,
+                                       static_cast<uint8_t>(restart_index));
 }

 void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
-  // offset is the byte offset within the buffer
-  std::vector<uint16_t> bucket_offsets(num_buckets_, 0);
+  assert(Valid());
+  uint16_t num_buckets = static_cast<uint16_t>(
+      static_cast<double>(hash_and_restart_pairs_.size()) / util_ratio_);
+  if (num_buckets == 0) {
+    num_buckets = 1;  // sanity check: always emit at least one bucket
+  }

-  uint16_t map_start = static_cast<uint16_t>(buffer.size());
+  // The built-in hash does not distribute strings well across buckets
+  // when num_buckets is a power of two, resulting in many hash
+  // collisions.
+  // We make num_buckets odd to avoid this issue.
+  num_buckets |= 1;

-  // write each bucket to the string
-  for (uint16_t i = 0; i < num_buckets_; i++) {
-    // remember the start offset of the buckets in bucket_offsets
-    bucket_offsets[i] = static_cast<uint16_t>(buffer.size());
-    for (uint16_t elem : buckets_[i]) {
-      // the elem is alternative "TAG" and "offset"
-      PutFixed16(&buffer, elem);
+  std::vector<uint8_t> buckets(num_buckets, kNoEntry);
+  // fill the buckets: each ends up as a restart_index, kNoEntry or kCollision
+  for (auto& entry : hash_and_restart_pairs_) {
+    uint32_t hash_value = entry.first;
+    uint8_t restart_index = entry.second;
+    uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
+    if (buckets[buck_idx] == kNoEntry) {
+      buckets[buck_idx] = restart_index;
+    } else if (buckets[buck_idx] != restart_index) {
+      // a bucket cannot hold two different restart_index, mark collision
+      buckets[buck_idx] = kCollision;
     }
   }

-  // write the bucket_offsets
-  for (uint16_t i = 0; i < num_buckets_; i++) {
-    PutFixed16(&buffer, bucket_offsets[i]);
+  for (uint8_t restart_index : buckets) {  // append buckets, one byte each
+    buffer.append(const_cast<const char*>(
+                      reinterpret_cast<char*>(&restart_index)),
+                  sizeof(restart_index));
   }

   // write NUM_BUCK
-  PutFixed16(&buffer, num_buckets_);
+  PutFixed16(&buffer, num_buckets);

-  // write MAP_START
-  PutFixed16(&buffer, map_start);
-
-  // Because we use uint16_t address, we only support block less than 64KB
-  assert(buffer.size() < (1 << 16));
+  assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
 }

 void DataBlockHashIndexBuilder::Reset() {
-//  buckets_.clear();
-std::fill(buckets_.begin(), buckets_.end(), std::vector<uint16_t>());
-estimate_ = 0;
+  hash_and_restart_pairs_.clear();  // forget all entries added so far
+  valid_ = true;  // re-enable a builder that Add() had invalidated
 }

-DataBlockHashIndex::DataBlockHashIndex(Slice block_content) {
-  assert(block_content.size() >=
-         2 * sizeof(uint16_t));  // NUM_BUCK and MAP_START
-
-  data_ = block_content.data();
-  size_ = static_cast<uint16_t>(block_content.size());
-
-  map_start_ = data_ + DecodeFixed16(data_ + size_ - sizeof(uint16_t));
-  assert(map_start_ < data_ + size_);
-
-  num_buckets_ = DecodeFixed16(data_ + size_ - 2 * sizeof(uint16_t));
+void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
+                                    uint16_t* map_offset) {
+  assert(size >= sizeof(uint16_t));  // room for the trailing NUM_BUCKETS
+  num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t));  // last 2 bytes
   assert(num_buckets_ > 0);
-
-  assert(size_ >= sizeof(uint16_t) * (2 + num_buckets_));
-  bucket_table_ = data_ + size_ - sizeof(uint16_t) * (2 + num_buckets_);
-
-  assert(map_start_ <  bucket_table_);
+  assert(size > num_buckets_ * sizeof(uint8_t));  // bucket array must fit
+  *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) -
+                                      num_buckets_ * sizeof(uint8_t));
 }

-DataBlockHashIndexIterator* DataBlockHashIndex::NewIterator(
-    const Slice& key) const {
-  uint16_t idx = HashToBucket(key, num_buckets_);
-  uint16_t bucket_off = DecodeFixed16(bucket_table_ + idx * sizeof(uint16_t));
-  const char* limit;
-  if (idx < num_buckets_ - 1) {
-    // limited by the start offset of the next bucket
-    limit = data_ + DecodeFixed16(bucket_table_ + (idx + 1) * sizeof(uint16_t));
-  } else {
-    // limited by the location of the NUM_BUCK
-    limit = data_ + (size_ - 2 * sizeof(uint16_t));
-  }
-  uint16_t tag = (uint16_t)rocksdb::Hash(key.data(), key.size(), kSeed_tag);
-  return new DataBlockHashIndexIterator(data_ + bucket_off, limit, tag);
-}
-
-bool DataBlockHashIndexIterator::Valid() {
-  return current_ < end_;
-}
-
-void DataBlockHashIndexIterator::Next() {
-  for (current_ += 2 * sizeof(uint16_t); current_ < end_;
-       current_ += 2 * sizeof(uint16_t)) {
-    // stop at a offset that match the tag, i.e. a possible match
-    uint16_t tag_found = DecodeFixed16(current_);
-    if (tag_found == tag_) {
-      break;
-    }
-  }
-}
-
-uint16_t DataBlockHashIndexIterator::Value() {
-  return DecodeFixed16(current_ + sizeof(uint16_t));
+uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
+                                 const Slice& key) const {
+  uint32_t hash_value = GetSliceHash(key);
+  uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);  // bucket
+  const char* bucket_table = data + map_offset;  // start of the bucket array
+  return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
 }

 }  // namespace rocksdb
--- a/table/data_block_hash_index.h
+++ b/table/data_block_hash_index.h
@ -8,11 +8,14 @@
 #include <string>
 #include <vector>

+#include "rocksdb/slice.h"
+
 namespace rocksdb {
 // This is an experimental feature aiming to reduce the CPU utilization of
-// point-lookup within a data-block. It is not used in per-table index-blocks.
-// It supports Get(), but not Seek() or Scan(). If the key does not exist,
-// the iterator is set to invalid.
+// point-lookup within a data-block. It is only used in data blocks, and not
+// in meta-data blocks or per-table index blocks.
+//
+// It is only used to support BlockBasedTable::Get().
 //
 // A serialized hash index is appended to the data-block. The new block data
 // format is as follows:
@ -25,114 +28,105 @@ namespace rocksdb {
 // FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
 //           the flag indicating if this hash index is in use. Note that
 //           given a data block < 32KB, the MSB is never used. So we can
-//           borrow the MSB as the hash index flag. Besides, this format is
-//           compatible with the legacy data-blocks < 32KB, as the MSB is 0.
+//           borrow the MSB as the hash index flag. Therefore, this format is
+//           compatible with the legacy data-blocks with num_restarts < 32768,
+//           as the MSB is 0.
 //
-// If we zoom in the HASH_IDX, the format of the data-block hash index is as
-// follows:
+// The format of the data-block hash index is as follows:
 //
-// HASH_IDX: [B B B ... B IDX NUM_BUCK MAP_START]
+// HASH_IDX: [B B B ... B NUM_BUCK]
 //
-// B:        B = bucket, an array of pairs <TAG, restart index>.
-//           TAG is the second hash value of the string. It is used to flag a
-//           matching entry among different keys that are hashed to the same
-//           bucket. A similar tagging idea is used in [Lim et. al, SOSP'11].
-//           However we have a differnet hash design that is not based on cuckoo
-//           hashing as Lim's paper is.
-//           We do not have to store the length of individual buckets, as they
-//           are delimited by the next bucket offset.
-// IDX:      Array of offsets of the index hash bucket (relative to MAP_START)
-// NUM_BUCK: Number of buckets, which is the length of the IDX array.
-// MAP_START: the starting offset of the data-block hash index.
+// B:         bucket, one entry of the restart index array (a uint8_t).
+// NUM_BUCK:  Number of buckets, which is the length of the bucket array.
 //
-// Each bucket B has the following structure:
-// [TAG RESTART_INDEX][TAG RESTART_INDEX]...[TAG RESTART_INDEX]
-// where TAG is the hash value of the second hash funtion.
+// We reserve two special flags:
+//    kNoEntry=255,
+//    kCollision=254.
 //
-// pairs of <key, restart index> are inserted to the hash index. Queries will
-// first lookup this hash index to find the restart index, then go to the
-// corresponding restart interval to search linearly for the key.
+// Therefore, the max number of restarts this hash index can support is 253.
 //
-// For a point-lookup for a key K:
+// Buckets are initialized to be kNoEntry.
 //
-//        Hash1()
-// 1) K ===========> bucket_id
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in
+// the bucket. If there is already a different restart index there, we
+// update the existing restart index to a collision marker (kCollision). If
+// the bucket is already marked as collision, we do not store the restart
+// index either.
 //
-// 2) Look up this bucket_id in the IDX table to find the offset of the bucket
-//
-//        Hash2()
-// 3) K ============> TAG
-// 3) examine the first field (which is TAG) of each entry within this bucket,
-//    skip those without a matching TAG.
-// 4) for the entries matching the TAG, get the restart interval index from the
-//    second field.
-//
-// (following step are implemented in block.cc)
-// 5) lookup the restart index table (refer to the traditional block format),
-//    use the restart interval index to find the offset of the restart interval.
-// 6) linearly search the restart interval for the key.
+// During the query process, a key is first hashed to a bucket. Then we check
+// whether the bucket stores nothing (kNoEntry) or had a collision
+// (kCollision). If neither is the case, the bucket holds the restart index of
+// the key and we go directly to that restart interval to search for the key.
 //
+// Note that we only support blocks with #restart_interval < 254. If a block
+// has more restart intervals than that, no hash index is created for it.
+
+const uint8_t kNoEntry = 255;    // bucket marker: no key hashed here
+const uint8_t kCollision = 254;  // bucket marker: differing restart indices
+const uint8_t kMaxRestartSupportedByHashIndex = 253;  // largest storable index
+
+// Because we use uint16_t addresses, we only support blocks no larger than 64KB
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;  // used when util_ratio <= 0 is given

 class DataBlockHashIndexBuilder {
 public:
-  explicit DataBlockHashIndexBuilder(uint16_t n)
-      : num_buckets_(n),
-        buckets_(n),
-        estimate_((n + 2) *
-                  sizeof(uint16_t) /* n buckets, 2 num at the end */) {}
-  void Add(const Slice& key, const uint16_t& restart_index);
+  DataBlockHashIndexBuilder() : util_ratio_(0), valid_(false) {}
+  // Sets the ratio used to size the bucket array and marks the builder valid.
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check: fall back to default
+    }
+    util_ratio_ = util_ratio;
+    valid_ = true;
+  }
+  // True after Initialize()/Reset() until Add() sees an unsupported index.
+  inline bool Valid() const { return valid_ && util_ratio_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
   void Finish(std::string& buffer);
   void Reset();
-  inline size_t EstimateSize() { return estimate_; }
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets = static_cast<uint16_t>(
+        static_cast<double>(hash_and_restart_pairs_.size()) / util_ratio_);
+
+    // Matching the num_buckets number in DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+    // NUM_BUCK field plus one byte per bucket.
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }

 private:
-  uint16_t num_buckets_;
-  std::vector<std::vector<uint16_t>> buckets_;
-  size_t estimate_;
-};
+  double util_ratio_;

-class DataBlockHashIndexIterator;
+  // Currently the only use of `valid_` is to be set to false when an inserted
+  // restart_index is larger than supported. In this case HashIndex is not
+  // appended to the block content.
+  bool valid_;
+  // (hash of key, restart index) for every key added since the last Reset().
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};

 class DataBlockHashIndex {
 public:
-  explicit DataBlockHashIndex(Slice  block_content);
+  // Creates an index that is not Valid() until Initialize() is called.
+  DataBlockHashIndex() : num_buckets_(0) {}

-  inline uint16_t DataBlockHashMapStart() const {
-    return static_cast<uint16_t>(map_start_ - data_);
-  }
+  // Reads NUM_BUCK from the last two bytes of data[0, size) and stores the
+  // offset at which the bucket array starts in *map_offset.
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);

-  DataBlockHashIndexIterator* NewIterator(const Slice& key) const;
+  // Returns the bucket byte that `key` hashes to: a restart index, kNoEntry
+  // or kCollision. `map_offset` is the offset of the bucket array in `data`.
+  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
+
+  // True once Initialize() has read a non-zero bucket count. Const-qualified
+  // because it only reads num_buckets_, like Lookup().
+  inline bool Valid() const { return num_buckets_ != 0; }

 private:
-  const char *data_;
   // To make the serialized hash index compact and to save the space overhead,
   // here all the data fields persisted in the block are in uint16 format.
   // We find that a uint16 is large enough to index every offset of a 64KiB
   // block.
   // So in other words, DataBlockHashIndex does not support block size equal
   // or greater then 64KiB.
-  uint16_t size_;
   uint16_t num_buckets_;
-  const char *map_start_;    // start of the map
-  const char *bucket_table_; // start offset of the bucket index table
-};
-
-class DataBlockHashIndexIterator {
- public:
-  DataBlockHashIndexIterator(const char* start, const char* end,
-                             const uint16_t tag)
-      : end_(end), tag_(tag) {
-    current_ = start - 2 * sizeof(uint16_t);
-    Next();
-  }
-  bool Valid();
-  void Next();
-  uint16_t Value();
-
- private:
-  const char* end_; // the end of the bucket
-  const uint16_t tag_;  // the fingerprint (2nd hash value) of the searching key
-  const char* current_;
 };

 }  // namespace rocksdb
--- a/table/data_block_hash_index_test.cc
+++ b/table/data_block_hash_index_test.cc
@ -8,64 +8,119 @@
 #include <unordered_map>

 #include "rocksdb/slice.h"
+#include "table/block.h"
+#include "table/block_builder.h"
 #include "table/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/block_based_table_reader.h"
 #include "util/testharness.h"
 #include "util/testutil.h"

 namespace rocksdb {

-bool SearchForOffset(DataBlockHashIndex& index, const Slice& key,
-                     uint16_t& restart_point) {
-  std::unique_ptr<DataBlockHashIndexIterator> iter;
-  iter.reset(index.NewIterator(key));
-  for (; iter->Valid(); iter->Next()) {
-    if (iter->Value() == restart_point) {
-      return true;
+// Test helper: looks `key` up in the hash index and reports whether the
+// result is compatible with `restart_point`. A kCollision bucket counts as
+// found, a kNoEntry bucket as not found.
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  const uint8_t bucket = index.Lookup(data, map_offset, key);
+  if (bucket == kCollision) {
+    return true;
+  }
+  return bucket != kNoEntry && bucket == restart_point;
+}
+
+// Random KV generator similar to block_test (wraps test::RandomString).
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+// Builds a key from primary_key (printed with "%6d") followed by
+// secondary_key (printed with "%4d"). When padding_size is non-zero, the
+// result of RandomString(rnd, padding_size) is appended.
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char formatted[50];
+  snprintf(formatted, sizeof(formatted), "%6d%4d", primary_key, secondary_key);
+
+  std::string key(formatted);
+  if (padding_size != 0) {
+    key += RandomString(rnd, padding_size);
+  }
+  return key;
+}
+
+// Generate random key value pairs and append them to *keys and *values.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // one primary key (prefix) per step in [from, from + len)
+  for (int i = from; i < from + len; i += step) {
+    // generate keys_share_prefix keys that share this prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100 bytes values
+      values->emplace_back(RandomString(&rnd, 100));
     }
   }
-  return false;
 }

 TEST(DataBlockHashIndex, DataBlockHashTestSmall) {
-  // bucket_num = 5, #keys = 2. 40% utilization
-  DataBlockHashIndexBuilder builder(5);
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  for (int j = 0; j < 5; j++) {  // rounds with 2..6 keys, reusing the builder
+    for (uint8_t i = 0; i < 2 + j; i++) {
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      builder.Add(key, restart_point);
+    }

-  for (uint16_t i = 0; i < 2; i++) {
-    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
-    builder.Add(key, restart_point);
-  }
+    size_t estimated_size = builder.EstimateSize();

-  size_t estimated_size = builder.EstimateSize();
+    std::string buffer("fake"), buffer2;
+    size_t original_size = buffer.size();
+    estimated_size += original_size;
+    builder.Finish(buffer);

-  std::string buffer("fake"), buffer2;
-  size_t original_size = buffer.size();
-  estimated_size += original_size;
-  builder.Finish(buffer);
+    ASSERT_EQ(buffer.size(), estimated_size);

-  ASSERT_EQ(buffer.size(), estimated_size);
+    buffer2 = buffer; // test for the correctness of relative offset

-  buffer2 = buffer; // test for the correctness of relative offset

-  Slice s(buffer2);
-  DataBlockHashIndex index(s);
+    Slice s(buffer2);
+    DataBlockHashIndex index;
+    uint16_t map_offset;
+    index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);

-  // the additional hash map should start at the end of the buffer
-  ASSERT_EQ(original_size, index.DataBlockHashMapStart());
-  for (uint16_t i = 0; i < 2; i++) {
-    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
-    ASSERT_TRUE(SearchForOffset(index, key, restart_point));
+    // the appended hash map should start where the original buffer ended
+    ASSERT_EQ(original_size, map_offset);
+    for (uint8_t i = 0; i < 2; i++) {  // only the first two keys are probed
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
+    }
+    builder.Reset();  // start the next round from an empty builder
   }
 }

 TEST(DataBlockHashIndex, DataBlockHashTest) {
  // bucket_num = 200, #keys = 100. 50% utilization
-  DataBlockHashIndexBuilder builder(200);
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);

-  for (uint16_t i = 0; i < 100; i++) {
+  for (uint8_t i = 0; i < 100; i++) {
    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
+    uint8_t restart_point = i;
    builder.Add(key, restart_point);
  }

@ -81,24 +136,28 @@ TEST(DataBlockHashIndex, DataBlockHashTest) {
  buffer2 = buffer; // test for the correctness of relative offset

  Slice s(buffer2);
-  DataBlockHashIndex index(s);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);

  // the additional hash map should start at the end of the buffer
-  ASSERT_EQ(original_size, index.DataBlockHashMapStart());
-  for (uint16_t i = 0; i < 100; i++) {
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
-    ASSERT_TRUE(SearchForOffset(index, key, restart_point));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
  }
 }

 TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
  // bucket_num = 2. There will be intense hash collisions
-  DataBlockHashIndexBuilder builder(2);
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);

-  for (uint16_t i = 0; i < 100; i++) {
+  for (uint8_t i = 0; i < 100; i++) {
    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
+    uint8_t restart_point = i;
    builder.Add(key, restart_point);
  }

@ -114,27 +173,31 @@ TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
  buffer2 = buffer; // test for the correctness of relative offset

  Slice s(buffer2);
-  DataBlockHashIndex index(s);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);

  // the additional hash map should start at the end of the buffer
-  ASSERT_EQ(original_size, index.DataBlockHashMapStart());
-  for (uint16_t i = 0; i < 100; i++) {
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
    std::string key("key" + std::to_string(i));
-    uint16_t restart_point = i;
-    ASSERT_TRUE(SearchForOffset(index, key, restart_point));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
  }
 }

 TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
-  DataBlockHashIndexBuilder builder(1000);
-  std::unordered_map<std::string, uint16_t> m;
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  std::unordered_map<std::string, uint8_t> m;

-  for (uint16_t i = 0; i < 10000; i++) {
+  for (uint8_t i = 0; i < 100; i++) {
    if (i % 2) {
      continue;  // leave half of the keys out
    }
    std::string key = "key" + std::to_string(i);
-    uint16_t restart_point = i;
+    uint8_t restart_point = i;
    builder.Add(key, restart_point);
    m[key] = restart_point;
  }
@ -151,16 +214,19 @@ TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
  buffer2 = buffer; // test for the correctness of relative offset

  Slice s(buffer2);
-  DataBlockHashIndex index(s);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);

  // the additional hash map should start at the end of the buffer
-  ASSERT_EQ(original_size, index.DataBlockHashMapStart());
-  for (uint16_t i = 0; i < 100; i++) {
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
    std::string key = "key" + std::to_string(i);
-    uint16_t restart_point = i;
+    uint8_t restart_point = i;
    if (m.count(key)) {
      ASSERT_TRUE(m[key] == restart_point);
-      ASSERT_TRUE(SearchForOffset(index, key, restart_point));
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
    } else {
      // we allow false positve, so don't test the nonexisting keys.
      // when false positive happens, the search will continue to the
@ -169,6 +235,492 @@ TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
  }
 }

+// Add() accepts restart indices up to 253 and invalidates the builder when it
+// sees 254; Reset() makes the builder valid again.
+TEST(DataBlockHashIndex, RestartIndexExceedMax) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+
+  // restart indices 0..253 are all supported
+  for (uint8_t i = 0; i <= 253; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+  ASSERT_TRUE(builder.Valid());
+
+  builder.Reset();
+
+  // the last index, 254, exceeds kMaxRestartSupportedByHashIndex
+  for (uint8_t i = 0; i <= 254; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  ASSERT_FALSE(builder.Valid());
+
+  builder.Reset();
+  ASSERT_TRUE(builder.Valid());
+}
+
+TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) {
+  Options options = Options();  // NOTE(review): appears unused in this test
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  // 254 keys, restart index presumably <= 253. HashIndex is valid
+  for (int i = 0; i <= 253; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    contents.cachable = false;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+
+  builder.Reset();
+
+  // 255 keys, restart index presumably reaches 254. HashIndex is not used
+  for (int i = 0; i <= 254; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    contents.cachable = false;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+  Options options = Options();  // NOTE(review): appears unused in this test
+  std::string ukey(10, 'k');
+  InternalKey ikey(ukey, 0, kTypeValue);
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       false /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  {
+    // insert a large value. The block size plus HashIndex is 65536.
+    std::string value(65502, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    contents.cachable = false;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+               BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  {
+    // insert a large value. The block size plus HashIndex would be 65537.
+    // This exceeds the max block size supported by HashIndex (65536).
+    // So when the build finishes, HashIndex will not be created for the block.
+    std::string value(65503, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    contents.cachable = false;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    // the index type has fallen back to binary search when the build finished.
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockTestSingleKey) {
+  Options options = Options();
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  std::string ukey("gopher");
+  std::string value("gold");
+  InternalKey ikey(ukey, 10, kTypeValue);
+  builder.Add(ikey.Encode().ToString(), value /*value*/);
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  const InternalKeyComparator icmp(BytewiseComparator());
+  auto iter = reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator());
+  bool may_exist;
+  // search in block for the key just inserted (same seqno, 10)
+  {
+    InternalKey seek_ikey(ukey, 10, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(options.comparator->Compare(
+                  iter->key(), ikey.Encode().ToString()), 0);
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // search in block for the existing ukey, but with higher seqno (20)
+  {
+    InternalKey seek_ikey(ukey, 20, kValueTypeForSeek);
+
+    // HashIndex should be able to set the iter correctly
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+
+    // user key should match
+    ASSERT_EQ(options.comparator->Compare(
+                  ExtractUserKey(iter->key()), ukey), 0);
+
+    // seek_key seqno number should be greater than that of iter result
+    ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()),
+              GetInternalKeySeqno(iter->key()));
+
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // Search in block for the existing ukey, but with a lower seqno (5).
+  // In this case, the hash can find the only occurrence of the user_key, but
+  // ParseNextDataKey() will skip it as it does not have an older seqno.
+  // Here SeekForGet() is effective in locating the user_key, and
+  // iter->Valid() == false indicates that we've reached the end of
+  // the block and the caller should continue searching the next block.
+  {
+    InternalKey seek_ikey(ukey, 5, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_FALSE(iter->Valid());  // should have reached the end of the block
+  }
+
+  delete iter;
+}
+
+// Builds a single hash-indexed data block holding 500 generated keys (each
+// suffixed with "1") and exercises SeekForGet() with user keys that are
+// present in the block and with user keys that are not.
+TEST(DataBlockHashIndex, BlockTestLarge) {
+  Random rnd(1019);
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  int num_records = 500;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+
+  // Generate keys. A trailing "1" marks existent keys. Later we seek for
+  // keys with a trailing "0" to test seeking non-existent keys.
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  contents.cachable = false;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+  const InternalKeyComparator icmp(BytewiseComparator());
+
+  // random seek existent keys
+  for (int i = 0; i < num_records; i++) {
+    // Owned by a unique_ptr so the iterator is released even when one of
+    // the fatal assertions below returns from the test body early.
+    std::unique_ptr<DataBlockIter> iter(
+        reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator()));
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(values[index], iter->value());
+  }
+
+  // random seek non-existent user keys
+  // In case A), the user_key cannot be found in HashIndex. The key may
+  // exist in the next block. So the iter is invalidated to tell the
+  // caller to search the next block. This test case belongs to case A).
+  //
+  // Note that for non-existent keys, there is a possibility of false
+  // positives, i.e. the key is still hashed into some restart interval.
+  // Two additional possible outcomes:
+  // B) linear seek the restart interval and not found, the iter stops at the
+  //    start of the next restart interval. The key does not exist
+  //    anywhere.
+  // C) linear seek the restart interval and not found, the iter stops at
+  //    the end of the block, i.e. restarts_. The key may exist in the next
+  //    block.
+  // So these combinations are possible when searching non-existent user_key:
+  //
+  // case#    may_exist  iter->Valid()
+  //     A         true          false
+  //     B        false           true
+  //     C         true          false
+
+  for (int i = 0; i < num_records; i++) {
+    // Same ownership reasoning as in the loop above.
+    std::unique_ptr<DataBlockIter> iter(
+        reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator()));
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "0" /* non-existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key; only the combinations listed in the
+    // table above are acceptable
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    if (!may_exist) {
+      ASSERT_TRUE(iter->Valid());
+    }
+    if (!iter->Valid()) {
+      ASSERT_TRUE(may_exist);
+    }
+  }
+}
+
+// Helper routine for DataBlockHashIndex.BlockBoundary.
+//
+// Adds (ik1, v1) and then (ik2, v2) to a table built by
+// options.table_factory on top of a test::StringSink, reopens the written
+// bytes through a test::StringSource, and issues Get() for seek_ikey. The
+// outcome of the lookup is reported through get_context.
+//
+// Any failure to flush, open or read the table is raised as a fatal gtest
+// failure, which returns from this helper only; callers still need to check
+// get_context themselves.
+void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
+                  std::string& v2, InternalKey& seek_ikey,
+                  GetContext& get_context, Options& options) {
+  unique_ptr<WritableFileWriter> file_writer;
+  unique_ptr<RandomAccessFileReader> file_reader;
+  unique_ptr<TableReader> table_reader;
+  int level_ = -1;
+
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+
+  EnvOptions soptions;
+
+  soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+  file_writer.reset(test::GetWritableFileWriter(new test::StringSink()));
+  unique_ptr<TableBuilder> builder;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  builder.reset(ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, internal_comparator,
+                          &int_tbl_prop_collector_factories,
+                          options.compression, CompressionOptions(),
+                          nullptr /* compression_dict */,
+                          false /* skip_filters */, column_family_name, level_),
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      file_writer.get()));
+
+  builder->Add(ik1.Encode().ToString(), v1);
+  builder->Add(ik2.Encode().ToString(), v2);
+  EXPECT_TRUE(builder->status().ok());
+
+  Status s = builder->Finish();
+  EXPECT_TRUE(s.ok()) << s.ToString();
+  // A failed flush would leave the sink short of the table's tail, so do not
+  // ignore the returned status.
+  ASSERT_OK(file_writer->Flush());
+
+  EXPECT_EQ(static_cast<test::StringSink*>(file_writer->writable_file())
+                ->contents()
+                .size(),
+            builder->FileSize());
+
+  // Open the table
+  file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource(
+      static_cast<test::StringSink*>(file_writer->writable_file())->contents(),
+      0 /*uniq_id*/, ioptions.allow_mmap_reads)));
+  const bool kSkipFilters = true;
+  const bool kImmortal = true;
+  // Stop here if the table cannot be opened: table_reader would otherwise be
+  // dereferenced below without ever having been set.
+  ASSERT_OK(ioptions.table_factory->NewTableReader(
+      TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
+                         internal_comparator, !kSkipFilters, !kImmortal,
+                         level_),
+      std::move(file_reader),
+      static_cast<test::StringSink*>(file_writer->writable_file())
+          ->contents()
+          .size(),
+      &table_reader));
+  // Search using Get()
+  ReadOptions ro;
+
+  ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context,
+                               moptions.prefix_extractor.get()));
+}
+
+// Point lookups across a data block boundary with the hash index enabled.
+//
+// Two large k/v pairs are inserted. Given that the block_size is 4096, one
+// k/v pair will take up one block:
+//   [    k1/v1   ][    k2/v2  ]
+//   [   Block N  ][ Block N+1 ]
+TEST(DataBlockHashIndex, BlockBoundary) {
+  BlockBasedTableOptions bbt_opts;
+  bbt_opts.block_size = 4096;
+  bbt_opts.block_restart_interval = 1;
+  bbt_opts.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+  options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
+
+  // In every scenario k1 has seqno 100, k2 is "axy"@10 and the lookup targets
+  // user key "axy"; only k1's user key and the lookup seqno vary.
+  struct Scenario {
+    const char* first_ukey;  // user key of k1
+    uint64_t seek_seqno;     // seqno of the lookup key
+    int expected_hit;        // 1: v1 is returned, 2: v2, 0: kNotFound
+  };
+  const Scenario scenarios[] = {
+      // [ "aab"@100 ][ "axy"@10 ], seek for "axy"@60
+      {"aab", 60, 2},
+      // [ "axy"@100 ][ "axy"@10 ], seek for "axy"@60
+      {"axy", 60, 2},
+      // [ "axy"@100 ][ "axy"@10 ], seek for "axy"@120
+      {"axy", 120, 1},
+      // [ "axy"@100 ][ "axy"@10 ], seek for "axy"@5
+      {"axy", 5, 0},
+  };
+
+  for (const Scenario& sc : scenarios) {
+    std::string uk1(sc.first_ukey);
+    InternalKey ik1(uk1, 100, kTypeValue);
+    std::string v1(4100, '1');  // large value
+
+    std::string uk2("axy");
+    InternalKey ik2(uk2, 10, kTypeValue);
+    std::string v2(4100, '2');  // large value
+
+    PinnableSlice value;
+    std::string seek_ukey("axy");
+    InternalKey seek_ikey(seek_ukey, sc.seek_seqno, kTypeValue);
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, seek_ukey, &value, nullptr,
+                           nullptr, nullptr, nullptr);
+
+    TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
+    if (sc.expected_hit == 0) {
+      ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+    } else {
+      ASSERT_EQ(get_context.State(), GetContext::kFound);
+      ASSERT_EQ(value, sc.expected_hit == 1 ? v1 : v2);
+    }
+    value.Reset();
+  }
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -3653,6 +3653,104 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
  ASSERT_EQ(480, buffer.min_offset_read());
 }

+// Builds a table of 500 random keys with the data block hash index enabled,
+// then looks up every stored key and an absent sibling of every stored key,
+// first through Seek() on a table iterator and then through Get().
+TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
+  const int kNumKeys = 500;
+  const int kKeySize = 8;
+  const int kValSize = 40;
+
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  TableConstructor c(options.comparator);
+
+  // Not static on purpose: every execution of this parameterized test starts
+  // from the same seed, so the generated keys do not depend on how many
+  // instantiations ran before this one.
+  Random rnd(1048);
+  for (int i = 0; i < kNumKeys; i++) {
+    // A trailing "1" marks existent keys; the non-existent probes below
+    // replace it with "0".
+    std::string random_key(RandomString(&rnd, kKeySize - 1) + "1");
+    InternalKey k(random_key, 0, kTypeValue);
+    c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize));
+  }
+
+  std::vector<std::string> keys;
+  stl_wrappers::KVMap kvmap;
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+  c.Finish(options, ioptions, moptions, table_options, internal_comparator,
+           &keys, &kvmap);
+
+  auto reader = c.GetTableReader();
+
+  std::unique_ptr<InternalIterator> seek_iter;
+  seek_iter.reset(reader->NewIterator(ReadOptions(),
+                                      moptions.prefix_extractor.get()));
+  ReadOptions ro;
+
+  // Turns a stored internal key into a user key that was never inserted by
+  // flipping the trailing existent-key marker to '0'.
+  auto absent_user_key = [](const std::string& stored_ikey) {
+    std::string user_key = ExtractUserKey(stored_ikey).ToString();
+    user_key.back() = '0';
+    return user_key;
+  };
+
+  // Every kv is looked up using two methods: Seek() and Get().
+  // Seek() will use the default BinarySeek() in Block, so for a non-existent
+  //      key it will land at the closest key that is larger than the target.
+  // Get() will use the data block hash index and reports a non-existent key
+  //      as kNotFound.
+
+  // Seek(): existent keys
+  for (auto& kv : kvmap) {
+    seek_iter->Seek(kv.first);
+    ASSERT_OK(seek_iter->status());
+    ASSERT_TRUE(seek_iter->Valid());
+    ASSERT_EQ(seek_iter->key(), kv.first);
+    ASSERT_EQ(seek_iter->value(), kv.second);
+  }
+
+  // Seek(): non-existent keys
+  for (auto& kv : kvmap) {
+    std::string user_key = absent_user_key(kv.first);
+    InternalKey internal_key(user_key, 0, kTypeValue);
+    std::string encoded_key = internal_key.Encode().ToString();
+    seek_iter->Seek(encoded_key);
+    ASSERT_OK(seek_iter->status());
+    if (seek_iter->Valid()) {
+      ASSERT_TRUE(BytewiseComparator()->Compare(
+                      user_key, ExtractUserKey(seek_iter->key())) < 0);
+    }
+  }
+
+  // Get(): existent keys
+  for (auto& kv : kvmap) {
+    PinnableSlice value;
+    std::string user_key = ExtractUserKey(kv.first).ToString();
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, user_key, &value, nullptr,
+                           nullptr, nullptr, nullptr);
+    ASSERT_OK(reader->Get(ro, kv.first, &get_context,
+                          moptions.prefix_extractor.get()));
+    ASSERT_EQ(get_context.State(), GetContext::kFound);
+    ASSERT_EQ(value, Slice(kv.second));
+    value.Reset();
+  }
+
+  // Get(): non-existent keys
+  for (auto& kv : kvmap) {
+    std::string user_key = absent_user_key(kv.first);
+    InternalKey internal_key(user_key, 0, kTypeValue);
+    std::string encoded_key = internal_key.Encode().ToString();
+    PinnableSlice value;
+    GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                           GetContext::kNotFound, user_key, &value, nullptr,
+                           nullptr, nullptr, nullptr);
+    ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
+                          moptions.prefix_extractor.get()));
+    ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+    value.Reset();
+  }
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/util/comparator.cc
+++ b/util/comparator.cc
@ -124,6 +124,10 @@ class BytewiseComparatorImpl : public Comparator {
      return false;
    }
  }
+
+  // Under bytewise ordering, keys whose bytes differ never compare equal.
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return false;
+  }
 };

 class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
@ -188,6 +192,10 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
  void FindShortSuccessor(std::string* /*key*/) const override {
    // Don't do anything for simplicity.
  }
+
+  // Reversing the bytewise order does not make distinct byte strings equal.
+  bool CanKeysWithDifferentByteContentsBeEqual() const override {
+    return false;
+  }
 };
 }// namespace