Adding option to save PlainTable index and bloom filter in SST file.
Summary: Adding option to save PlainTable index and bloom filter in SST file. If there is no bloom block and/or index block, PlainTableReader builds new ones. Otherwise PlainTableReader just use these blocks. Test Plan: make all check Reviewers: sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D19527
This commit is contained in:
parent
92d73cbe78
commit
9d70cce047
@ -23,6 +23,7 @@
|
||||
#include "rocksdb/slice_transform.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "table/meta_blocks.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/plain_table_reader.h"
|
||||
#include "util/hash.h"
|
||||
@ -70,10 +71,11 @@ class PlainTableDBTest {
|
||||
plain_table_options.huge_page_tlb_size = 0;
|
||||
plain_table_options.encoding_type = kPrefix;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = false;
|
||||
|
||||
options.table_factory.reset(NewPlainTableFactory(plain_table_options));
|
||||
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
|
||||
|
||||
options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
|
||||
options.prefix_extractor.reset(NewFixedPrefixTransform(8));
|
||||
options.allow_mmap_reads = true;
|
||||
return options;
|
||||
@ -186,6 +188,8 @@ TEST(PlainTableDBTest, Empty) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
|
||||
}
|
||||
|
||||
extern const uint64_t kPlainTableMagicNumber;
|
||||
|
||||
class TestPlainTableReader : public PlainTableReader {
|
||||
public:
|
||||
TestPlainTableReader(const EnvOptions& storage_options,
|
||||
@ -195,7 +199,8 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
size_t index_sparseness,
|
||||
const TableProperties* table_properties,
|
||||
unique_ptr<RandomAccessFile>&& file,
|
||||
const Options& options, bool* expect_bloom_not_match)
|
||||
const Options& options, bool* expect_bloom_not_match,
|
||||
bool store_index_in_file)
|
||||
: PlainTableReader(options, std::move(file), storage_options, icomparator,
|
||||
encoding_type, file_size, table_properties),
|
||||
expect_bloom_not_match_(expect_bloom_not_match) {
|
||||
@ -206,6 +211,19 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
bloom_bits_per_key, hash_table_ratio, index_sparseness,
|
||||
2 * 1024 * 1024);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
TableProperties* props = const_cast<TableProperties*>(table_properties);
|
||||
if (store_index_in_file) {
|
||||
auto bloom_version_ptr = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kBloomVersion);
|
||||
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
|
||||
ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
|
||||
if (options.bloom_locality > 0) {
|
||||
auto num_blocks_ptr = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kNumBloomBlocks);
|
||||
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~TestPlainTableReader() {}
|
||||
@ -213,7 +231,11 @@ class TestPlainTableReader : public PlainTableReader {
|
||||
private:
|
||||
virtual bool MatchBloom(uint32_t hash) const override {
|
||||
bool ret = PlainTableReader::MatchBloom(hash);
|
||||
ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
|
||||
if (*expect_bloom_not_match_) {
|
||||
ASSERT_TRUE(!ret);
|
||||
} else {
|
||||
ASSERT_TRUE(ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
bool* expect_bloom_not_match_;
|
||||
@ -228,6 +250,7 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
bloom_bits_per_key_(options.bloom_bits_per_key),
|
||||
hash_table_ratio_(options.hash_table_ratio),
|
||||
index_sparseness_(options.index_sparseness),
|
||||
store_index_in_file_(options.store_index_in_file),
|
||||
expect_bloom_not_match_(expect_bloom_not_match) {}
|
||||
|
||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||
@ -239,6 +262,20 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
options.env, options.info_log.get(), &props);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
if (store_index_in_file_) {
|
||||
BlockHandle bloom_block_handle;
|
||||
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
|
||||
options.env, BloomBlockBuilder::kBloomBlock,
|
||||
&bloom_block_handle);
|
||||
ASSERT_TRUE(s.ok());
|
||||
|
||||
BlockHandle index_block_handle;
|
||||
s = FindMetaBlock(
|
||||
file.get(), file_size, kPlainTableMagicNumber, options.env,
|
||||
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
|
||||
ASSERT_TRUE(s.ok());
|
||||
}
|
||||
|
||||
auto& user_props = props->user_collected_properties;
|
||||
auto encoding_type_prop =
|
||||
user_props.find(PlainTablePropertyNames::kEncodingType);
|
||||
@ -249,7 +286,8 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
|
||||
soptions, internal_comparator, encoding_type, file_size,
|
||||
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
|
||||
std::move(file), options, expect_bloom_not_match_));
|
||||
std::move(file), options, expect_bloom_not_match_,
|
||||
store_index_in_file_));
|
||||
|
||||
*table = std::move(new_reader);
|
||||
return s;
|
||||
@ -259,6 +297,7 @@ class TestPlainTableFactory : public PlainTableFactory {
|
||||
int bloom_bits_per_key_;
|
||||
double hash_table_ratio_;
|
||||
size_t index_sparseness_;
|
||||
bool store_index_in_file_;
|
||||
bool* expect_bloom_not_match_;
|
||||
};
|
||||
|
||||
@ -268,6 +307,12 @@ TEST(PlainTableDBTest, Flush) {
|
||||
for (EncodingType encoding_type : {kPlain, kPrefix}) {
|
||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||
for (int store_index_in_file = 0; store_index_in_file <= 1;
|
||||
++store_index_in_file) {
|
||||
if (!bloom_bits && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Options options = CurrentOptions();
|
||||
options.create_if_missing = true;
|
||||
// Set only one bucket to force bucket conflict.
|
||||
@ -283,6 +328,7 @@ TEST(PlainTableDBTest, Flush) {
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
|
||||
options.table_factory.reset(
|
||||
NewPlainTableFactory(plain_table_options));
|
||||
@ -295,12 +341,12 @@ TEST(PlainTableDBTest, Flush) {
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
plain_table_options.full_scan_mode = false;
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
|
||||
options.table_factory.reset(
|
||||
NewPlainTableFactory(plain_table_options));
|
||||
}
|
||||
DestroyAndReopen(&options);
|
||||
|
||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||
ASSERT_OK(Put("0000000000000bar", "v2"));
|
||||
ASSERT_OK(Put("1000000000000foo", "v3"));
|
||||
@ -311,17 +357,26 @@ TEST(PlainTableDBTest, Flush) {
|
||||
ASSERT_EQ(1U, ptc.size());
|
||||
auto row = ptc.begin();
|
||||
auto tp = row->second;
|
||||
ASSERT_EQ(total_order ? "4" : "12", (tp->user_collected_properties).at(
|
||||
"plain_table_hash_table_size"));
|
||||
|
||||
if (!store_index_in_file) {
|
||||
ASSERT_EQ(total_order ? "4" : "12",
|
||||
(tp->user_collected_properties)
|
||||
.at("plain_table_hash_table_size"));
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_sub_index_size"));
|
||||
|
||||
} else {
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_hash_table_size"));
|
||||
ASSERT_EQ("0", (tp->user_collected_properties)
|
||||
.at("plain_table_sub_index_size"));
|
||||
}
|
||||
ASSERT_EQ("v3", Get("1000000000000foo"));
|
||||
ASSERT_EQ("v2", Get("0000000000000bar"));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(PlainTableDBTest, Flush2) {
|
||||
@ -330,7 +385,15 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
for (EncodingType encoding_type : {kPlain, kPrefix}) {
|
||||
for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
|
||||
for (int total_order = 0; total_order <= 1; total_order++) {
|
||||
if (encoding_type == kPrefix && total_order == 1) {
|
||||
for (int store_index_in_file = 0; store_index_in_file <= 1;
|
||||
++store_index_in_file) {
|
||||
if (encoding_type == kPrefix && total_order) {
|
||||
continue;
|
||||
}
|
||||
if (!bloom_bits && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
if (total_order && store_index_in_file) {
|
||||
continue;
|
||||
}
|
||||
bool expect_bloom_not_match = false;
|
||||
@ -338,30 +401,23 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
options.create_if_missing = true;
|
||||
// Set only one bucket to force bucket conflict.
|
||||
// Test index interval for the same prefix to be 1, 2 and 4
|
||||
PlainTableOptions plain_table_options;
|
||||
if (total_order) {
|
||||
options.prefix_extractor = nullptr;
|
||||
PlainTableOptions plain_table_options;
|
||||
plain_table_options.user_key_len = 0;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.hash_table_ratio = 0;
|
||||
plain_table_options.index_sparseness = 2;
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
|
||||
options.table_factory.reset(new TestPlainTableFactory(
|
||||
&expect_bloom_not_match, plain_table_options));
|
||||
} else {
|
||||
PlainTableOptions plain_table_options;
|
||||
plain_table_options.user_key_len = 0;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.hash_table_ratio = 0.75;
|
||||
plain_table_options.index_sparseness = 16;
|
||||
}
|
||||
plain_table_options.user_key_len = kPlainTableVariableLength;
|
||||
plain_table_options.bloom_bits_per_key = bloom_bits;
|
||||
plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
|
||||
plain_table_options.encoding_type = encoding_type;
|
||||
|
||||
plain_table_options.store_index_in_file = store_index_in_file;
|
||||
options.table_factory.reset(new TestPlainTableFactory(
|
||||
&expect_bloom_not_match, plain_table_options));
|
||||
}
|
||||
|
||||
DestroyAndReopen(&options);
|
||||
ASSERT_OK(Put("0000000000000bar", "b"));
|
||||
ASSERT_OK(Put("1000000000000foo", "v1"));
|
||||
@ -389,7 +445,6 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
// Neither key nor value should exist.
|
||||
expect_bloom_not_match = true;
|
||||
ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
|
||||
|
||||
// Key doesn't exist any more but prefix exists.
|
||||
if (total_order) {
|
||||
ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
|
||||
@ -401,6 +456,7 @@ TEST(PlainTableDBTest, Flush2) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST(PlainTableDBTest, Iterator) {
|
||||
|
@ -119,6 +119,8 @@ enum EncodingType : char {
|
||||
struct PlainTablePropertyNames {
|
||||
static const std::string kPrefixExtractorName;
|
||||
static const std::string kEncodingType;
|
||||
static const std::string kBloomVersion;
|
||||
static const std::string kNumBloomBlocks;
|
||||
};
|
||||
|
||||
const uint32_t kPlainTableVariableLength = 0;
|
||||
@ -166,6 +168,11 @@ EncodingType encoding_type = kPlain;
|
||||
// @full_scan_mode: mode for reading the whole file one record by one without
|
||||
// using the index.
|
||||
bool full_scan_mode = false;
|
||||
|
||||
// @store_index_in_file: compute plain table index and bloom filter during
|
||||
// file building and store it in file. When reading
|
||||
// file, index will be mmaped instead of recomputation.
|
||||
bool store_index_in_file = false;
|
||||
};
|
||||
|
||||
// -- Plain Table with prefix-only seek
|
||||
|
23
table/bloom_block.cc
Normal file
23
table/bloom_block.cc
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/bloom_block.h"
|
||||
|
||||
#include <string>
|
||||
#include "rocksdb/slice.h"
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) {
|
||||
for (auto hash : keys_hashes) {
|
||||
bloom_.AddHash(hash);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the raw bloom filter bits to be written out as the bloom meta block.
// NOTE(review): the returned Slice presumably points into bloom_'s internal
// buffer and is only valid while this builder (and its arena) is alive —
// confirm against DynamicBloom::GetRawData.
Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
|
||||
|
||||
const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
|
||||
} // namespace rocksdb
|
37
table/bloom_block.h
Normal file
37
table/bloom_block.h
Normal file
@ -0,0 +1,37 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
class Logger;
|
||||
|
||||
// Builds the bloom filter meta block that PlainTable can persist in the SST
// file (registered in the metaindex under kBloomBlock) so readers can use the
// stored filter instead of rebuilding it.
class BloomBlockBuilder {
 public:
  // Name under which the bloom block is registered in the metaindex.
  static const std::string kBloomBlock;

  // @num_probes: number of hash probes per key used by the underlying
  // DynamicBloom. The second constructor argument is presumably a logger —
  // confirm against DynamicBloom's constructor.
  explicit BloomBlockBuilder(uint32_t num_probes = 6)
      : bloom_(num_probes, nullptr) {}

  // Size the underlying bloom filter; allocation is served from `arena`.
  // `huge_page_tlb_size` and `locality` are forwarded unchanged to
  // DynamicBloom::SetTotalBits.
  void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
                    size_t huge_page_tlb_size, Logger* logger) {
    bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
                        logger);
  }

  // Number of blocks the filter is split into (forwarded from DynamicBloom).
  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }

  // Fold a batch of precomputed key (or prefix) hashes into the filter.
  // NOTE(review): takes the vector by value — callers pay a copy.
  void AddKeysHashes(const std::vector<uint32_t> keys_hashes);

  // Returns the serialized filter bits, ready to be written as a block.
  Slice Finish();

 private:
  DynamicBloom bloom_;
};
|
||||
|
||||
}; // namespace rocksdb
|
@ -273,4 +273,72 @@ Status FindMetaBlock(Iterator* meta_index_iter,
|
||||
}
|
||||
}
|
||||
|
||||
// Locate the meta block named `meta_block_name` in the table stored in
// `file` (of `file_size` bytes): read the footer, load the metaindex block,
// and look the name up there. On success *block_handle is set to the block's
// location; otherwise the first failing Status is returned.
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockHandle* block_handle) {
  // The footer tells us where the metaindex block lives.
  Footer footer(table_magic_number);
  Status s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  // Load the metaindex block; checksum verification is disabled here.
  ReadOptions read_options;
  read_options.verify_checksums = false;
  BlockContents metaindex_contents;
  s = ReadBlockContents(file, footer, read_options, footer.metaindex_handle(),
                        &metaindex_contents, env, false);
  if (!s.ok()) {
    return s;
  }

  // Scan the metaindex entries for the requested block name.
  Block metaindex_block(metaindex_contents);
  std::unique_ptr<Iterator> meta_iter(
      metaindex_block.NewIterator(BytewiseComparator()));
  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
}
|
||||
|
||||
// Read the meta block named `meta_block_name` from `file` into `contents`.
// Pipeline: read footer -> load metaindex block -> look up the block handle
// by name -> read the block itself. The first failing step's Status is
// returned unchanged.
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
                     uint64_t table_magic_number, Env* env,
                     const std::string& meta_block_name,
                     BlockContents* contents) {
  Footer footer(table_magic_number);
  auto s = ReadFooterFromFile(file, file_size, &footer);
  if (!s.ok()) {
    return s;
  }

  // Reading metaindex block
  auto metaindex_handle = footer.metaindex_handle();
  BlockContents metaindex_contents;
  ReadOptions read_options;
  // Checksum verification is deliberately disabled for both reads below.
  read_options.verify_checksums = false;
  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
                        &metaindex_contents, env, false);
  if (!s.ok()) {
    return s;
  }

  // Finding metablock
  Block metaindex_block(metaindex_contents);

  std::unique_ptr<Iterator> meta_iter;
  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));

  BlockHandle block_handle;
  s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);

  if (!s.ok()) {
    return s;
  }

  // Reading metablock (reuses read_options, so checksums stay disabled)
  s = ReadBlockContents(file, footer, read_options, block_handle, contents, env,
                        false);

  return s;
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include "rocksdb/slice.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/format.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -128,4 +129,18 @@ Status FindMetaBlock(Iterator* meta_index_iter,
|
||||
const std::string& meta_block_name,
|
||||
BlockHandle* block_handle);
|
||||
|
||||
// Find the meta block
|
||||
Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
|
||||
uint64_t table_magic_number, Env* env,
|
||||
const std::string& meta_block_name,
|
||||
BlockHandle* block_handle);
|
||||
|
||||
// Read the specified meta block with name meta_block_name
|
||||
// from `file` and initialize `contents` with contents of this block.
|
||||
// Return Status::OK in case of success.
|
||||
Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
|
||||
uint64_t table_magic_number, Env* env,
|
||||
const std::string& meta_block_name,
|
||||
BlockContents* contents);
|
||||
|
||||
} // namespace rocksdb
|
||||
|
@ -6,6 +6,7 @@
|
||||
#ifndef ROCKSDB_LITE
|
||||
#include "table/plain_table_builder.h"
|
||||
|
||||
#include <string>
|
||||
#include <assert.h>
|
||||
#include <map>
|
||||
|
||||
@ -17,6 +18,8 @@
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "table/block_builder.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_index.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/meta_blocks.h"
|
||||
@ -54,20 +57,36 @@ Status WriteBlock(
|
||||
extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull;
|
||||
extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull;
|
||||
|
||||
PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file,
|
||||
uint32_t user_key_len,
|
||||
EncodingType encoding_type,
|
||||
size_t index_sparseness)
|
||||
PlainTableBuilder::PlainTableBuilder(
|
||||
const Options& options, WritableFile* file, uint32_t user_key_len,
|
||||
EncodingType encoding_type, size_t index_sparseness,
|
||||
uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
|
||||
double hash_table_ratio, bool store_index_in_file)
|
||||
: options_(options),
|
||||
bloom_block_(num_probes),
|
||||
file_(file),
|
||||
bloom_bits_per_key_(bloom_bits_per_key),
|
||||
huge_page_tlb_size_(huge_page_tlb_size),
|
||||
encoder_(encoding_type, user_key_len, options.prefix_extractor.get(),
|
||||
index_sparseness) {
|
||||
index_sparseness),
|
||||
store_index_in_file_(store_index_in_file),
|
||||
prefix_extractor_(options.prefix_extractor.get()) {
|
||||
// Build index block and save it in the file if hash_table_ratio > 0
|
||||
if (store_index_in_file_) {
|
||||
assert(hash_table_ratio > 0 || IsTotalOrderMode());
|
||||
index_builder_.reset(
|
||||
new PlainTableIndexBuilder(&arena_, options, index_sparseness,
|
||||
hash_table_ratio, huge_page_tlb_size_));
|
||||
assert(bloom_bits_per_key_ > 0);
|
||||
properties_.user_collected_properties
|
||||
[PlainTablePropertyNames::kBloomVersion] = "1"; // For future use
|
||||
}
|
||||
|
||||
properties_.fixed_key_len = user_key_len;
|
||||
|
||||
// for plain table, we put all the data in a big chuck.
|
||||
properties_.num_data_blocks = 1;
|
||||
// emphasize that currently plain table doesn't have persistent index or
|
||||
// filter block.
|
||||
// Fill it later if store_index_in_file_ == true
|
||||
properties_.index_size = 0;
|
||||
properties_.filter_size = 0;
|
||||
// To support roll-back to previous version, now still use version 0 for
|
||||
@ -100,9 +119,28 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
char meta_bytes_buf[6];
|
||||
size_t meta_bytes_buf_size = 0;
|
||||
|
||||
ParsedInternalKey internal_key;
|
||||
ParseInternalKey(key, &internal_key);
|
||||
|
||||
// Store key hash
|
||||
if (store_index_in_file_) {
|
||||
if (options_.prefix_extractor.get() == nullptr) {
|
||||
keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key));
|
||||
} else {
|
||||
Slice prefix =
|
||||
options_.prefix_extractor->Transform(internal_key.user_key);
|
||||
keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix));
|
||||
}
|
||||
}
|
||||
|
||||
// Write value
|
||||
auto prev_offset = offset_;
|
||||
// Write out the key
|
||||
encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf,
|
||||
&meta_bytes_buf_size);
|
||||
if (SaveIndexInFile()) {
|
||||
index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset);
|
||||
}
|
||||
|
||||
// Write value length
|
||||
int value_size = value.size();
|
||||
@ -134,11 +172,50 @@ Status PlainTableBuilder::Finish() {
|
||||
properties_.data_size = offset_;
|
||||
|
||||
// Write the following blocks
|
||||
// 1. [meta block: properties]
|
||||
// 2. [metaindex block]
|
||||
// 3. [footer]
|
||||
// 1. [meta block: bloom] - optional
|
||||
// 2. [meta block: index] - optional
|
||||
// 3. [meta block: properties]
|
||||
// 4. [metaindex block]
|
||||
// 5. [footer]
|
||||
|
||||
MetaIndexBuilder meta_index_builer;
|
||||
|
||||
if (store_index_in_file_ && (properties_.num_entries > 0)) {
|
||||
bloom_block_.SetTotalBits(
|
||||
&arena_, properties_.num_entries * bloom_bits_per_key_,
|
||||
options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get());
|
||||
|
||||
PutVarint32(&properties_.user_collected_properties
|
||||
[PlainTablePropertyNames::kNumBloomBlocks],
|
||||
bloom_block_.GetNumBlocks());
|
||||
|
||||
bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_);
|
||||
BlockHandle bloom_block_handle;
|
||||
auto finish_result = bloom_block_.Finish();
|
||||
|
||||
properties_.filter_size = finish_result.size();
|
||||
auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
BlockHandle index_block_handle;
|
||||
finish_result = index_builder_->Finish();
|
||||
|
||||
properties_.index_size = finish_result.size();
|
||||
s = WriteBlock(finish_result, file_, &offset_, &index_block_handle);
|
||||
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle);
|
||||
meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock,
|
||||
index_block_handle);
|
||||
}
|
||||
|
||||
// Calculate bloom block size and index block size
|
||||
PropertyBlockBuilder property_block_builder;
|
||||
// -- Add basic properties
|
||||
property_block_builder.AddTableProperty(properties_);
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include "table/plain_table_key_coding.h"
|
||||
#include "rocksdb/table.h"
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/plain_table_index.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
@ -30,7 +32,10 @@ class PlainTableBuilder: public TableBuilder {
|
||||
// that the caller does not know which level the output file will reside.
|
||||
PlainTableBuilder(const Options& options, WritableFile* file,
|
||||
uint32_t user_key_size, EncodingType encoding_type,
|
||||
size_t index_sparseness);
|
||||
size_t index_sparseness, uint32_t bloom_bits_per_key,
|
||||
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
|
||||
double hash_table_ratio = 0,
|
||||
bool store_index_in_file = false);
|
||||
|
||||
// REQUIRES: Either Finish() or Abandon() has been called.
|
||||
~PlainTableBuilder();
|
||||
@ -62,18 +67,59 @@ class PlainTableBuilder: public TableBuilder {
|
||||
// Finish() call, returns the size of the final generated file.
|
||||
uint64_t FileSize() const override;
|
||||
|
||||
bool SaveIndexInFile() const { return store_index_in_file_; }
|
||||
|
||||
private:
|
||||
Arena arena_;
|
||||
Options options_;
|
||||
std::vector<std::unique_ptr<TablePropertiesCollector>>
|
||||
table_properties_collectors_;
|
||||
|
||||
BloomBlockBuilder bloom_block_;
|
||||
std::unique_ptr<PlainTableIndexBuilder> index_builder_;
|
||||
|
||||
WritableFile* file_;
|
||||
uint64_t offset_ = 0;
|
||||
uint32_t bloom_bits_per_key_;
|
||||
uint32_t huge_page_tlb_size_;
|
||||
Status status_;
|
||||
TableProperties properties_;
|
||||
PlainTableKeyEncoder encoder_;
|
||||
|
||||
bool store_index_in_file_;
|
||||
|
||||
std::vector<uint32_t> keys_or_prefixes_hashes_;
|
||||
bool closed_ = false; // Either Finish() or Abandon() has been called.
|
||||
|
||||
const SliceTransform* prefix_extractor_;
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
Slice GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set.
|
||||
// In that case,
|
||||
// it falls back to pure binary search and
|
||||
// total iterator seek is supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
||||
|
||||
// No copying allowed
|
||||
PlainTableBuilder(const PlainTableBuilder&) = delete;
|
||||
void operator=(const PlainTableBuilder&) = delete;
|
||||
|
@ -30,7 +30,9 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
|
||||
const Options& options, const InternalKeyComparator& internal_comparator,
|
||||
WritableFile* file, CompressionType compression_type) const {
|
||||
return new PlainTableBuilder(options, file, user_key_len_, encoding_type_,
|
||||
index_sparseness_);
|
||||
index_sparseness_, bloom_bits_per_key_, 6,
|
||||
huge_page_tlb_size_, hash_table_ratio_,
|
||||
store_index_in_file_);
|
||||
}
|
||||
|
||||
extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) {
|
||||
@ -43,5 +45,11 @@ const std::string PlainTablePropertyNames::kPrefixExtractorName =
|
||||
const std::string PlainTablePropertyNames::kEncodingType =
|
||||
"rocksdb.plain.table.encoding.type";
|
||||
|
||||
const std::string PlainTablePropertyNames::kBloomVersion =
|
||||
"rocksdb.plain.table.bloom.version";
|
||||
|
||||
const std::string PlainTablePropertyNames::kNumBloomBlocks =
|
||||
"rocksdb.plain.table.bloom.numblocks";
|
||||
|
||||
} // namespace rocksdb
|
||||
#endif // ROCKSDB_LITE
|
||||
|
@ -151,7 +151,8 @@ class PlainTableFactory : public TableFactory {
|
||||
index_sparseness_(options.index_sparseness),
|
||||
huge_page_tlb_size_(options.huge_page_tlb_size),
|
||||
encoding_type_(options.encoding_type),
|
||||
full_scan_mode_(options.full_scan_mode) {}
|
||||
full_scan_mode_(options.full_scan_mode),
|
||||
store_index_in_file_(options.store_index_in_file) {}
|
||||
const char* Name() const override { return "PlainTable"; }
|
||||
Status NewTableReader(const Options& options, const EnvOptions& soptions,
|
||||
const InternalKeyComparator& internal_comparator,
|
||||
@ -173,6 +174,7 @@ class PlainTableFactory : public TableFactory {
|
||||
size_t huge_page_tlb_size_;
|
||||
EncodingType encoding_type_;
|
||||
bool full_scan_mode_;
|
||||
bool store_index_in_file_;
|
||||
};
|
||||
|
||||
} // namespace rocksdb
|
||||
|
196
table/plain_table_index.cc
Normal file
196
table/plain_table_index.cc
Normal file
@ -0,0 +1,196 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#include "table/plain_table_index.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/hash.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
namespace {
// Map a 32-bit hash onto one of `num_buckets` hash-table buckets.
// Requires num_buckets > 0.
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  const uint32_t bucket_id = hash % num_buckets;
  return bucket_id;
}
}  // namespace
|
||||
|
||||
// Initialize the index from its serialized form:
//   [varint32 index_size][varint32 num_prefixes][index array][sub-index]
// No bytes are copied: index_ and sub_index_ point directly into `data`, so
// the caller must keep the backing memory alive for this object's lifetime.
void PlainTableIndex::InitFromRawData(Slice data) {
  // Bug fix: GetVarint32 both parses a value AND advances `data`. The
  // original code called it inside assert(), so under NDEBUG the parse was
  // compiled out entirely and the index was never initialized. Perform the
  // calls unconditionally and assert only on the results.
  bool ok = GetVarint32(&data, &index_size_);
  assert(ok);
  assert(index_size_ > 0);
  ok = GetVarint32(&data, &num_prefixes_);
  assert(ok);
  (void)ok;  // silence unused-variable warning when asserts are disabled
  sub_index_size_ = data.size() - index_size_ * kOffsetLen;

  char* index_data_begin = const_cast<char*>(data.data());
  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
}
|
||||
|
||||
// Look up the bucket for `prefix_hash` and classify its contents.
// *bucket_value receives the stored entry (with the sub-index tag bit
// stripped when present). Returns:
//   kSubindex         - entry points into the sub-index,
//   kNoPrefixForBucket - bucket is empty (sentinel >= kMaxFileSize),
//   kDirectToFile     - entry is a direct file offset.
PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
    uint32_t prefix_hash, uint32_t* bucket_value) const {
  const int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
  *bucket_value = index_[bucket];

  const bool tagged_as_sub_index =
      (*bucket_value & kSubIndexMask) == kSubIndexMask;
  if (tagged_as_sub_index) {
    // Strip the tag bit so the caller gets a plain sub-index offset.
    *bucket_value ^= kSubIndexMask;
    return kSubindex;
  }
  // Values at or beyond kMaxFileSize mark an empty bucket; anything smaller
  // points directly into the data file.
  return (*bucket_value >= kMaxFileSize) ? kNoPrefixForBucket : kDirectToFile;
}
|
||||
|
||||
// Append one (hash, offset) record, allocating a new fixed-size group
// whenever the current group fills up.
void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash,
                                                        uint32_t offset) {
  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
    // Current group is full; start a fresh one.
    current_group_ = AllocateNewGroup();
    num_records_in_current_group_ = 0;
  }
  IndexRecord& rec = current_group_[num_records_in_current_group_++];
  rec.hash = hash;
  rec.offset = offset;
  rec.next = nullptr;
}
|
||||
|
||||
// Register one key (located at file offset `key_offset`) under its prefix.
// An index record is emitted for the first key of each new prefix, and then
// again every `index_sparseness_` keys within the same prefix (tracked via
// the due_index_ flag). Also maintains the keys-per-prefix histogram; note
// the final prefix's count is flushed later, in Finish().
void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
                                          uint64_t key_offset) {
  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
    // New prefix encountered: close out the previous prefix's statistics
    // and reset per-prefix state.
    ++num_prefixes_;
    if (!is_first_record_) {
      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
    }
    num_keys_per_prefix_ = 0;
    prev_key_prefix_ = key_prefix_slice.ToString();
    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
    due_index_ = true;
  }

  if (due_index_) {
    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
    due_index_ = false;
  }

  num_keys_per_prefix_++;
  // index_sparseness_ == 0 means "index every key"; otherwise schedule the
  // next index record after each sparseness interval.
  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
    due_index_ = true;
  }
  is_first_record_ = false;
}
|
||||
|
||||
// Serialize the accumulated index into a single block and return it.
// Sizes the bucket array (AllocateIndex), distributes records into buckets
// (BucketizeIndexes), logs the keys-per-prefix histogram, then encodes
// everything (FillIndexes).
Slice PlainTableIndexBuilder::Finish() {
  AllocateIndex();
  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);

  // Flush the last prefix's key count, which AddKeyPrefix never closes out.
  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
      keys_per_prefix_hist_.ToString().c_str());

  // From the temp data structure, populate indexes.
  return FillIndexes(hash_to_offsets, entries_per_bucket);
}
|
||||
|
||||
void PlainTableIndexBuilder::AllocateIndex() {
|
||||
if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
|
||||
// Fall back to pure binary search if the user fails to specify a prefix
|
||||
// extractor.
|
||||
index_size_ = 1;
|
||||
} else {
|
||||
double hash_table_size_multipier = 1.0 / hash_table_ratio_;
|
||||
index_size_ = num_prefixes_ * hash_table_size_multipier + 1;
|
||||
assert(index_size_ > 0);
|
||||
}
|
||||
}
|
||||
|
||||
void PlainTableIndexBuilder::BucketizeIndexes(
|
||||
std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket) {
|
||||
bool first = true;
|
||||
uint32_t prev_hash = 0;
|
||||
size_t num_records = record_list_.GetNumRecords();
|
||||
for (size_t i = 0; i < num_records; i++) {
|
||||
IndexRecord* index_record = record_list_.At(i);
|
||||
uint32_t cur_hash = index_record->hash;
|
||||
if (first || prev_hash != cur_hash) {
|
||||
prev_hash = cur_hash;
|
||||
first = false;
|
||||
}
|
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
|
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
|
||||
index_record->next = prev_bucket_head;
|
||||
(*hash_to_offsets)[bucket] = index_record;
|
||||
(*entries_per_bucket)[bucket]++;
|
||||
}
|
||||
|
||||
sub_index_size_ = 0;
|
||||
for (auto entry_count : *entries_per_bucket) {
|
||||
if (entry_count <= 1) {
|
||||
continue;
|
||||
}
|
||||
// Only buckets with more than 1 entry will have subindex.
|
||||
sub_index_size_ += VarintLength(entry_count);
|
||||
// total bytes needed to store these entries' in-file offsets.
|
||||
sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
|
||||
}
|
||||
}
|
||||
|
||||
Slice PlainTableIndexBuilder::FillIndexes(
|
||||
const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket) {
|
||||
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
||||
sub_index_size_);
|
||||
auto total_allocate_size = GetTotalSize();
|
||||
char* allocated = arena_->AllocateAligned(
|
||||
total_allocate_size, huge_page_tlb_size_, options_.info_log.get());
|
||||
|
||||
auto temp_ptr = EncodeVarint32(allocated, index_size_);
|
||||
uint32_t* index =
|
||||
reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
|
||||
char* sub_index = reinterpret_cast<char*>(index + index_size_);
|
||||
|
||||
size_t sub_index_offset = 0;
|
||||
for (uint32_t i = 0; i < index_size_; i++) {
|
||||
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
||||
switch (num_keys_for_bucket) {
|
||||
case 0:
|
||||
// No key for bucket
|
||||
index[i] = PlainTableIndex::kMaxFileSize;
|
||||
break;
|
||||
case 1:
|
||||
// point directly to the file offset
|
||||
index[i] = hash_to_offsets[i]->offset;
|
||||
break;
|
||||
default:
|
||||
// point to second level indexes.
|
||||
index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask;
|
||||
char* prev_ptr = &sub_index[sub_index_offset];
|
||||
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
|
||||
sub_index_offset += (cur_ptr - prev_ptr);
|
||||
char* sub_index_pos = &sub_index[sub_index_offset];
|
||||
IndexRecord* record = hash_to_offsets[i];
|
||||
int j;
|
||||
for (j = num_keys_for_bucket - 1; j >= 0 && record;
|
||||
j--, record = record->next) {
|
||||
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
|
||||
}
|
||||
assert(j == -1 && record == nullptr);
|
||||
sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
|
||||
assert(sub_index_offset <= sub_index_size_);
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(sub_index_offset == sub_index_size_);
|
||||
|
||||
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
|
||||
index_size_, sub_index_size_);
|
||||
return Slice(allocated, GetTotalSize());
|
||||
}
|
||||
|
||||
const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
|
||||
"PlainTableIndexBlock";
|
||||
}; // namespace rocksdb
|
221
table/plain_table_index.h
Normal file
221
table/plain_table_index.h
Normal file
@ -0,0 +1,221 @@
|
||||
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
||||
// This source code is licensed under the BSD-style license found in the
|
||||
// LICENSE file in the root directory of this source tree. An additional grant
|
||||
// of patent rights can be found in the PATENTS file in the same directory.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "db/dbformat.h"
|
||||
#include "rocksdb/options.h"
|
||||
#include "util/murmurhash.h"
|
||||
#include "util/hash.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/histogram.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
// PlainTableIndex contains buckets size of index_size_, each is a
|
||||
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
||||
// and the first bit of the integer indicates type of the offset.
|
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
|
||||
// hashing this prefix), whose first row starts from this offset of the
|
||||
// file.
|
||||
// 1 indicates that the bucket contains more than one prefixes, or there
|
||||
// are too many rows for one prefix so we need a binary search for it. In
|
||||
// this case, the offset indicates the offset of sub_index_ holding the
|
||||
// binary search indexes of keys for those rows. Those binary search indexes
|
||||
// are organized in this way:
|
||||
//
|
||||
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
||||
// it, there are N 32-bit integers, each points of an offset of the file,
|
||||
// which
|
||||
// points to starting of a row. Those offsets need to be guaranteed to be in
|
||||
// ascending order so the keys they are pointing to are also in ascending
|
||||
// order
|
||||
// to make sure we can use them to do binary searches. Below is visual
|
||||
// presentation of a bucket.
|
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
class PlainTableIndex {
|
||||
public:
|
||||
enum IndexSearchResult {
|
||||
kNoPrefixForBucket = 0,
|
||||
kDirectToFile = 1,
|
||||
kSubindex = 2
|
||||
};
|
||||
|
||||
explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
|
||||
|
||||
PlainTableIndex()
|
||||
: index_size_(0),
|
||||
sub_index_size_(0),
|
||||
num_prefixes_(0),
|
||||
index_(nullptr),
|
||||
sub_index_(nullptr) {}
|
||||
|
||||
IndexSearchResult GetOffset(uint32_t prefix_hash,
|
||||
uint32_t* bucket_value) const;
|
||||
|
||||
void InitFromRawData(Slice data);
|
||||
|
||||
const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
|
||||
uint32_t* upper_bound) const {
|
||||
const char* index_ptr = &sub_index_[offset];
|
||||
return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
|
||||
}
|
||||
|
||||
uint32_t GetIndexSize() const { return index_size_; }
|
||||
|
||||
uint32_t GetSubIndexSize() const { return sub_index_size_; }
|
||||
|
||||
uint32_t GetNumPrefixes() const { return num_prefixes_; }
|
||||
|
||||
static const uint64_t kMaxFileSize = (1u << 31) - 1;
|
||||
static const uint32_t kSubIndexMask = 0x80000000;
|
||||
static const size_t kOffsetLen = sizeof(uint32_t);
|
||||
|
||||
private:
|
||||
uint32_t index_size_;
|
||||
size_t sub_index_size_;
|
||||
uint32_t num_prefixes_;
|
||||
|
||||
uint32_t* index_;
|
||||
char* sub_index_;
|
||||
};
|
||||
|
||||
// PlainTableIndexBuilder is used to create plain table index.
|
||||
// After calling Finish(), it returns Slice, which is usually
|
||||
// used either to initialize PlainTableIndex or
|
||||
// to save index to sst file.
|
||||
// For more details about the index, please refer to:
|
||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||
// #wiki-in-memory-index-format
|
||||
class PlainTableIndexBuilder {
|
||||
public:
|
||||
PlainTableIndexBuilder(Arena* arena, const Options& options,
|
||||
uint32_t index_sparseness, double hash_table_ratio,
|
||||
double huge_page_tlb_size)
|
||||
: arena_(arena),
|
||||
options_(options),
|
||||
record_list_(kRecordsPerGroup),
|
||||
is_first_record_(true),
|
||||
due_index_(false),
|
||||
num_prefixes_(0),
|
||||
num_keys_per_prefix_(0),
|
||||
prev_key_prefix_hash_(0),
|
||||
index_sparseness_(index_sparseness),
|
||||
prefix_extractor_(options.prefix_extractor.get()),
|
||||
hash_table_ratio_(hash_table_ratio),
|
||||
huge_page_tlb_size_(huge_page_tlb_size) {}
|
||||
|
||||
void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset);
|
||||
|
||||
Slice Finish();
|
||||
|
||||
uint32_t GetTotalSize() const {
|
||||
return VarintLength(index_size_) + VarintLength(num_prefixes_) +
|
||||
PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
|
||||
}
|
||||
|
||||
static const std::string kPlainTableIndexBlock;
|
||||
|
||||
private:
|
||||
struct IndexRecord {
|
||||
uint32_t hash; // hash of the prefix
|
||||
uint32_t offset; // offset of a row
|
||||
IndexRecord* next;
|
||||
};
|
||||
|
||||
// Helper class to track all the index records
|
||||
class IndexRecordList {
|
||||
public:
|
||||
explicit IndexRecordList(size_t num_records_per_group)
|
||||
: kNumRecordsPerGroup(num_records_per_group),
|
||||
current_group_(nullptr),
|
||||
num_records_in_current_group_(num_records_per_group) {}
|
||||
|
||||
~IndexRecordList() {
|
||||
for (size_t i = 0; i < groups_.size(); i++) {
|
||||
delete[] groups_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void AddRecord(murmur_t hash, uint32_t offset);
|
||||
|
||||
size_t GetNumRecords() const {
|
||||
return (groups_.size() - 1) * kNumRecordsPerGroup +
|
||||
num_records_in_current_group_;
|
||||
}
|
||||
IndexRecord* At(size_t index) {
|
||||
return &(groups_[index / kNumRecordsPerGroup]
|
||||
[index % kNumRecordsPerGroup]);
|
||||
}
|
||||
|
||||
private:
|
||||
IndexRecord* AllocateNewGroup() {
|
||||
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
|
||||
groups_.push_back(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Each group in `groups_` contains fix-sized records (determined by
|
||||
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
|
||||
// occurs.
|
||||
const size_t kNumRecordsPerGroup;
|
||||
IndexRecord* current_group_;
|
||||
// List of arrays allocated
|
||||
std::vector<IndexRecord*> groups_;
|
||||
size_t num_records_in_current_group_;
|
||||
};
|
||||
|
||||
void AllocateIndex();
|
||||
|
||||
// Internal helper function to bucket index record list to hash buckets.
|
||||
void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket);
|
||||
|
||||
// Internal helper class to fill the indexes and bloom filters to internal
|
||||
// data structures.
|
||||
Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket);
|
||||
|
||||
Arena* arena_;
|
||||
Options options_;
|
||||
HistogramImpl keys_per_prefix_hist_;
|
||||
IndexRecordList record_list_;
|
||||
bool is_first_record_;
|
||||
bool due_index_;
|
||||
uint32_t num_prefixes_;
|
||||
uint32_t num_keys_per_prefix_;
|
||||
|
||||
uint32_t prev_key_prefix_hash_;
|
||||
uint32_t index_sparseness_;
|
||||
uint32_t index_size_;
|
||||
size_t sub_index_size_;
|
||||
|
||||
const SliceTransform* prefix_extractor_;
|
||||
double hash_table_ratio_;
|
||||
double huge_page_tlb_size_;
|
||||
|
||||
std::string prev_key_prefix_;
|
||||
|
||||
static const size_t kRecordsPerGroup = 256;
|
||||
};
|
||||
|
||||
}; // namespace rocksdb
|
@ -3,6 +3,7 @@
|
||||
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||
|
||||
#ifndef ROCKSDB_LITE
|
||||
|
||||
#include "table/plain_table_reader.h"
|
||||
|
||||
#include <string>
|
||||
@ -18,6 +19,7 @@
|
||||
#include "rocksdb/statistics.h"
|
||||
|
||||
#include "table/block.h"
|
||||
#include "table/bloom_block.h"
|
||||
#include "table/filter_block.h"
|
||||
#include "table/format.h"
|
||||
#include "table/meta_blocks.h"
|
||||
@ -39,15 +41,6 @@ namespace rocksdb {
|
||||
|
||||
namespace {
|
||||
|
||||
inline uint32_t GetSliceHash(const Slice& s) {
|
||||
return Hash(s.data(), s.size(), 397) ;
|
||||
}
|
||||
|
||||
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
|
||||
assert(num_buckets >= 0);
|
||||
return hash % num_buckets;
|
||||
}
|
||||
|
||||
// Safely getting a uint32_t element from a char array, where, starting from
|
||||
// `base`, every 4 bytes are considered as an fixed 32 bit integer.
|
||||
inline uint32_t GetFixed32Element(const char* base, size_t offset) {
|
||||
@ -103,6 +96,7 @@ PlainTableReader::PlainTableReader(const Options& options,
|
||||
const TableProperties* table_properties)
|
||||
: internal_comparator_(icomparator),
|
||||
encoding_type_(encoding_type),
|
||||
full_scan_mode_(false),
|
||||
data_end_offset_(table_properties->data_size),
|
||||
user_key_len_(table_properties->fixed_key_len),
|
||||
prefix_extractor_(options.prefix_extractor.get()),
|
||||
@ -126,8 +120,7 @@ Status PlainTableReader::Open(const Options& options,
|
||||
double hash_table_ratio, size_t index_sparseness,
|
||||
size_t huge_page_tlb_size, bool full_scan_mode) {
|
||||
assert(options.allow_mmap_reads);
|
||||
|
||||
if (file_size > kMaxFileSize) {
|
||||
if (file_size > PlainTableIndex::kMaxFileSize) {
|
||||
return Status::NotSupported("File is too large for PlainTableReader!");
|
||||
}
|
||||
|
||||
@ -173,7 +166,6 @@ Status PlainTableReader::Open(const Options& options,
|
||||
return s;
|
||||
}
|
||||
|
||||
// -- Populate Index
|
||||
if (!full_scan_mode) {
|
||||
s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio,
|
||||
index_sparseness, huge_page_tlb_size);
|
||||
@ -183,7 +175,7 @@ Status PlainTableReader::Open(const Options& options,
|
||||
} else {
|
||||
// Flag to indicate it is a full scan mode so that none of the indexes
|
||||
// can be used.
|
||||
new_reader->index_size_ = kFullScanModeFlag;
|
||||
new_reader->full_scan_mode_ = true;
|
||||
}
|
||||
|
||||
*table_reader = std::move(new_reader);
|
||||
@ -203,79 +195,15 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
|
||||
}
|
||||
}
|
||||
|
||||
struct PlainTableReader::IndexRecord {
|
||||
uint32_t hash; // hash of the prefix
|
||||
uint32_t offset; // offset of a row
|
||||
IndexRecord* next;
|
||||
};
|
||||
|
||||
// Helper class to track all the index records
|
||||
class PlainTableReader::IndexRecordList {
|
||||
public:
|
||||
explicit IndexRecordList(size_t num_records_per_group)
|
||||
: kNumRecordsPerGroup(num_records_per_group),
|
||||
current_group_(nullptr),
|
||||
num_records_in_current_group_(num_records_per_group) {}
|
||||
|
||||
~IndexRecordList() {
|
||||
for (size_t i = 0; i < groups_.size(); i++) {
|
||||
delete[] groups_[i];
|
||||
}
|
||||
}
|
||||
|
||||
void AddRecord(murmur_t hash, uint32_t offset) {
|
||||
if (num_records_in_current_group_ == kNumRecordsPerGroup) {
|
||||
current_group_ = AllocateNewGroup();
|
||||
num_records_in_current_group_ = 0;
|
||||
}
|
||||
auto& new_record = current_group_[num_records_in_current_group_++];
|
||||
new_record.hash = hash;
|
||||
new_record.offset = offset;
|
||||
new_record.next = nullptr;
|
||||
}
|
||||
|
||||
size_t GetNumRecords() const {
|
||||
return (groups_.size() - 1) * kNumRecordsPerGroup +
|
||||
num_records_in_current_group_;
|
||||
}
|
||||
IndexRecord* At(size_t index) {
|
||||
return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]);
|
||||
}
|
||||
|
||||
private:
|
||||
IndexRecord* AllocateNewGroup() {
|
||||
IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
|
||||
groups_.push_back(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Each group in `groups_` contains fix-sized records (determined by
|
||||
// kNumRecordsPerGroup). Which can help us minimize the cost if resizing
|
||||
// occurs.
|
||||
const size_t kNumRecordsPerGroup;
|
||||
IndexRecord* current_group_;
|
||||
// List of arrays allocated
|
||||
std::vector<IndexRecord*> groups_;
|
||||
size_t num_records_in_current_group_;
|
||||
};
|
||||
|
||||
Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
int* num_prefixes,
|
||||
int bloom_bits_per_key,
|
||||
size_t index_sparseness) {
|
||||
Status PlainTableReader::PopulateIndexRecordList(
|
||||
PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
|
||||
Slice prev_key_prefix_slice;
|
||||
uint32_t prev_key_prefix_hash = 0;
|
||||
uint32_t pos = data_start_offset_;
|
||||
int num_keys_per_prefix = 0;
|
||||
bool is_first_record = true;
|
||||
HistogramImpl keys_per_prefix_hist;
|
||||
// Need map to be ordered to make sure sub indexes generated
|
||||
// are in order.
|
||||
|
||||
*num_prefixes = 0;
|
||||
bool is_first_record = true;
|
||||
Slice key_prefix_slice;
|
||||
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
|
||||
options_.prefix_extractor.get());
|
||||
bool due_index = false;
|
||||
while (pos < data_end_offset_) {
|
||||
uint32_t key_offset = pos;
|
||||
ParsedInternalKey key;
|
||||
@ -285,152 +213,53 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
||||
key_prefix_slice = GetPrefix(key);
|
||||
if (enable_bloom_) {
|
||||
// total order mode and bloom filter is enabled.
|
||||
bloom_.AddHash(GetSliceHash(key.user_key));
|
||||
}
|
||||
Slice key_prefix_slice = GetPrefix(key);
|
||||
|
||||
} else {
|
||||
if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
|
||||
++(*num_prefixes);
|
||||
if (!is_first_record) {
|
||||
keys_per_prefix_hist.Add(num_keys_per_prefix);
|
||||
prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
|
||||
}
|
||||
num_keys_per_prefix = 0;
|
||||
prev_key_prefix_slice = key_prefix_slice;
|
||||
prev_key_prefix_hash = GetSliceHash(key_prefix_slice);
|
||||
due_index = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (due_index) {
|
||||
if (!seekable) {
|
||||
index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
|
||||
|
||||
if (!seekable && is_first_record) {
|
||||
return Status::Corruption("Key for a prefix is not seekable");
|
||||
}
|
||||
// Add an index key for every kIndexIntervalForSamePrefixKeys keys
|
||||
record_list->AddRecord(prev_key_prefix_hash, key_offset);
|
||||
due_index = false;
|
||||
}
|
||||
|
||||
num_keys_per_prefix++;
|
||||
if (index_sparseness == 0 || num_keys_per_prefix % index_sparseness == 0) {
|
||||
due_index = true;
|
||||
}
|
||||
is_first_record = false;
|
||||
}
|
||||
|
||||
keys_per_prefix_hist.Add(num_keys_per_prefix);
|
||||
Log(options_.info_log, "Number of Keys per prefix Histogram: %s",
|
||||
keys_per_prefix_hist.ToString().c_str());
|
||||
|
||||
prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
|
||||
index_.InitFromRawData(index_builder->Finish());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void PlainTableReader::AllocateIndexAndBloom(int num_prefixes,
|
||||
int bloom_bits_per_key,
|
||||
double hash_table_ratio,
|
||||
size_t huge_page_tlb_size) {
|
||||
if (prefix_extractor_ != nullptr) {
|
||||
void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key,
|
||||
int num_prefixes,
|
||||
size_t huge_page_tlb_size,
|
||||
vector<uint32_t>* prefix_hashes) {
|
||||
if (!IsTotalOrderMode()) {
|
||||
uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key;
|
||||
if (bloom_total_bits > 0) {
|
||||
enable_bloom_ = true;
|
||||
bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality,
|
||||
huge_page_tlb_size, options_.info_log.get());
|
||||
FillBloom(prefix_hashes);
|
||||
}
|
||||
}
|
||||
|
||||
if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) {
|
||||
// Fall back to pure binary search if the user fails to specify a prefix
|
||||
// extractor.
|
||||
index_size_ = 1;
|
||||
} else {
|
||||
double hash_table_size_multipier = 1.0 / hash_table_ratio;
|
||||
index_size_ = num_prefixes * hash_table_size_multipier + 1;
|
||||
}
|
||||
}
|
||||
|
||||
size_t PlainTableReader::BucketizeIndexesAndFillBloom(
|
||||
IndexRecordList* record_list, std::vector<IndexRecord*>* hash_to_offsets,
|
||||
std::vector<uint32_t>* entries_per_bucket) {
|
||||
bool first = true;
|
||||
uint32_t prev_hash = 0;
|
||||
size_t num_records = record_list->GetNumRecords();
|
||||
for (size_t i = 0; i < num_records; i++) {
|
||||
IndexRecord* index_record = record_list->At(i);
|
||||
uint32_t cur_hash = index_record->hash;
|
||||
if (first || prev_hash != cur_hash) {
|
||||
prev_hash = cur_hash;
|
||||
first = false;
|
||||
if (enable_bloom_ && !IsTotalOrderMode()) {
|
||||
bloom_.AddHash(cur_hash);
|
||||
void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
|
||||
assert(bloom_.IsInitialized());
|
||||
for (auto prefix_hash : *prefix_hashes) {
|
||||
bloom_.AddHash(prefix_hash);
|
||||
}
|
||||
}
|
||||
uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
|
||||
IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
|
||||
index_record->next = prev_bucket_head;
|
||||
(*hash_to_offsets)[bucket] = index_record;
|
||||
(*entries_per_bucket)[bucket]++;
|
||||
}
|
||||
size_t sub_index_size = 0;
|
||||
for (auto entry_count : *entries_per_bucket) {
|
||||
if (entry_count <= 1) {
|
||||
continue;
|
||||
}
|
||||
// Only buckets with more than 1 entry will have subindex.
|
||||
sub_index_size += VarintLength(entry_count);
|
||||
// total bytes needed to store these entries' in-file offsets.
|
||||
sub_index_size += entry_count * kOffsetLen;
|
||||
}
|
||||
return sub_index_size;
|
||||
}
|
||||
|
||||
void PlainTableReader::FillIndexes(
|
||||
const size_t kSubIndexSize,
|
||||
const std::vector<IndexRecord*>& hash_to_offsets,
|
||||
const std::vector<uint32_t>& entries_per_bucket,
|
||||
size_t huge_page_tlb_size) {
|
||||
Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index",
|
||||
kSubIndexSize);
|
||||
auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize;
|
||||
char* allocated = arena_.AllocateAligned(
|
||||
total_allocate_size, huge_page_tlb_size, options_.info_log.get());
|
||||
index_ = reinterpret_cast<uint32_t*>(allocated);
|
||||
sub_index_ = allocated + sizeof(uint32_t) * index_size_;
|
||||
|
||||
size_t sub_index_offset = 0;
|
||||
for (int i = 0; i < index_size_; i++) {
|
||||
uint32_t num_keys_for_bucket = entries_per_bucket[i];
|
||||
switch (num_keys_for_bucket) {
|
||||
case 0:
|
||||
// No key for bucket
|
||||
index_[i] = data_end_offset_;
|
||||
break;
|
||||
case 1:
|
||||
// point directly to the file offset
|
||||
index_[i] = hash_to_offsets[i]->offset;
|
||||
break;
|
||||
default:
|
||||
// point to second level indexes.
|
||||
index_[i] = sub_index_offset | kSubIndexMask;
|
||||
char* prev_ptr = &sub_index_[sub_index_offset];
|
||||
char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
|
||||
sub_index_offset += (cur_ptr - prev_ptr);
|
||||
char* sub_index_pos = &sub_index_[sub_index_offset];
|
||||
IndexRecord* record = hash_to_offsets[i];
|
||||
int j;
|
||||
for (j = num_keys_for_bucket - 1; j >= 0 && record;
|
||||
j--, record = record->next) {
|
||||
EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
|
||||
}
|
||||
assert(j == -1 && record == nullptr);
|
||||
sub_index_offset += kOffsetLen * num_keys_for_bucket;
|
||||
assert(sub_index_offset <= kSubIndexSize);
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(sub_index_offset == kSubIndexSize);
|
||||
|
||||
Log(options_.info_log, "hash table size: %d, suffix_map length %zu",
|
||||
index_size_, kSubIndexSize);
|
||||
}
|
||||
|
||||
Status PlainTableReader::MmapDataFile() {
|
||||
@ -445,21 +274,50 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
size_t huge_page_tlb_size) {
|
||||
assert(props != nullptr);
|
||||
table_properties_.reset(props);
|
||||
// options.prefix_extractor is requried for a hash-based look-up.
|
||||
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
|
||||
|
||||
BlockContents bloom_block_contents;
|
||||
auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
|
||||
options_.env, BloomBlockBuilder::kBloomBlock,
|
||||
&bloom_block_contents);
|
||||
bool index_in_file = s.ok();
|
||||
|
||||
BlockContents index_block_contents;
|
||||
s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
|
||||
options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
|
||||
&index_block_contents);
|
||||
|
||||
index_in_file &= s.ok();
|
||||
|
||||
Slice* bloom_block;
|
||||
if (index_in_file) {
|
||||
bloom_block = &bloom_block_contents.data;
|
||||
} else {
|
||||
bloom_block = nullptr;
|
||||
}
|
||||
|
||||
// index_in_file == true only if there are kBloomBlock and
|
||||
// kPlainTableIndexBlock
|
||||
// in file
|
||||
|
||||
Slice* index_block;
|
||||
if (index_in_file) {
|
||||
index_block = &index_block_contents.data;
|
||||
} else {
|
||||
index_block = nullptr;
|
||||
}
|
||||
|
||||
if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) {
|
||||
// options.prefix_extractor is requried for a hash-based look-up.
|
||||
return Status::NotSupported(
|
||||
"PlainTable requires a prefix extractor enable prefix hash mode.");
|
||||
}
|
||||
|
||||
IndexRecordList record_list(kRecordsPerGroup);
|
||||
// First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
|
||||
// for a prefix (starting from the first one), generate a record of (hash,
|
||||
// offset) and append it to IndexRecordList, which is a data structure created
|
||||
// to store them.
|
||||
int num_prefixes;
|
||||
|
||||
if (!index_in_file) {
|
||||
// Allocate bloom filter here for total order mode.
|
||||
if (IsTotalOrderMode()) {
|
||||
uint32_t num_bloom_bits =
|
||||
@ -470,34 +328,57 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
|
||||
huge_page_tlb_size, options_.info_log.get());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
enable_bloom_ = true;
|
||||
auto num_blocks_property = props->user_collected_properties.find(
|
||||
PlainTablePropertyNames::kNumBloomBlocks);
|
||||
|
||||
Status s = PopulateIndexRecordList(&record_list, &num_prefixes,
|
||||
bloom_bits_per_key, index_sparseness);
|
||||
uint32_t num_blocks = 0;
|
||||
if (num_blocks_property != props->user_collected_properties.end()) {
|
||||
Slice temp_slice(num_blocks_property->second);
|
||||
if (!GetVarint32(&temp_slice, &num_blocks)) {
|
||||
num_blocks = 0;
|
||||
}
|
||||
}
|
||||
// cast away const qualifier, because bloom_ won't be changed
|
||||
bloom_.SetRawData(
|
||||
const_cast<unsigned char*>(
|
||||
reinterpret_cast<const unsigned char*>(bloom_block->data())),
|
||||
bloom_block->size() * 8, num_blocks);
|
||||
}
|
||||
|
||||
PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness,
|
||||
hash_table_ratio, huge_page_tlb_size);
|
||||
|
||||
std::vector<uint32_t> prefix_hashes;
|
||||
if (!index_in_file) {
|
||||
Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
// Calculated hash table and bloom filter size and allocate memory for indexes
|
||||
// and bloom filter based on the number of prefixes.
|
||||
AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio,
|
||||
huge_page_tlb_size);
|
||||
} else {
|
||||
index_.InitFromRawData(*index_block);
|
||||
}
|
||||
|
||||
// Bucketize all the index records to a temp data structure, in which for
|
||||
// each bucket, we generate a linked list of IndexRecord, in reversed order.
|
||||
std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
|
||||
std::vector<uint32_t> entries_per_bucket(index_size_, 0);
|
||||
size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
|
||||
&record_list, &hash_to_offsets, &entries_per_bucket);
|
||||
// From the temp data structure, populate indexes.
|
||||
FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
|
||||
huge_page_tlb_size);
|
||||
if (!index_in_file) {
|
||||
// Calculated bloom filter size and allocate memory for
|
||||
// bloom filter based on the number of prefixes, then fill it.
|
||||
AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
|
||||
huge_page_tlb_size, &prefix_hashes);
|
||||
}
|
||||
|
||||
// Fill two table properties.
|
||||
// TODO(sdong): after we have the feature of storing index in file, this
|
||||
// properties need to be populated to index_size instead.
|
||||
if (!index_in_file) {
|
||||
props->user_collected_properties["plain_table_hash_table_size"] =
|
||||
std::to_string(index_size_ * 4U);
|
||||
std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
|
||||
props->user_collected_properties["plain_table_sub_index_size"] =
|
||||
std::to_string(sub_index_size_needed);
|
||||
std::to_string(index_.GetSubIndexSize());
|
||||
} else {
|
||||
props->user_collected_properties["plain_table_hash_table_size"] =
|
||||
std::to_string(0);
|
||||
props->user_collected_properties["plain_table_sub_index_size"] =
|
||||
std::to_string(0);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
@ -506,24 +387,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
|
||||
uint32_t prefix_hash, bool& prefix_matched,
|
||||
uint32_t* offset) const {
|
||||
prefix_matched = false;
|
||||
int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
|
||||
uint32_t bucket_value = index_[bucket];
|
||||
if (bucket_value == data_end_offset_) {
|
||||
uint32_t prefix_index_offset;
|
||||
auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
|
||||
if (res == PlainTableIndex::kNoPrefixForBucket) {
|
||||
*offset = data_end_offset_;
|
||||
return Status::OK();
|
||||
} else if ((bucket_value & kSubIndexMask) == 0) {
|
||||
// point directly to the file
|
||||
*offset = bucket_value;
|
||||
} else if (res == PlainTableIndex::kDirectToFile) {
|
||||
*offset = prefix_index_offset;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// point to sub-index, need to do a binary search
|
||||
uint32_t upper_bound;
|
||||
const char* base_ptr =
|
||||
index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
|
||||
uint32_t low = 0;
|
||||
uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
|
||||
|
||||
const char* index_ptr = &sub_index_[prefix_index_offset];
|
||||
uint32_t upper_bound = 0;
|
||||
const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
|
||||
uint32_t high = upper_bound;
|
||||
ParsedInternalKey mid_key;
|
||||
ParsedInternalKey parsed_target;
|
||||
@ -593,9 +471,6 @@ bool PlainTableReader::MatchBloom(uint32_t hash) const {
|
||||
return !enable_bloom_ || bloom_.MayContainHash(hash);
|
||||
}
|
||||
|
||||
Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
|
||||
ParsedInternalKey* parsed_key,
|
||||
@ -650,8 +525,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
|
||||
Slice prefix_slice;
|
||||
uint32_t prefix_hash;
|
||||
if (IsTotalOrderMode()) {
|
||||
if (index_size_ == kFullScanModeFlag) {
|
||||
// Full Scan Mode
|
||||
if (full_scan_mode_) {
|
||||
status_ =
|
||||
Status::InvalidArgument("Get() is not allowed in full scan mode.");
|
||||
}
|
||||
@ -682,7 +556,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
|
||||
if (!ParseInternalKey(target, &parsed_target)) {
|
||||
return Status::Corruption(Slice());
|
||||
}
|
||||
|
||||
Slice found_value;
|
||||
PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
|
||||
options_.prefix_extractor.get());
|
||||
@ -747,13 +620,12 @@ void PlainTableIterator::Seek(const Slice& target) {
|
||||
// If the user doesn't set prefix seek option and we are not able to do a
|
||||
// total Seek(). assert failure.
|
||||
if (!use_prefix_seek_) {
|
||||
if (table_->index_size_ == PlainTableReader::kFullScanModeFlag) {
|
||||
// Full Scan Mode.
|
||||
if (table_->full_scan_mode_) {
|
||||
status_ =
|
||||
Status::InvalidArgument("Seek() is not allowed in full scan mode.");
|
||||
offset_ = next_offset_ = table_->data_end_offset_;
|
||||
return;
|
||||
} else if (table_->index_size_ > 1) {
|
||||
} else if (table_->GetIndexSize() > 1) {
|
||||
assert(false);
|
||||
status_ = Status::NotSupported(
|
||||
"PlainTable cannot issue non-prefix seek unless in total order "
|
||||
|
@ -19,12 +19,14 @@
|
||||
#include "rocksdb/table_properties.h"
|
||||
#include "table/table_reader.h"
|
||||
#include "table/plain_table_factory.h"
|
||||
#include "table/plain_table_index.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/dynamic_bloom.h"
|
||||
|
||||
namespace rocksdb {
|
||||
|
||||
class Block;
|
||||
class BlockContents;
|
||||
class BlockHandle;
|
||||
class Footer;
|
||||
struct Options;
|
||||
@ -37,6 +39,7 @@ class PlainTableKeyDecoder;
|
||||
|
||||
using std::unique_ptr;
|
||||
using std::unordered_map;
|
||||
using std::vector;
|
||||
extern const uint32_t kPlainTableVariableLength;
|
||||
|
||||
// Based on following output file format shown in plain_table_factory.h
|
||||
@ -68,6 +71,7 @@ class PlainTableReader: public TableReader {
|
||||
|
||||
uint64_t ApproximateOffsetOf(const Slice& key);
|
||||
|
||||
uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
|
||||
void SetupForCompaction();
|
||||
|
||||
std::shared_ptr<const TableProperties> GetTableProperties() const {
|
||||
@ -93,65 +97,23 @@ class PlainTableReader: public TableReader {
|
||||
// props: the table properties object that need to be stored. Ownership of
|
||||
// the object will be passed.
|
||||
//
|
||||
// index_ contains buckets size of index_size_, each is a
|
||||
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
||||
// and the first bit of the integer indicates type of the offset.
|
||||
//
|
||||
// +--------------+------------------------------------------------------+
|
||||
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
||||
// +--------------+------------------------------------------------------+
|
||||
//
|
||||
// Explanation for the "flag bit":
|
||||
//
|
||||
// 0 indicates that the bucket contains only one prefix (no conflict when
|
||||
// hashing this prefix), whose first row starts from this offset of the
|
||||
// file.
|
||||
// 1 indicates that the bucket contains more than one prefixes, or there
|
||||
// are too many rows for one prefix so we need a binary search for it. In
|
||||
// this case, the offset indicates the offset of sub_index_ holding the
|
||||
// binary search indexes of keys for those rows. Those binary search indexes
|
||||
// are organized in this way:
|
||||
//
|
||||
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
||||
// it, there are N 32-bit integers, each points of an offset of the file,
|
||||
// which
|
||||
// points to starting of a row. Those offsets need to be guaranteed to be in
|
||||
// ascending order so the keys they are pointing to are also in ascending
|
||||
// order
|
||||
// to make sure we can use them to do binary searches. Below is visual
|
||||
// presentation of a bucket.
|
||||
//
|
||||
// <begin>
|
||||
// number_of_records: varint32
|
||||
// record 1 file offset: fixedint32
|
||||
// record 2 file offset: fixedint32
|
||||
// ....
|
||||
// record N file offset: fixedint32
|
||||
// <end>
|
||||
|
||||
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
|
||||
double hash_table_ratio, size_t index_sparseness,
|
||||
size_t huge_page_tlb_size);
|
||||
|
||||
Status MmapDataFile();
|
||||
|
||||
private:
|
||||
struct IndexRecord;
|
||||
class IndexRecordList;
|
||||
|
||||
// Plain table maintains an index and a sub index.
|
||||
// index is implemented by a hash table.
|
||||
// subindex is a big of memory array.
|
||||
// For more details about the in-memory index, please refer to:
|
||||
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
||||
// #wiki-in-memory-index-format
|
||||
uint32_t* index_;
|
||||
int index_size_ = 0;
|
||||
char* sub_index_;
|
||||
const InternalKeyComparator internal_comparator_;
|
||||
EncodingType encoding_type_;
|
||||
// represents plain table's current status.
|
||||
Status status_;
|
||||
Slice file_data_;
|
||||
|
||||
PlainTableIndex index_;
|
||||
bool full_scan_mode_;
|
||||
|
||||
// data_start_offset_ and data_end_offset_ defines the range of the
|
||||
// sst file that stores data.
|
||||
const uint32_t data_start_offset_ = 0;
|
||||
@ -160,11 +122,6 @@ class PlainTableReader: public TableReader {
|
||||
const SliceTransform* prefix_extractor_;
|
||||
|
||||
static const size_t kNumInternalBytes = 8;
|
||||
static const uint32_t kSubIndexMask = 0x80000000;
|
||||
static const size_t kOffsetLen = sizeof(uint32_t);
|
||||
static const uint64_t kMaxFileSize = 1u << 31;
|
||||
static const size_t kRecordsPerGroup = 256;
|
||||
static const int kFullScanModeFlag = -1;
|
||||
|
||||
// Bloom filter is used to rule out non-existent key
|
||||
bool enable_bloom_;
|
||||
@ -184,6 +141,31 @@ class PlainTableReader: public TableReader {
|
||||
return user_key_len_ + kNumInternalBytes;
|
||||
}
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
Slice GetPrefix(const ParsedInternalKey& target) const {
|
||||
return GetPrefixFromUserKey(target.user_key);
|
||||
}
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set.
|
||||
// In that case,
|
||||
// it falls back to pure binary search and
|
||||
// total iterator seek is supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
friend class TableCache;
|
||||
friend class PlainTableIterator;
|
||||
|
||||
@ -191,33 +173,15 @@ class PlainTableReader: public TableReader {
|
||||
// the rows, which contains index records as a list.
|
||||
// If bloom_ is not null, all the keys' full-key hash will be added to the
|
||||
// bloom filter.
|
||||
Status PopulateIndexRecordList(IndexRecordList* record_list,
|
||||
int* num_prefixes, int bloom_bits_per_key,
|
||||
size_t index_sparseness);
|
||||
Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
|
||||
vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Internal helper function to allocate memory for indexes and bloom filters
|
||||
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
|
||||
double hash_table_ratio,
|
||||
size_t huge_page_tlb_size);
|
||||
// Internal helper function to allocate memory for bloom filter and fill it
|
||||
void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
|
||||
size_t huge_page_tlb_size,
|
||||
vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Internal helper function to bucket index record list to hash buckets.
|
||||
// bucket_header is a vector of size hash_table_size_, with each entry
|
||||
// containing a linklist of IndexRecord hashed to the same bucket, in reverse
|
||||
// order.
|
||||
// of offsets for the hash, in reversed order.
|
||||
// entries_per_bucket is sized of index_size_. The value is how many index
|
||||
// records are there in bucket_headers for the same bucket.
|
||||
size_t BucketizeIndexesAndFillBloom(
|
||||
IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
|
||||
std::vector<uint32_t>* entries_per_bucket);
|
||||
|
||||
// Internal helper class to fill the indexes and bloom filters to internal
|
||||
// data structures. bucket_headers and entries_per_bucket are bucketized
|
||||
// indexes and counts generated by BucketizeIndexesAndFillBloom().
|
||||
void FillIndexes(const size_t kSubIndexSize,
|
||||
const std::vector<IndexRecord*>& bucket_headers,
|
||||
const std::vector<uint32_t>& entries_per_bucket,
|
||||
size_t huge_page_tlb_size);
|
||||
void FillBloom(vector<uint32_t>* prefix_hashes);
|
||||
|
||||
// Read the key and value at `offset` to parameters for keys, the and
|
||||
// `seekable`.
|
||||
@ -237,28 +201,6 @@ class PlainTableReader: public TableReader {
|
||||
uint32_t prefix_hash, bool& prefix_matched,
|
||||
uint32_t* offset) const;
|
||||
|
||||
Slice GetUserKey(const Slice& key) const {
|
||||
return Slice(key.data(), key.size() - 8);
|
||||
}
|
||||
|
||||
Slice GetPrefix(const Slice& target) const {
|
||||
assert(target.size() >= 8); // target is internal key
|
||||
return GetPrefixFromUserKey(GetUserKey(target));
|
||||
}
|
||||
|
||||
inline Slice GetPrefix(const ParsedInternalKey& target) const;
|
||||
|
||||
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
||||
if (!IsTotalOrderMode()) {
|
||||
return prefix_extractor_->Transform(user_key);
|
||||
} else {
|
||||
// Use empty slice as prefix if prefix_extractor is not set. In that case,
|
||||
// it falls back to pure binary search and total iterator seek is
|
||||
// supported.
|
||||
return Slice();
|
||||
}
|
||||
}
|
||||
|
||||
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
||||
|
||||
// No copying allowed
|
||||
|
@ -48,6 +48,13 @@ DynamicBloom::DynamicBloom(uint32_t num_probes,
|
||||
kNumProbes(num_probes),
|
||||
hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
|
||||
|
||||
void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
|
||||
uint32_t num_blocks) {
|
||||
data_ = raw_data;
|
||||
kTotalBits = total_bits;
|
||||
kNumBlocks = num_blocks;
|
||||
}
|
||||
|
||||
void DynamicBloom::SetTotalBits(Arena* arena,
|
||||
uint32_t total_bits, uint32_t locality,
|
||||
size_t huge_page_tlb_size,
|
||||
|
@ -5,6 +5,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "rocksdb/slice.h"
|
||||
|
||||
#include <util/arena.h>
|
||||
#include <port/port_posix.h>
|
||||
|
||||
@ -57,6 +61,19 @@ class DynamicBloom {
|
||||
|
||||
void Prefetch(uint32_t h);
|
||||
|
||||
uint32_t GetNumBlocks() const { return kNumBlocks; }
|
||||
|
||||
Slice GetRawData() const {
|
||||
return Slice(reinterpret_cast<char*>(data_), GetTotalBits() / 8);
|
||||
}
|
||||
|
||||
void SetRawData(unsigned char* raw_data, uint32_t total_bits,
|
||||
uint32_t num_blocks = 0);
|
||||
|
||||
uint32_t GetTotalBits() const { return kTotalBits; }
|
||||
|
||||
bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
|
||||
|
||||
private:
|
||||
uint32_t kTotalBits;
|
||||
uint32_t kNumBlocks;
|
||||
@ -81,7 +98,7 @@ inline void DynamicBloom::Prefetch(uint32_t h) {
|
||||
}
|
||||
|
||||
inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
assert(kNumBlocks > 0 || kTotalBits > 0);
|
||||
assert(IsInitialized());
|
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
if (kNumBlocks != 0) {
|
||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||
@ -98,10 +115,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
h += delta;
|
||||
}
|
||||
} else {
|
||||
if (kTotalBits == 0) {
|
||||
// Not initialized.
|
||||
return true;
|
||||
}
|
||||
for (uint32_t i = 0; i < kNumProbes; ++i) {
|
||||
const uint32_t bitpos = h % kTotalBits;
|
||||
if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
|
||||
@ -114,7 +127,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
|
||||
}
|
||||
|
||||
inline void DynamicBloom::AddHash(uint32_t h) {
|
||||
assert(kNumBlocks > 0 || kTotalBits > 0);
|
||||
assert(IsInitialized());
|
||||
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
|
||||
if (kNumBlocks != 0) {
|
||||
uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
|
||||
|
@ -17,4 +17,7 @@ namespace rocksdb {
|
||||
|
||||
extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
|
||||
|
||||
inline uint32_t GetSliceHash(const Slice& s) {
|
||||
return Hash(s.data(), s.size(), 397);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user