Disable warning as error

[FB Internal] Point to the latest tool chain.
One extra include<functional>
2019-10-31 15:17:49 -07:00 · 2019-10-31 15:17:49 -07:00 · 2019-10-31 15:16:56 -07:00 · 2019-10-31 15:16:56 -07:00 · 2019-10-31 15:16:50 -07:00 · 2019-10-31 15:16:33 -07:00
47 changed files with 1136 additions and 337 deletions
--- a/3
+++ b/3
@ -44,7 +44,6 @@ else
 	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
 endif

-WARNING_FLAGS = -Wall -Werror -Wno-sign-compare
 CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
 CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual

@ -170,7 +169,7 @@ dbg: $(LIBRARY) $(PROGRAMS)
 # Will also generate shared libraries.
 release:
 	$(MAKE) clean
-	OPT="-DNDEBUG -O2" $(MAKE) all -j32
+	OPT="-DNDEBUG -O2" $(MAKE) $(LIBRARY) db_bench sst_dump ldb -j32

 coverage:
 	$(MAKE) clean
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@ -47,13 +47,7 @@ COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
 if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
    FBCODE_BUILD="true"
    if [ -z "$USE_CLANG" ]; then
-        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
-          $(rpm -q --whatprovides redhat-release)`
-        if [ "$CENTOS_VERSION" = "6" ]; then
-          source $PWD/build_tools/fbcode.gcc481.sh
-        else
-          source $PWD/build_tools/fbcode.gcc471.sh
-        fi
+        source $PWD/build_tools/fbcode_config.sh
    else
        source $PWD/build_tools/fbcode.clang31.sh
    fi
@ -228,7 +222,7 @@ EOF
      int main() {}
 EOF
    if [ "$?" = 0 ]; then
-        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags"
        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
    fi

--- a/build_tools/dependencies.sh
+++ b/build_tools/dependencies.sh
@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/7.x/centos7-native/b2ef2b6
+CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/963d9aeda70cc4779885b1277484fe7544a04e3e/9.0.0/platform007/9e92d53/
+LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/7.x/platform007/5620abc
+GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.26/platform007/f259413
+SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d
+ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/platform007/ca4da3d
+BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d
+LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/platform007/ca4da3d
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/platform007/15a3614
+GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d
+JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/platform007/c26c002
+NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/platform007/ca4da3d
+LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/platform007/6f3e0a9
+TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/platform007/ca4da3d
+KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/fb/platform007/da39a3e
+BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/ab9f09bba370e7066cafd4eb59752db93f2e8312/2.29.1/platform007/15a3614
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/platform007/ca4da3d
+LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832
--- a/build_tools/fbcode.gcc481.sh
+++ b/build_tools/fbcode.gcc481.sh
@ -4,74 +4,82 @@
 # fbcode settings.  It uses the latest g++ compiler and also
 # uses jemalloc

-TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc
-CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)`
-if [ "$CENTOS_VERSION" = "6" ]; then
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native"
-else
-  TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
-fi
-TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17"
+GCC_BASE=/mnt/gvfs/third-party2/gcc/8219ec1bcedf8ad9da05e121e193364de2cc4f61/5.x/centos6-native/c447969
+CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/64d8d58e3d84f8bde7a029763d4f5baf39d0d5b9/stable/centos6-native/6aaf4de
+LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/ba9be983c81de7299b59fe71950c664a84dcb5f8/5.x/gcc-5-glibc-2.23/339d858
+GLIBC_BASE=/mnt/gvfs/third-party2/glibc/f20197cf3d4bd50339c9777aaa0b2ccadad9e2cb/2.23/gcc-5-glibc-2.23/ca1d1c0
+SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/6427ce8c7496e4ab06c2da81543b94c0de8be3d0/1.1.3/gcc-5-glibc-2.23/9bc6787
+ZLIB_BASE=/mnt/gvfs/third-party2/zlib/8f1e8b867d26efef93eac2fabbdb2e1d512665d7/1.2.8/gcc-5-glibc-2.23/9bc6787
+BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/70471c0571559fe0af7db6d7e8860b93a7eadfe1/1.0.6/gcc-5-glibc-2.23/9bc6787
+LZ4_BASE=/mnt/gvfs/third-party2/lz4/453c89d6f0e68cdf1c151c769197fabedad9cac8/r131/gcc-5-glibc-2.23/9bc6787
+ZSTD_BASE=/mnt/gvfs/third-party2/zstd/00a40fa5f8bd2cd0622f2e868552793aef37ccf4/1.3.0/gcc-5-glibc-2.23/03859b5
+GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/47eef08f9acb77de982fbda6047c26d330739538/2.2.0/gcc-5-glibc-2.23/9bc6787
+JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/4414ddc78df8008b35cc4adac23590ad29148584/master/gcc-5-glibc-2.23/d506c82
+NUMA_BASE=/mnt/gvfs/third-party2/numa/9d7ae2693d05d62f9a579cb21e6b717cf257a75d/2.0.11/gcc-5-glibc-2.23/9bc6787
+LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/2b2dd58e3a52ccf2c1d827def59e5f740de0ad15/1.2/gcc-5-glibc-2.23/b443de1
+TBB_BASE=/mnt/gvfs/third-party2/tbb/379addf7ab2468a2b4293b47456cfcd1c9cb318d/4.3/gcc-5-glibc-2.23/9bc6787
+KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/3f68f5fe65a85b7c2d3e66852268fbd1efdb3151/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e
+BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/b9fab0aec99d9c36408e810b2677e91c12807afd/2.28/centos6-native/da39a3e
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/423431d61786b20bcc3bde8972901130cb29e6b3/3.11.0/gcc-5-glibc-2.23/9bc6787
+LUA_BASE=/mnt/gvfs/third-party2/lua/3b0bb3bd9a0f690a069c479fcc0f7424fc7456d2/5.2.3/gcc-5-glibc-2.23/65372bd

-# location of libhdfs libraries
-if test "$USE_HDFS"; then
-  JAVA_HOME="/usr/local/jdk-6u22-64"
-  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
-  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17"
-  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
-  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
-  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
-fi

 # location of libgcc
-LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include"
-LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs"
+LIBGCC_INCLUDE=" -I $LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"

 # location of glibc
-GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include"
-GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib"
+GLIBC_INCLUDE=" -I $GLIBC_BASE/include"
+GLIBC_LIBS=" -L $GLIBC_BASE/lib"

 # location of snappy headers and libraries
-SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include"
-SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a"
+SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
+SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"

 # location of zlib headers and libraries
-ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include"
-ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a"
+ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
+ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
+

 # location of bzip headers and libraries
-BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include"
-BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a"
+BZIP_INCLUDE=" -I $BZIP2_BASE/include"
+BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a"
+
+LZ4_INCLUDE=" -I $LZ4_BASE/include/"
+LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
+

 # location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include"
-GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
+GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
+GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"

 # location of jemalloc
-JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
-JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
+JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
+JEMALLOC_LIB=" -Wl,--whole-archive  $JEMALLOC_BASE/lib/libjemalloc.a"

 # use Intel SSE support for checksum calculations
 export USE_SSE=" -msse -msse4.2 "

-CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE"
-AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
-RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+CC="$GCC_BASE/bin/gcc"
+CXX="$GCC_BASE/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
+AR="$BINUTILS_BASE/bin/ar"
+RANLIB="$BINUTILS_BASE/bin/ranlib"

-CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
+CFLAGS="-B$BINUTILS_BASE/bin/gold -m64 -mtune=generic"  # this file defines BINUTILS_BASE, not BINUTILS
 CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
 CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
 CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"

-EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
-EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
+EXEC_LDFLAGS=" -B$BINUTILS_BASE/bin/gold"  # plain '=': the first assignment must not inherit a stale value
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-5-glibc-2.23/lib/ld.so"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND_BASE/lib/libunwind.a"  # closes JEMALLOC_LIB's --whole-archive
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-5-glibc-2.23/lib"
 EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"

 PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "

 EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS"

-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/"
+VALGRIND_VER="$VALGRIND_BASE/bin/"

 export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+#
+# Set environment variables so that we can compile rocksdb using fbcode
+# settings: the latest g++ (or clang when USE_CLANG is set) and jemalloc.
+# Meant to be sourced from bash (uses BASH_SOURCE, source and +=).
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if set to any non-empty value, only the pic versions of libraries are taken from fbcode. libraries that don't have a pic variant will not be included
+
+
+BASEDIR=$(dirname "$BASH_SOURCE")
+source "$BASEDIR/dependencies.sh"
+
+CFLAGS=""
+
+# libgcc
+LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/7.3.0"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
+
+# glibc
+GLIBC_INCLUDE="$GLIBC_BASE/include"
+GLIBC_LIBS=" -L $GLIBC_BASE/lib"
+
+# snappy
+SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
+if test -z $PIC_BUILD; then
+  SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"
+else
+  SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a"
+fi
+CFLAGS+=" -DSNAPPY"
+
+if test -z $PIC_BUILD; then
+  # location of zlib headers and libraries
+  ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
+  ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
+  CFLAGS+=" -DZLIB"
+
+  LZ4_INCLUDE=" -I $LZ4_BASE/include/"
+  LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
+  CFLAGS+=" -DLZ4"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
+if test -z $PIC_BUILD; then
+  GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"
+else
+  GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
+fi
+CFLAGS+=" -DGFLAGS=gflags"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
+JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a"
+
+if test -z $PIC_BUILD; then
+  # location of numa
+  NUMA_INCLUDE=" -I $NUMA_BASE/include/"
+  NUMA_LIB=" $NUMA_BASE/lib/libnuma.a"
+  CFLAGS+=" -DNUMA"
+
+  # location of libunwind
+  LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a"
+fi
+
+# location of TBB
+TBB_INCLUDE=" -isystem $TBB_BASE/include/"
+if test -z $PIC_BUILD; then
+  TBB_LIBS="$TBB_BASE/lib/libtbb.a"
+else
+  TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a"
+fi
+CFLAGS+=" -DTBB"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+BINUTILS="$BINUTILS_BASE/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
+
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BIN="$CLANG_BASE/bin"
+CLANG_LIB="$CLANG_BASE/lib"
+CLANG_SRC="$CLANG_BASE/../../src"
+
+CLANG_ANALYZER="$CLANG_BIN/clang++"
+CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+
+  CFLAGS+=" -B$BINUTILS/gold"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  JEMALLOC=1
+else
+  # clang
+  CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
+  CC="$CLANG_BIN/clang"
+  CXX="$CLANG_BIN/clang++"
+
+  KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
+
+  CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CFLAGS+=" -Wno-expansion-to-defined "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
+EXEC_LDFLAGS+=" -B$BINUTILS/gold"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform007/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform007/lib"
+# required by libtbb
+EXEC_LDFLAGS+=" -ldl"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS"
+
+VALGRIND_VER="$VALGRIND_BASE/bin/"
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD 
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -378,7 +378,7 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
 }

 static const bool FLAGS_compression_level_dummy __attribute__((unused)) =
-    google::RegisterFlagValidator(&FLAGS_compression_level,
+    GFLAGS::RegisterFlagValidator(&FLAGS_compression_level,
                                  &ValidateCompressionLevel);

 DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
@ -535,32 +535,32 @@ DEFINE_string(merge_operator, "", "The merge operator to use with the database."
              " utilities/merge_operators.h");

 static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_soft_rate_limit,
+  GFLAGS::RegisterFlagValidator(&FLAGS_soft_rate_limit,
                                &ValidateRateLimit);

 static const bool FLAGS_hard_rate_limit_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);
+  GFLAGS::RegisterFlagValidator(&FLAGS_hard_rate_limit, &ValidateRateLimit);

 static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+  GFLAGS::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

 static const bool FLAGS_key_size_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);
+  GFLAGS::RegisterFlagValidator(&FLAGS_key_size, &ValidateKeySize);

 static const bool FLAGS_cache_numshardbits_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_cache_numshardbits,
+  GFLAGS::RegisterFlagValidator(&FLAGS_cache_numshardbits,
                                &ValidateCacheNumshardbits);

 static const bool FLAGS_readwritepercent_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_readwritepercent,
+  GFLAGS::RegisterFlagValidator(&FLAGS_readwritepercent,
                                &ValidateInt32Percent);

 static const bool FLAGS_deletepercent_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_deletepercent,
+  GFLAGS::RegisterFlagValidator(&FLAGS_deletepercent,
                                &ValidateInt32Percent);
 static const bool
  FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) =
-  google::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
+  GFLAGS::RegisterFlagValidator(&FLAGS_table_cache_numshardbits,
                                &ValidateTableCacheNumshardbits);

 namespace rocksdb {
@ -2775,9 +2775,9 @@ class Benchmark {

 int main(int argc, char** argv) {
  rocksdb::InstallStackTraceHandler();
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+  GFLAGS::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style;
  if (FLAGS_statistics) {
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -2540,9 +2540,9 @@ Status DBImpl::ProcessKeyValueCompaction(
  Status status;
  std::string compaction_filter_value;
  ParsedInternalKey ikey;
-  std::string current_user_key;
+  IterKey current_user_key;
  bool has_current_user_key = false;
-  std::vector<char> delete_key;   // for compaction filter
+  IterKey delete_key;
  SequenceNumber last_sequence_for_key __attribute__((unused)) =
    kMaxSequenceNumber;
  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
@ -2612,16 +2612,16 @@ Status DBImpl::ProcessKeyValueCompaction(
      // Do not hide error keys
      // TODO: error key stays in db forever? Figure out the intention/rationale
      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
-      current_user_key.clear();
+      current_user_key.Clear();
      has_current_user_key = false;
      last_sequence_for_key = kMaxSequenceNumber;
      visible_in_snapshot = kMaxSequenceNumber;
    } else {
      if (!has_current_user_key ||
          user_comparator()->Compare(ikey.user_key,
-            Slice(current_user_key)) != 0) {
+            current_user_key.GetKey()) != 0) {
        // First occurrence of this user key
-        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+        current_user_key.SetUserKey(ikey.user_key);
        has_current_user_key = true;
        last_sequence_for_key = kMaxSequenceNumber;
        visible_in_snapshot = kMaxSequenceNumber;
@ -2642,13 +2642,11 @@ Status DBImpl::ProcessKeyValueCompaction(
                &compaction_filter_value,
                &value_changed);
          if (to_delete) {
-            // make a copy of the original key
-            delete_key.assign(key.data(), key.data() + key.size());
-            // convert it to a delete
-            UpdateInternalKey(&delete_key[0], delete_key.size(),
-                ikey.sequence, kTypeDeletion);
+            // make a copy of the original key and convert it to a delete
+            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
+                                      kTypeDeletion);
            // anchor the key again
-            key = Slice(&delete_key[0], delete_key.size());
+            key = delete_key.GetKey();
            // needed because ikey is backed by key
            ParseInternalKey(key, &ikey);
            // no value associated with delete
@ -3455,7 +3453,7 @@ Status DBImpl::GetImpl(const ReadOptions& options,
    StartPerfTimer(&from_files_timer);

    sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
-                     options_, value_found);
+                     value_found);
    have_stat_update = true;
    BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
    RecordTick(options_.statistics.get(), MEMTABLE_MISS);
@ -3559,7 +3557,7 @@ std::vector<Status> DBImpl::MultiGet(const ReadOptions& options,
      // Done
    } else {
      get_version->current->Get(options, lkey, value, &s, &merge_context,
-                                &stats, options_);
+                                &stats);
      have_stat_update = true;
    }

--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@ -64,7 +64,7 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
  } else {
    Version::GetStats stats;
    super_version->current->Get(options, lkey, value, &s, &merge_context,
-                                &stats, options_);
+                                &stats);
  }
  return s;
 }
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@ -39,71 +39,6 @@ static void DumpInternalIter(Iterator* iter) {

 namespace {

-class IterLookupKey {
- public:
-  IterLookupKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
-
-  ~IterLookupKey() { Clear(); }
-
-  Slice GetKey() const {
-    if (key_ != nullptr) {
-      return Slice(key_, key_size_);
-    } else {
-      return Slice();
-    }
-  }
-
-  bool Valid() const { return key_ != nullptr; }
-
-  void Clear() {
-    if (key_ != nullptr && key_ != space_) {
-      delete[] key_;
-    }
-    key_ = space_;
-    buf_size_ = sizeof(buf_size_);
-  }
-
-  // Enlarge the buffer size if needed based on key_size.
-  // By default, static allocated buffer is used. Once there is a key
-  // larger than the static allocated buffer, another buffer is dynamically
-  // allocated, until a larger key buffer is requested. In that case, we
-  // reallocate buffer and delete the old one.
-  void EnlargeBufferIfNeeded(size_t key_size) {
-    // If size is smaller than buffer size, continue using current buffer,
-    // or the static allocated one, as default
-    if (key_size > buf_size_) {
-      // Need to enlarge the buffer.
-      Clear();
-      key_ = new char[key_size];
-      buf_size_ = key_size;
-    }
-    key_size_ = key_size;
-  }
-
-  void SetUserKey(const Slice& user_key) {
-    size_t size = user_key.size();
-    EnlargeBufferIfNeeded(size);
-    memcpy(key_, user_key.data(), size);
-  }
-
-  void SetInternalKey(const Slice& user_key, SequenceNumber s) {
-    size_t usize = user_key.size();
-    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
-    memcpy(key_, user_key.data(), usize);
-    EncodeFixed64(key_ + usize, PackSequenceAndType(s, kValueTypeForSeek));
-  }
-
- private:
-  char* key_;
-  size_t buf_size_;
-  size_t key_size_;
-  char space_[32];  // Avoid allocation for short keys
-
-  // No copying allowed
-  IterLookupKey(const IterLookupKey&) = delete;
-  void operator=(const LookupKey&) = delete;
-};
-
 // Memtables and sstables that make the DB representation contain
 // (userkey,seq,type) => uservalue entries.  DBIter
 // combines multiple entries for the same userkey found in the DB
@ -191,7 +126,7 @@ class DBIter: public Iterator {
  SequenceNumber const sequence_;

  Status status_;
-  IterLookupKey saved_key_;   // == current key when direction_==kReverse
+  IterKey saved_key_;   // == current key when direction_==kReverse
  std::string saved_value_;   // == current raw value when direction_==kReverse
  std::string skip_key_;
  Direction direction_;
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -266,6 +266,9 @@ class DBTest {
  // Sequence of option configurations to try
  enum OptionConfig {
    kDefault,
+    kBlockBasedTableWithPrefixHashIndex,
+    // TODO(kailiu) figure this out
+    // kBlockBasedTableWithWholeKeyHashIndex,
    kPlainTableFirstBytePrefix,
    kPlainTableAllBytesPrefix,
    kVectorRep,
@ -302,7 +305,8 @@ class DBTest {
    kSkipDeletesFilterFirst = 1,
    kSkipUniversalCompaction = 2,
    kSkipMergePut = 4,
-    kSkipPlainTable = 8
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16
  };

  DBTest() : option_config_(kDefault),
@ -343,6 +347,11 @@ class DBTest {
              || option_config_ == kPlainTableFirstBytePrefix)) {
        continue;
      }
+      if ((skip_mask & kSkipHashIndex) &&
+          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+        continue;  // caller asked to skip the hash-index table config
+      }
+
      break;
    }

@ -441,6 +450,21 @@ class DBTest {
      case kInfiniteMaxOpenFiles:
        options.max_open_files = -1;
        break;
+      case kBlockBasedTableWithPrefixHashIndex: {
+        BlockBasedTableOptions table_options;
+        table_options.index_type = BlockBasedTableOptions::kHashSearch;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+        break;
+      }
+      // TODO(kailiu) figure out why it's failing and fix
+      // case kBlockBasedTableWithWholeKeyHashIndex: {
+      //   BlockBasedTableOptions table_options;
+      //   table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      //   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      //   options.prefix_extractor.reset(NewNoopTransform());
+      //   break;
+      // }
      default:
        break;
    }
@ -885,13 +909,29 @@ TEST(DBTest, Empty) {
    options.write_buffer_size = 100000;  // Small write buffer
    Reopen(&options);

+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
+
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);

    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
    Put("k1", std::string(100000, 'x'));             // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
+
    Put("k2", std::string(100000, 'y'));             // Trigger compaction
    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
+
    env_->delay_sstable_sync_.Release_Store(nullptr);   // Release sync calls
  } while (ChangeOptions());
 }
@ -1251,7 +1291,7 @@ TEST(DBTest, KeyMayExist) {

    // KeyMayExist function only checks data in block caches, which is not used
    // by plain table format.
-  } while (ChangeOptions(kSkipPlainTable));
+  } while (ChangeOptions(kSkipPlainTable | kSkipHashIndex));
 }

 TEST(DBTest, NonBlockingIteration) {
@ -2064,6 +2104,9 @@ TEST(DBTest, NumImmutableMemTable) {
    ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
    perf_context.Reset();
    Get("k1");
    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
@ -2071,6 +2114,13 @@ TEST(DBTest, NumImmutableMemTable) {
    ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
    perf_context.Reset();
    Get("k1");
    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
@ -2083,6 +2133,12 @@ TEST(DBTest, NumImmutableMemTable) {
                                      &num));
    ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+       "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
    perf_context.Reset();
    Get("k2");
    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
@ -5882,7 +5938,8 @@ TEST(DBTest, Randomized) {
      int minimum = 0;
      if (option_config_ == kHashSkipList ||
          option_config_ == kHashLinkList ||
-          option_config_ == kPlainTableFirstBytePrefix) {
+          option_config_ == kPlainTableFirstBytePrefix ||
+          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
        minimum = 1;
      }
      if (p < 45) {                               // Put
@ -5924,6 +5981,7 @@ TEST(DBTest, Randomized) {
      if ((step % 100) == 0) {
        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+
        // Save a snapshot from each DB this time that we'll use next
        // time we compare things, to make sure the current state is
        // preserved with the snapshot
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -13,6 +13,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "rocksdb/types.h"
 #include "util/coding.h"
@ -235,4 +236,100 @@ inline LookupKey::~LookupKey() {
  if (start_ != space_) delete[] start_;
 }

+class IterKey {
+ public:
+  IterKey() : key_(space_), buf_size_(sizeof(space_)), key_size_(0) {}
+
+  ~IterKey() { ResetBuffer(); }  // frees the heap buffer, if any
+
+  Slice GetKey() const { return Slice(key_, key_size_); }
+
+  void Clear() { key_size_ = 0; }  // length only; the buffer is kept
+
+  void SetUserKey(const Slice& user_key) {
+    size_t size = user_key.size();
+    EnlargeBufferIfNeeded(size);
+    memcpy(key_, user_key.data(), size);
+    key_size_ = size;
+  }
+
+  void SetInternalKey(const Slice& user_key, SequenceNumber s,
+                      ValueType value_type = kValueTypeForSeek) {
+    size_t usize = user_key.size();
+    EnlargeBufferIfNeeded(usize + sizeof(uint64_t));
+    memcpy(key_, user_key.data(), usize);
+    EncodeFixed64(key_ + usize, PackSequenceAndType(s, value_type));
+    key_size_ = usize + sizeof(uint64_t);  // user key + 8-byte seq/type
+  }
+
+  void SetInternalKey(const ParsedInternalKey& parsed_key) {
+    SetInternalKey(parsed_key.user_key, parsed_key.sequence, parsed_key.type);
+  }
+
+ private:
+  char* key_;
+  size_t buf_size_;
+  size_t key_size_;
+  char space_[32];  // Avoid allocation for short keys
+
+  void ResetBuffer() {  // back to the inline space_ buffer, empty key
+    if (key_ != nullptr && key_ != space_) {
+      delete[] key_;
+    }
+    key_ = space_;
+    buf_size_ = sizeof(space_);
+    key_size_ = 0;
+  }
+
+  // Makes sure the buffer can hold key_size bytes. The inline space_ array
+  // is used until a key exceeds it; after that a heap buffer is allocated
+  // and reused until an even larger key arrives, at which point it is
+  // deleted and replaced. Existing bytes are NOT copied to the new buffer,
+  // and key_size_ is left for the caller to set after writing the key.
+  void EnlargeBufferIfNeeded(size_t key_size) {
+    // If size is smaller than buffer size, continue using current buffer,
+    // or the static allocated one, as default
+    if (key_size > buf_size_) {
+      // Need to enlarge the buffer.
+      ResetBuffer();
+      key_ = new char[key_size];
+      buf_size_ = key_size;
+    }
+  }
+
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
+};
+
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+  explicit InternalKeySliceTransform(const SliceTransform* transform)
+      : transform_(transform) {}
+
+  virtual const char* Name() const { return transform_->Name(); }
+
+  virtual Slice Transform(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->Transform(user_key);
+  }
+
+  virtual bool InDomain(const Slice& src) const {
+    auto user_key = ExtractUserKey(src);
+    return transform_->InDomain(user_key);
+  }
+
+  virtual bool InRange(const Slice& dst) const {
+    auto user_key = ExtractUserKey(dst);
+    return transform_->InRange(user_key);
+  }
+
+  const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like comparator, InternalKeySliceTransform will not take care of the
+  // deletion of transform_
+  const SliceTransform* const transform_;
+};
+
 }  // namespace rocksdb
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@ -38,6 +38,10 @@ DBPropertyType GetPropertyType(const Slice& property) {
    return kBackgroundErrors;
  } else if (in == "cur-size-active-mem-table") {
    return kCurSizeActiveMemTable;
+  } else if (in == "num-entries-active-mem-table") {
+    return kNumEntriesInMutableMemtable;
+  } else if (in == "num-entries-imm-mem-tables") {
+    return kNumEntriesInImmutableMemtable;
  }
  return kUnknown;
 }
@ -47,7 +51,7 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
                                DBImpl* db) {
  VersionSet* version_set = db->versions_.get();
  Version* current = version_set->current();
-  const MemTableList& imm = db->imm_;
+  MemTableList& imm = db->imm_;
  Slice in = property;

  switch (property_type) {
@ -353,6 +357,14 @@ bool InternalStats::GetProperty(DBPropertyType property_type,
      // Current size of the active memtable
      *value = std::to_string(db->mem_->ApproximateMemoryUsage());
      return true;
+    case kNumEntriesInMutableMemtable:
+      // Number of entries in the active (mutable) memtable
+      *value = std::to_string(db->mem_->GetNumEntries());
+      return true;
+    case kNumEntriesInImmutableMemtable:
+      // Total number of entries across all immutable memtables
+      *value = std::to_string(imm.current()->GetTotalNumEntries());
+      return true;
    default:
      return false;
  }
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@ -28,11 +28,14 @@ enum DBPropertyType {
  kSsTables,         // Return a human readable string of current SST files
  kNumImmutableMemTable,   // Return number of immutable mem tables
  kMemtableFlushPending,   // Return 1 if mem table flushing is pending,
-                          // otherwise
-                          // 0.
+                           // otherwise 0.
  kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
  kBackgroundErrors,       // Return accumulated background errors encountered.
  kCurSizeActiveMemTable,  // Return current size of the active memtable
+  kNumEntriesInMutableMemtable,    // Return number of entries in the mutable
+                                   // memtable.
+  kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
+                                   // the immutable mem tables.
  kUnknown,
 };

--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -37,6 +37,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
      arena_(options.arena_block_size),
      table_(options.memtable_factory->CreateMemTableRep(
          comparator_, &arena_, options.prefix_extractor.get())),
+      num_entries_(0),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
@ -260,6 +261,7 @@ void MemTable::Add(SequenceNumber s, ValueType type,
  memcpy(p, value.data(), val_size);
  assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
  table_->Insert(handle);
+  num_entries_++;

  if (prefix_bloom_) {
    assert(prefix_extractor_);
--- a/db/memtable.h
+++ b/db/memtable.h
@ -132,6 +132,9 @@ class MemTable {
  // key in the memtable.
  size_t CountSuccessiveMergeEntries(const LookupKey& key);

+  // Get total number of entries in the mem table.
+  uint64_t GetNumEntries() const { return num_entries_; }
+
  // Returns the edits area that is needed for flushing the memtable
  VersionEdit* GetEdits() { return &edit_; }

@ -182,6 +185,8 @@ class MemTable {
  Arena arena_;
  unique_ptr<MemTableRep> table_;

+  uint64_t num_entries_;
+
  // These are used to manage memtable flushes to storage
  bool flush_in_progress_; // started the flush
  bool flush_completed_;   // finished the flush
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@ -77,6 +77,14 @@ void MemTableListVersion::AddIterators(const ReadOptions& options,
  }
 }

+// Returns the combined GetNumEntries() of every memtable in memlist_;
+// yields 0 when the list is empty.
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+  uint64_t entry_count = 0;
+  for (const auto& mem : memlist_) entry_count += mem->GetNumEntries();
+  return entry_count;
+}
+
 // caller is responsible for referencing m
 void MemTableListVersion::Add(MemTable* m) {
  assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@ -43,6 +43,8 @@ class MemTableListVersion {
  void AddIterators(const ReadOptions& options,
                    std::vector<Iterator*>* iterator_list);

+  uint64_t GetTotalNumEntries() const;
+
 private:
  // REQUIRE: m is mutable memtable
  void Add(MemTable* m);
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@ -190,7 +190,7 @@ class TestPlainTableReader : public PlainTableReader {
                         file_size, bloom_bits_per_key, hash_table_ratio,
                         index_sparseness, table_properties),
        expect_bloom_not_match_(expect_bloom_not_match) {
-    Status s = PopulateIndex();
+    Status s = PopulateIndex(const_cast<TableProperties*>(table_properties));
    ASSERT_TRUE(s.ok());
  }

@ -265,6 +265,19 @@ TEST(PlainTableDBTest, Flush) {
      ASSERT_OK(Put("0000000000000bar", "v2"));
      ASSERT_OK(Put("1000000000000foo", "v3"));
      dbfull()->TEST_FlushMemTable();
+
+      TablePropertiesCollection ptc;
+      reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc);
+      ASSERT_EQ(1U, ptc.size());
+      auto row = ptc.begin();
+      auto tp = row->second;
+      ASSERT_EQ(
+          total_order ? "4" : "12",
+          (tp->user_collected_properties).at("plain_table_hash_table_size"));
+      ASSERT_EQ(
+          total_order ? "9" : "0",
+          (tp->user_collected_properties).at("plain_table_sub_index_size"));
+
      ASSERT_EQ("v3", Get("1000000000000foo"));
      ASSERT_EQ("v2", Get("0000000000000bar"));
    }
@ -429,6 +442,48 @@ TEST(PlainTableDBTest, Iterator) {
  }
 }

+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);  // `length` copies of the character `c`
+}
+
+TEST(PlainTableDBTest, IteratorLargeKeys) {
+  Options options = CurrentOptions();
+  options.table_factory.reset(NewTotalOrderPlainTableFactory(0, 0, 16));
+  options.create_if_missing = true;
+  options.prefix_extractor.reset();
+  DestroyAndReopen(&options);
+
+  // Keys of varying lengths (16 to 90 bytes). The test expects the iterator
+  // to return them in exactly this array order.
+  std::string key_list[] = {
+      MakeLongKey(30, '0'),
+      MakeLongKey(16, '1'),
+      MakeLongKey(32, '2'),
+      MakeLongKey(60, '3'),
+      MakeLongKey(90, '4'),
+      MakeLongKey(50, '5'),
+      MakeLongKey(26, '6')
+  };
+  const size_t num_keys = sizeof(key_list) / sizeof(key_list[0]);
+
+  for (size_t i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(key_list[i], std::to_string(i)));
+  }
+  dbfull()->TEST_FlushMemTable();
+
+  Iterator* iter = dbfull()->NewIterator(ro_);
+  iter->Seek(key_list[0]);
+  for (size_t i = 0; i < num_keys; i++) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(key_list[i], iter->key().ToString());
+    ASSERT_EQ(std::to_string(i), iter->value().ToString());
+    iter->Next();
+  }
+  // Every key has been visited, so the iterator must now be exhausted.
+  ASSERT_TRUE(!iter->Valid());
+  delete iter;
+}
+
 // A test comparator which compare two strings in this way:
 // (1) first compare prefix of 8 bytes in alphabet order,
 // (2) if two strings share the same prefix, sort the other part of the string
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@ -620,7 +620,7 @@ TEST(PrefixTest, PrefixHash) {
 }

 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
  std::cout << kDbName << "\n";

  rocksdb::test::RunAllTests();
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -111,19 +111,20 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  if (table_reader_ptr != nullptr) {
    *table_reader_ptr = nullptr;
  }
-  Cache::Handle* handle = file_meta.table_reader_handle;
+  TableReader* table_reader = file_meta.table_reader;
+  Cache::Handle* handle = nullptr;
  Status s;
-  if (!handle) {
+  if (table_reader == nullptr) {
    s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size,
                  &handle, nullptr, options.read_tier == kBlockCacheTier);
-  }
    if (!s.ok()) {
      return NewErrorIterator(s);
    }
+    table_reader = GetTableReaderFromHandle(handle);
+  }

-  TableReader* table_reader = GetTableReaderFromHandle(handle);
  Iterator* result = table_reader->NewIterator(options);
-  if (!file_meta.table_reader_handle) {
+  if (handle != nullptr) {
    result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
  }
  if (table_reader_ptr != nullptr) {
@ -143,17 +144,20 @@ Status TableCache::Get(const ReadOptions& options,
                       bool (*saver)(void*, const ParsedInternalKey&,
                                     const Slice&, bool),
                       bool* table_io, void (*mark_key_may_exist)(void*)) {
-  Cache::Handle* handle = file_meta.table_reader_handle;
+  TableReader* t = file_meta.table_reader;
  Status s;
-  if (!handle) {
+  Cache::Handle* handle = nullptr;
+  if (!t) {
    s = FindTable(storage_options_, internal_comparator, file_meta.number,
                  file_meta.file_size, &handle, table_io,
                  options.read_tier == kBlockCacheTier);
+    if (s.ok()) {
+      t = GetTableReaderFromHandle(handle);
+    }
  }
  if (s.ok()) {
-    TableReader* t = GetTableReaderFromHandle(handle);
    s = t->Get(options, k, arg, saver, mark_key_may_exist);
-    if (!file_meta.table_reader_handle) {
+    if (handle != nullptr) {
      ReleaseHandle(handle);
    }
  } else if (options.read_tier && s.IsIncomplete()) {
@ -169,15 +173,16 @@ Status TableCache::GetTableProperties(
    const FileMetaData& file_meta,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status s;
-  auto table_handle = file_meta.table_reader_handle;
+  auto table_reader = file_meta.table_reader;
  // table already been pre-loaded?
-  if (table_handle) {
-    auto table = GetTableReaderFromHandle(table_handle);
-    *properties = table->GetTableProperties();
+  if (table_reader) {
+    *properties = table_reader->GetTableProperties();
+
    return s;
  }

  bool table_io;
+  Cache::Handle* table_handle = nullptr;
  s = FindTable(toptions, internal_comparator, file_meta.number,
                file_meta.file_size, &table_handle, &table_io, no_io);
  if (!s.ok()) {
@ -195,20 +200,21 @@ bool TableCache::PrefixMayMatch(const ReadOptions& options,
                                const FileMetaData& file_meta,
                                const Slice& internal_prefix, bool* table_io) {
  bool may_match = true;
-  auto table_handle = file_meta.table_reader_handle;
-  if (table_handle == nullptr) {
+  auto table_reader = file_meta.table_reader;
+  Cache::Handle* table_handle = nullptr;
+  if (table_reader == nullptr) {
    // Need to get table handle from file number
    Status s = FindTable(storage_options_, icomparator, file_meta.number,
                         file_meta.file_size, &table_handle, table_io);
    if (!s.ok()) {
      return may_match;
    }
+    table_reader = GetTableReaderFromHandle(table_handle);
  }

-  auto table = GetTableReaderFromHandle(table_handle);
-  may_match = table->PrefixMayMatch(internal_prefix);
+  may_match = table_reader->PrefixMayMatch(internal_prefix);

-  if (file_meta.table_reader_handle == nullptr) {
+  if (table_handle != nullptr) {
    // Need to release handle if it is generated from here.
    ReleaseHandle(table_handle);
  }
--- a/db/version_edit.h
+++ b/db/version_edit.h
@ -31,10 +31,13 @@ struct FileMetaData {

  // Needs to be disposed when refs becomes 0.
  Cache::Handle* table_reader_handle;
+  // Table reader in table_reader_handle
+  TableReader* table_reader;

  FileMetaData(uint64_t number, uint64_t file_size) :
      refs(0), allowed_seeks(1 << 30), number(number), file_size(file_size),
-      being_compacted(false), table_reader_handle(nullptr) {
+      being_compacted(false), table_reader_handle(nullptr),
+      table_reader(nullptr) {
  }
  FileMetaData() : FileMetaData(0, 0) { }
 };
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -148,7 +148,7 @@ namespace {
 struct EncodedFileMetaData {
  uint64_t number;   // file number
  uint64_t file_size;   // file size
-  Cache::Handle* table_reader_handle;   // cached table reader's handler
+  TableReader* table_reader;   // cached table reader
 };
 }  // namespace

@ -196,7 +196,7 @@ class Version::LevelFileNumIterator : public Iterator {
    auto* file_meta = (*flist_)[index_];
    current_value_.number = file_meta->number;
    current_value_.file_size = file_meta->file_size;
-    current_value_.table_reader_handle = file_meta->table_reader_handle;
+    current_value_.table_reader = file_meta->table_reader;
    return Slice(reinterpret_cast<const char*>(&current_value_),
                 sizeof(EncodedFileMetaData));
  }
@ -228,7 +228,7 @@ static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
    const EncodedFileMetaData* encoded_meta =
        reinterpret_cast<const EncodedFileMetaData*>(file_value.data());
    FileMetaData meta(encoded_meta->number, encoded_meta->file_size);
-    meta.table_reader_handle = encoded_meta->table_reader_handle;
+    meta.table_reader = encoded_meta->table_reader;
    return cache->NewIterator(
        options.prefix ? options_copy : options, soptions, icomparator, meta,
        nullptr /* don't need reference to table*/, for_compaction);
@ -254,7 +254,7 @@ bool Version::PrefixMayMatch(const ReadOptions& options,
        reinterpret_cast<const EncodedFileMetaData*>(
            level_iter->value().data());
    FileMetaData meta(encoded_meta->number, encoded_meta->file_size);
-    meta.table_reader_handle = encoded_meta->table_reader_handle;
+    meta.table_reader = encoded_meta->table_reader;
    may_match = vset_->table_cache_->PrefixMayMatch(options, vset_->icmp_, meta,
                                                    internal_prefix, nullptr);
  }
@ -478,6 +478,12 @@ bool BySmallestKey(FileMetaData* a, FileMetaData* b,

 Version::Version(VersionSet* vset, uint64_t version_number)
    : vset_(vset),
+      internal_comparator_(&(vset->icmp_)),
+      user_comparator_(internal_comparator_->user_comparator()),
+      table_cache_(vset->table_cache_),
+      merge_operator_(vset->options_->merge_operator.get()),
+      info_log_(vset->options_->info_log.get()),
+      db_statistics_(vset->options_->statistics.get()),
      next_(this),
      prev_(this),
      refs_(0),
@ -497,27 +503,22 @@ void Version::Get(const ReadOptions& options,
                  Status* status,
                  MergeContext* merge_context,
                  GetStats* stats,
-                  const Options& db_options,
                  bool* value_found) {
  Slice ikey = k.internal_key();
  Slice user_key = k.user_key();
-  const Comparator* ucmp = vset_->icmp_.user_comparator();
-
-  auto merge_operator = db_options.merge_operator.get();
-  auto logger = db_options.info_log.get();

  assert(status->ok() || status->IsMergeInProgress());
  Saver saver;
  saver.state = status->ok()? kNotFound : kMerge;
-  saver.ucmp = ucmp;
+  saver.ucmp = user_comparator_;
  saver.user_key = user_key;
  saver.value_found = value_found;
  saver.value = value;
-  saver.merge_operator = merge_operator;
+  saver.merge_operator = merge_operator_;
  saver.merge_context = merge_context;
-  saver.logger = logger;
+  saver.logger = info_log_;
  saver.didIO = false;
-  saver.statistics = db_options.statistics.get();
+  saver.statistics = db_statistics_;

  stats->seek_file = nullptr;
  stats->seek_file_level = -1;
@ -548,7 +549,7 @@ void Version::Get(const ReadOptions& options,
      // On Level-n (n>=1), files are sorted.
      // Binary search to find earliest index whose largest key >= ikey.
      // We will also stop when the file no longer overlaps ikey
-      start_index = FindFile(vset_->icmp_, files_[level], ikey);
+      start_index = FindFile(*internal_comparator_, files_[level], ikey);
    }

    // Traverse each relevant file to find the desired key
@ -557,8 +558,10 @@ void Version::Get(const ReadOptions& options,
 #endif
    for (uint32_t i = start_index; i < num_files; ++i) {
      FileMetaData* f = files[i];
-      if (ucmp->Compare(user_key, f->smallest.user_key()) < 0 ||
-          ucmp->Compare(user_key, f->largest.user_key()) > 0) {
+      // Skip key range filtering for level 0 if there are few level 0 files.
+      if ((level > 0 || num_files > 2) &&
+          (user_comparator_->Compare(user_key, f->smallest.user_key()) < 0 ||
+           user_comparator_->Compare(user_key, f->largest.user_key()) > 0)) {
        // Only process overlapping files.
        if (level > 0) {
          // If on Level-n (n>=1) then the files are sorted.
@ -574,7 +577,8 @@ void Version::Get(const ReadOptions& options,
      // Sanity check to make sure that the files are correctly sorted
      if (prev_file) {
        if (level != 0) {
-          int comp_sign = vset_->icmp_.Compare(prev_file->largest, f->smallest);
+          int comp_sign =
+              internal_comparator_->Compare(prev_file->largest, f->smallest);
          assert(comp_sign < 0);
        } else {
          // level == 0, the current file cannot be newer than the previous one.
@ -588,9 +592,8 @@ void Version::Get(const ReadOptions& options,
      prev_file = f;
 #endif
      bool tableIO = false;
-      *status =
-          vset_->table_cache_->Get(options, vset_->icmp_, *f, ikey, &saver,
-                                   SaveValue, &tableIO, MarkKeyMayExist);
+      *status = table_cache_->Get(options, *internal_comparator_, *f, ikey,
+                                  &saver, SaveValue, &tableIO, MarkKeyMayExist);
      // TODO: examine the behavior for corrupted key
      if (!status->ok()) {
        return;
@ -635,12 +638,12 @@ void Version::Get(const ReadOptions& options,
  if (kMerge == saver.state) {
    // merge_operands are in saver and we hit the beginning of the key history
    // do a final merge of nullptr and operands;
-    if (merge_operator->FullMerge(user_key, nullptr,
-                                  saver.merge_context->GetOperands(),
-                                  value, logger)) {
+    if (merge_operator_->FullMerge(user_key, nullptr,
+                                   saver.merge_context->GetOperands(), value,
+                                   info_log_)) {
      *status = Status::OK();
    } else {
-      RecordTick(db_options.statistics.get(), NUMBER_MERGE_FAILURES);
+      RecordTick(db_statistics_, NUMBER_MERGE_FAILURES);
      *status = Status::Corruption("could not perform end-of-key merge for ",
                                   user_key);
    }
@ -1447,6 +1450,12 @@ class VersionSet::Builder {
                                       file_meta->number, file_meta->file_size,
                                       &file_meta->table_reader_handle,
                                       &table_io, false);
+        if (file_meta->table_reader_handle != nullptr) {
+          // Load table_reader
+          file_meta->table_reader =
+              vset_->table_cache_->GetTableReaderFromHandle(
+                  file_meta->table_reader_handle);
+        }
      }
    }
  }
--- a/db/version_set.h
+++ b/db/version_set.h
@ -83,8 +83,7 @@ class Version {
    int seek_file_level;
  };
  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
-           Status* status, MergeContext* merge_context,
-           GetStats* stats, const Options& db_option,
+           Status* status, MergeContext* merge_context, GetStats* stats,
           bool* value_found = nullptr);

  // Adds "stats" into the current state.  Returns true if a new
@ -224,6 +223,12 @@ class Version {
  void UpdateFilesBySize();

  VersionSet* vset_;            // VersionSet to which this Version belongs
+  const InternalKeyComparator* internal_comparator_;
+  const Comparator* user_comparator_;
+  TableCache* table_cache_;
+  const MergeOperator* merge_operator_;
+  Logger* info_log_;
+  Statistics* db_statistics_;
  Version* next_;               // Next version in linked list
  Version* prev_;               // Previous version in linked list
  int refs_;                    // Number of live refs to this version
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@ -60,6 +60,12 @@ struct BlockBasedTableOptions {
    // A space efficient index block that is optimized for
    // binary-search-based index.
    kBinarySearch,
+
+    // The hash index, if enabled, will do the hash lookup when
+    // `ReadOptions.prefix_seek == true`. Users should also specify
+    // `Options.prefix_extractor` to allow the index block to correctly
+    // extract the prefix of the given key and perform hash table lookup.
+    kHashSearch,
  };

  IndexType index_type = kBinarySearch;
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@ -23,7 +23,7 @@ namespace rocksdb {
 //      ++pos) {
 //   ...
 // }
-typedef std::map<std::string, std::string> UserCollectedProperties;
+typedef std::map<const std::string, std::string> UserCollectedProperties;

 // TableProperties contains a bunch of read-only properties of its associated
 // table.
--- a/port/stack_trace.cc
+++ b/port/stack_trace.cc
@ -10,7 +10,9 @@
 #include <execinfo.h>
 #include <signal.h>
 #include <stdio.h>
-#include <stdlib.h>
+// It's odd that including this breaks in GCC 7 but the build doesn't break
+// if I remove it even under GCC 4.8.
+// #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>

--- a/table/block.cc
+++ b/table/block.cc
@ -11,16 +11,20 @@

 #include "table/block.h"

-#include <vector>
 #include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
 #include "rocksdb/comparator.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"

 namespace rocksdb {

-inline uint32_t Block::NumRestarts() const {
+uint32_t Block::NumRestarts() const {
  assert(size_ >= 2*sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }
@ -92,6 +96,7 @@ class Block::Iter : public Iterator {
  std::string key_;
  Slice value_;
  Status status_;
+  BlockHashIndex* hash_index_;

  inline int Compare(const Slice& a, const Slice& b) const {
    return comparator_->Compare(a, b);
@ -118,16 +123,15 @@ class Block::Iter : public Iterator {
  }

 public:
-  Iter(const Comparator* comparator,
-       const char* data,
-       uint32_t restarts,
-       uint32_t num_restarts)
+  Iter(const Comparator* comparator, const char* data, uint32_t restarts,
+       uint32_t num_restarts, BlockHashIndex* hash_index)
      : comparator_(comparator),
        data_(data),
        restarts_(restarts),
        num_restarts_(num_restarts),
        current_(restarts_),
-        restart_index_(num_restarts_) {
+        restart_index_(num_restarts_),
+        hash_index_(hash_index) {
    assert(num_restarts_ > 0);
  }

@ -169,45 +173,22 @@ class Block::Iter : public Iterator {
  }

  virtual void Seek(const Slice& target) {
-    // Binary search in restart array to find the first restart point
-    // with a key >= target
-    uint32_t left = 0;
-    uint32_t right = num_restarts_ - 1;
-    while (left < right) {
-      uint32_t mid = (left + right + 1) / 2;
-      uint32_t region_offset = GetRestartPoint(mid);
-      uint32_t shared, non_shared, value_length;
-      const char* key_ptr = DecodeEntry(data_ + region_offset,
-                                        data_ + restarts_,
-                                        &shared, &non_shared, &value_length);
-      if (key_ptr == nullptr || (shared != 0)) {
-        CorruptionError();
+    uint32_t index = 0;
+    bool ok = hash_index_ ? HashSeek(target, &index)
+                          : BinarySeek(target, 0, num_restarts_ - 1, &index);
+
+    if (!ok) {
      return;
    }
-      Slice mid_key(key_ptr, non_shared);
-      if (Compare(mid_key, target) < 0) {
-        // Key at "mid" is smaller than "target".  Therefore all
-        // blocks before "mid" are uninteresting.
-        left = mid;
-      } else {
-        // Key at "mid" is >= "target".  Therefore all blocks at or
-        // after "mid" are uninteresting.
-        right = mid - 1;
-      }
-    }
-
+    SeekToRestartPoint(index);
    // Linear search (within restart block) for first key >= target
-    SeekToRestartPoint(left);
-    while (true) {
-      if (!ParseNextKey()) {
-        return;
-      }
-      if (Compare(key_, target) >= 0) {
-        return;
-      }
-    }
-  }

+    while (true) {
+      if (!ParseNextKey() || Compare(key_, target) >= 0) {
+        return;
+      }
+    }
+  }
  virtual void SeekToFirst() {
    SeekToRestartPoint(0);
    ParseNextKey();
@ -257,6 +238,53 @@ class Block::Iter : public Iterator {
      return true;
    }
  }
+  // Binary search over restart points [left, right]: sets *index to the last
+  // one whose key is < target (or to `left` if none). False on corruption.
+  bool BinarySeek(const Slice& target, uint32_t left, uint32_t right,
+                  uint32_t* index) {
+    assert(left <= right);
+
+    while (left < right) {
+      uint32_t mid = (left + right + 1) / 2;
+      uint32_t region_offset = GetRestartPoint(mid);
+      uint32_t shared, non_shared, value_length;
+      const char* key_ptr =
+          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
+                      &non_shared, &value_length);
+      if (key_ptr == nullptr || (shared != 0)) {
+        CorruptionError();
+        return false;
+      }
+      Slice mid_key(key_ptr, non_shared);
+      if (Compare(mid_key, target) < 0) {
+        // Key at "mid" is smaller than "target". Therefore all
+        // blocks before "mid" are uninteresting.
+        left = mid;
+      } else {
+        // Key at "mid" is >= "target". Therefore all blocks at or
+        // after "mid" are uninteresting.
+        right = mid - 1;
+      }
+    }
+
+    *index = left;
+    return true;
+  }
+
+  // Hash lookup of target's restart range, then BinarySeek within it.
+  bool HashSeek(const Slice& target, uint32_t* index) {
+    assert(hash_index_);
+    auto restart_index = hash_index_->GetRestartIndex(target);
+    if (restart_index == nullptr) {
+      current_ = restarts_;  // no range for target: park at the restart array
+      return false;
+    }
+    // Restart points [first_index, first_index + num_blocks) all have the
+    // same prefix, so binary search only within that small range.
+    auto left = restart_index->first_index;
+    auto right = restart_index->first_index + restart_index->num_blocks - 1;
+    return BinarySeek(target, left, right, index);
+  }
 };

 Iterator* Block::NewIterator(const Comparator* cmp) {
@ -267,8 +295,13 @@ Iterator* Block::NewIterator(const Comparator* cmp) {
  if (num_restarts == 0) {
    return NewEmptyIterator();
  } else {
-    return new Iter(cmp, data_, restart_offset_, num_restarts);
+    return new Iter(cmp, data_, restart_offset_, num_restarts,
+                    hash_index_.get());
  }
 }

+void Block::SetBlockHashIndex(BlockHashIndex* hash_index) {
+  hash_index_.reset(hash_index);  // takes ownership; frees any previous index
+}
+
 }  // namespace rocksdb
--- a/table/block.h
+++ b/table/block.h
@ -10,6 +10,7 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"

@ -17,6 +18,7 @@ namespace rocksdb {

 struct BlockContents;
 class Comparator;
+class BlockHashIndex;

 class Block {
 public:
@ -26,20 +28,28 @@ class Block {
  ~Block();

  size_t size() const { return size_; }
+  const char* data() const { return data_; }
  bool cachable() const { return cachable_; }
+  uint32_t NumRestarts() const;
  CompressionType compression_type() const { return compression_type_; }
+
+  // If a hash index has been attached with SetBlockHashIndex(), the returned
+  // iterator uses it to look up the key prefix when seeking.
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
  Iterator* NewIterator(const Comparator* comparator);
-  const char* data() { return data_; }
+  void SetBlockHashIndex(BlockHashIndex* hash_index);

 private:
-  uint32_t NumRestarts() const;
-
  const char* data_;
  size_t size_;
  uint32_t restart_offset_;     // Offset in data_ of restart array
  bool owned_;                  // Block owns data_[]
  bool cachable_;
  CompressionType compression_type_;
+  std::unique_ptr<BlockHashIndex> hash_index_;

  // No copying allowed
  Block(const Block&);
--- a/table/block_based_table_builder.cc
+++ b/table/block_based_table_builder.cc
@ -88,8 +88,7 @@ class IndexBuilder {
  const Comparator* comparator_;
 };

-// This index builder builds space-efficient index block for binary-search-based
-// index.
+// This index builder builds space-efficient index block.
 //
 // Optimizations:
 //  1. Made block's `block_restart_interval` to be 1, which will avoid linear
@ -97,9 +96,9 @@ class IndexBuilder {
 //  2. Shorten the key length for index block. Other than honestly using the
 //     last key in the data block as the index key, we instead find a shortest
 //     substitute key that serves the same function.
-class BinarySearchIndexBuilder : public IndexBuilder {
+class ShortenedIndexBuilder : public IndexBuilder {
 public:
-  explicit BinarySearchIndexBuilder(const Comparator* comparator)
+  explicit ShortenedIndexBuilder(const Comparator* comparator)
      : IndexBuilder(comparator),
        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}

@ -128,11 +127,37 @@ class BinarySearchIndexBuilder : public IndexBuilder {
  BlockBuilder index_block_builder_;
 };

+// FullKeyIndexBuilder is also based on BlockBuilder. It works much like
+// ShortenedIndexBuilder, but keeps the full key rather than a shortened one.
+class FullKeyIndexBuilder : public IndexBuilder {
+ public:
+  explicit FullKeyIndexBuilder(const Comparator* comparator)
+      : IndexBuilder(comparator),
+        index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+
+  virtual void AddEntry(std::string* last_key_in_current_block,
+                        const Slice* first_key_in_next_block,  // unused
+                        const BlockHandle& block_handle) override {
+    std::string handle_encoding;
+    block_handle.EncodeTo(&handle_encoding);
+    index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+  }
+
+  virtual Slice Finish() override { return index_block_builder_.Finish(); }
+
+  virtual size_t EstimatedSize() const override {
+    return index_block_builder_.CurrentSizeEstimate();
+  }
+
+ private:
+  BlockBuilder index_block_builder_;
+};
+
 // Create a index builder based on its type.
 IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
  switch (type) {
    case BlockBasedTableOptions::kBinarySearch: {
-      return new BinarySearchIndexBuilder(comparator);
+      return new ShortenedIndexBuilder(comparator);
    }
    default: {
      assert(!"Do not recognize the index type ");
@ -296,13 +321,16 @@ struct BlockBasedTableBuilder::Rep {
  }
 };

+// TODO(sdong): Currently only write out binary search index. In
+// BlockBasedTableReader, Hash index will be built using binary search index.
 BlockBasedTableBuilder::BlockBasedTableBuilder(
    const Options& options, const BlockBasedTableOptions& table_options,
    const InternalKeyComparator& internal_comparator, WritableFile* file,
    CompressionType compression_type)
    : rep_(new Rep(options, internal_comparator, file,
                   table_options.flush_block_policy_factory.get(),
-                   compression_type, table_options.index_type)) {
+                   compression_type,
+                   BlockBasedTableOptions::IndexType::kBinarySearch)) {
  if (rep_->filter_block != nullptr) {
    rep_->filter_block->StartBlock(0);
  }
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@ -9,6 +9,7 @@

 #include "table/block_based_table_reader.h"

+#include <functional>
 #include <string>
 #include <utility>

@ -25,6 +26,7 @@

 #include "table/block.h"
 #include "table/filter_block.h"
+#include "table/block_hash_index.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/two_level_iterator.h"
@ -180,19 +182,51 @@ class BinarySearchIndexReader : public IndexReader {
  std::unique_ptr<Block> index_block_;
 };

-// TODO(kailiu) This class is only a stub for now. And the comment below is also
-// not completed.
 // Index that leverages an internal hash table to quicken the lookup for a given
 // key.
+// @param data_iter_gen, equivalent to BlockBasedTable::NewIterator(). But that
+// function requires the index to be initialized. To avoid this problem, the
+// external caller passes a function that can create an iterator over the
+// entries without the table being fully initialized.
 class HashIndexReader : public IndexReader {
 public:
  static Status Create(RandomAccessFile* file, const BlockHandle& index_handle,
                       Env* env, const Comparator* comparator,
-                       BlockBasedTable* table,
+                       std::function<Iterator*(Iterator*)> data_iter_gen,
                       const SliceTransform* prefix_extractor,
                       IndexReader** index_reader) {
-    return Status::NotSupported("not implemented yet!");
+    assert(prefix_extractor);
+    Block* index_block = nullptr;
+    auto s =
+        ReadBlockFromFile(file, ReadOptions(), index_handle, &index_block, env);
+
+    if (!s.ok()) {
+      return s;
    }
+
+    *index_reader = new HashIndexReader(comparator, index_block);
+    std::unique_ptr<Iterator> index_iter(index_block->NewIterator(nullptr));
+    std::unique_ptr<Iterator> data_iter(
+        data_iter_gen(index_block->NewIterator(nullptr)));
+    auto hash_index = CreateBlockHashIndex(index_iter.get(), data_iter.get(),
+                                           index_block->NumRestarts(),
+                                           comparator, prefix_extractor);
+    index_block->SetBlockHashIndex(hash_index);
+    return s;
+  }
+
+  virtual Iterator* NewIterator() override {
+    return index_block_->NewIterator(comparator_);
+  }
+
+  virtual size_t size() const override { return index_block_->size(); }
+
+ private:
+  HashIndexReader(const Comparator* comparator, Block* index_block)
+      : IndexReader(comparator), index_block_(index_block) {
+    assert(index_block_ != nullptr);
+  }
+  std::unique_ptr<Block> index_block_;
 };


@ -223,6 +257,11 @@ struct BlockBasedTable::Rep {

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
+  // TODO(kailiu) It is very ugly to use internal key in table, since table
+  // module should not be relying on db module. However to make things easier
+  // and compatible with existing code, we introduce a wrapper that allows
+  // block to extract prefix without knowing if a key is internal or not.
+  unique_ptr<SliceTransform> internal_prefix_transform;
 };

 BlockBasedTable::~BlockBasedTable() {
@ -327,8 +366,20 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions,
  s = ReadMetaBlock(rep, &meta, &meta_iter);

  // Read the properties
+  bool found_properties_block = true;
  meta_iter->Seek(kPropertiesBlock);
-  if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock) {
+  if (meta_iter->status().ok() &&
+      (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlock)) {
+    meta_iter->Seek(kPropertiesBlockOldName);
+    if (meta_iter->status().ok() &&
+        (!meta_iter->Valid() || meta_iter->key() != kPropertiesBlockOldName)) {
+      found_properties_block = false;
+      Log(WARN, rep->options.info_log,
+          "Cannot find Properties block from file.");
+    }
+  }
+
+  if (found_properties_block) {
    s = meta_iter->status();
    TableProperties* table_properties = nullptr;
    if (s.ok()) {
@ -747,8 +798,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
  return { filter, cache_handle };
 }

-Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options)
-    const {
+Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options) {
  // index reader has already been pre-populated.
  if (rep_->index_reader) {
    return rep_->index_reader->NewIterator();
@ -954,11 +1004,14 @@ Status BlockBasedTable::Get(
  return s;
 }

+namespace {
 bool SaveDidIO(void* arg, const ParsedInternalKey& key, const Slice& value,
               bool didIO) {
  *reinterpret_cast<bool*>(arg) = didIO;
  return false;
 }
+}  // namespace
+
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                      const Slice& key) {
  // We use Get() as it has logic that checks whether we read the
@ -975,22 +1028,51 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
 //  3. options
 //  4. internal_comparator
 //  5. index_type
-Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) const {
+Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader) {
  // Some old version of block-based tables don't have index type present in
  // table properties. If that's the case we can safely use the kBinarySearch.
-  auto index_type = BlockBasedTableOptions::kBinarySearch;
+
+  auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
+  if (rep_->table_properties) {
    auto& props = rep_->table_properties->user_collected_properties;
    auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
    if (pos != props.end()) {
-    index_type = static_cast<BlockBasedTableOptions::IndexType>(
+      index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
          DecodeFixed32(pos->second.c_str()));
    }
+  }

-  switch (index_type) {
+  // TODO(sdong): Currently binary index is the only index type we support in
+  // files. Hash index is built on top of binary index too.
+  if (index_type_on_file != BlockBasedTableOptions::kBinarySearch) {
+    return Status::NotSupported("File Contains not supported index type: ",
+                                std::to_string(index_type_on_file));
+  }
+
+  auto file = rep_->file.get();
+  const auto& index_handle = rep_->index_handle;
+  auto env = rep_->options.env;
+  auto comparator = &rep_->internal_comparator;
+
+  switch (rep_->index_type) {
    case BlockBasedTableOptions::kBinarySearch: {
-      return BinarySearchIndexReader::Create(
-          rep_->file.get(), rep_->index_handle, rep_->options.env,
-          &rep_->internal_comparator, index_reader);
+      return BinarySearchIndexReader::Create(file, index_handle, env,
+                                             comparator, index_reader);
+    }
+    case BlockBasedTableOptions::kHashSearch: {
+      // We need to wrap data with internal_prefix_transform to make sure it can
+      // handle prefix correctly.
+      rep_->internal_prefix_transform.reset(
+          new InternalKeySliceTransform(rep_->options.prefix_extractor.get()));
+      return HashIndexReader::Create(
+          file, index_handle, env, comparator,
+          [&](Iterator* index_iter) {
+            return NewTwoLevelIterator(
+                index_iter, &BlockBasedTable::DataBlockReader,
+                const_cast<BlockBasedTable*>(this), ReadOptions(),
+                rep_->soptions, rep_->internal_comparator);
+          },
+          rep_->internal_prefix_transform.get(), index_reader);
    }
    default: {
      std::string error_message =
@ -1023,7 +1105,10 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
    // key is past the last key in the file. If table_properties is not
    // available, approximate the offset by returning the offset of the
    // metaindex block (which is right near the end of the file).
+    result = 0;
+    if (rep_->table_properties) {
      result = rep_->table_properties->data_size;
+    }
    // table_properties is not present in the table.
    if (result == 0) {
      result = rep_->metaindex_handle.offset();
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@ -12,6 +12,7 @@
 #include <stdint.h>
 #include <memory>
 #include <utility>
+#include <string>

 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
@ -131,7 +132,7 @@ class BlockBasedTable : public TableReader {
  //  2. index is not present in block cache.
  //  3. We disallowed any io to be performed, that is, read_options ==
  //     kBlockCacheTier
-  Iterator* NewIndexIterator(const ReadOptions& read_options) const;
+  Iterator* NewIndexIterator(const ReadOptions& read_options);

  // Read block cache from block caches (if set): block_cache and
  // block_cache_compressed.
@ -164,7 +165,7 @@ class BlockBasedTable : public TableReader {

  void ReadMeta(const Footer& footer);
  void ReadFilter(const Slice& filter_handle_value);
-  Status CreateIndexReader(IndexReader** index_reader) const;
+  Status CreateIndexReader(IndexReader** index_reader);

  // Read the meta block from sst.
  static Status ReadMetaBlock(
@ -198,4 +199,8 @@ class BlockBasedTable : public TableReader {
  void operator=(const TableReader&) = delete;
 };

+// Backward compatible properties block name. Limited in block based
+// table.
+extern const std::string kPropertiesBlockOldName;
+
 }  // namespace rocksdb
--- a/table/block_test.cc
+++ b/table/block_test.cc
@ -3,7 +3,10 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+#include <stdio.h>
 #include <string>
+#include <vector>
+
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
@ -11,9 +14,11 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
 #include "table/block.h"
 #include "table/block_builder.h"
 #include "table/format.h"
+#include "table/block_hash_index.h"
 #include "util/random.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@ -25,6 +30,40 @@ static std::string RandomString(Random* rnd, int len) {
  test::RandomString(rnd, len, &r);
  return r;
 }
+// Builds a test key: primary_key printed with "%6d" immediately followed by
+// secondary_key printed with "%4d". When padding_size is non-zero the result
+// of RandomString(rnd, padding_size) is appended; rnd is not touched
+// otherwise.
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random *rnd) {
+  char text[50];
+  snprintf(text, sizeof(text), "%6d%4d", primary_key, secondary_key);
+  std::string key(text);
+  if (padding_size != 0) {
+    key.append(RandomString(rnd, padding_size));
+  }
+  return key;
+}
+
+// Appends pseudo-random key/value pairs to *keys and *values. The generator
+// is always seeded with 302, so repeated calls produce the same data.
+// Primary ids run over from, from + step, ... while below from + len; for
+// each id, keys_share_prefix keys are produced with secondary ids
+// 0 .. keys_share_prefix - 1. padding_size is forwarded to GenerateKey().
+void GenerateRandomKVs(std::vector<std::string> *keys,
+                       std::vector<std::string> *values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+  const int end = from + len;
+
+  // one outer iteration per distinct primary id
+  for (int primary = from; primary < end; primary += step) {
+    // keys that share this primary id
+    for (int secondary = 0; secondary < keys_share_prefix; ++secondary) {
+      // Key first, then value: both draw from the same generator.
+      keys->push_back(GenerateKey(primary, secondary, padding_size, &rnd));
+
+      // value comes from RandomString(&rnd, 100)
+      values->push_back(RandomString(&rnd, 100));
+    }
+  }
+}

 class BlockTest {};

@ -39,24 +78,11 @@ TEST(BlockTest, SimpleTest) {
  std::vector<std::string> values;
  BlockBuilder builder(options, ic.get());
  int num_records = 100000;
-  char buf[10];
-  char* p = &buf[0];

+  GenerateRandomKVs(&keys, &values, 0, num_records);
  // add a bunch of records to a block
  for (int i = 0; i < num_records; i++) {
-    // generate random kvs
-    sprintf(p, "%6d", i);
-    std::string k(p);
-    std::string v = RandomString(&rnd, 100); // 100 byte values
-
-    // write kvs to the block
-    Slice key(k);
-    Slice value(v);
-    builder.Add(key, value);
-
-    // remember kvs in a lookaside array
-    keys.push_back(k);
-    values.push_back(v);
+    builder.Add(keys[i], values[i]);
  }

  // read serialized contents of the block
@ -101,6 +127,114 @@ TEST(BlockTest, SimpleTest) {
  delete iter;
 }

+// Resets *builder to a fresh BlockBuilder (restart interval 1, bytewise
+// comparator), adds every keys[i]/values[i] pair to it and returns the
+// finished block wrapped in a BlockContents (not cachable, not heap
+// allocated). prefix_group_size is accepted but not used here.
+BlockContents GetBlockContents(std::unique_ptr<BlockBuilder> *builder,
+                               const std::vector<std::string> &keys,
+                               const std::vector<std::string> &values,
+                               const int prefix_group_size = 1) {
+  builder->reset(
+      new BlockBuilder(1 /* restart interval */, BytewiseComparator()));
+  BlockBuilder *block_builder = builder->get();
+
+  // Add all of the keys, in the order given.
+  const size_t count = keys.size();
+  for (size_t pos = 0; pos != count; ++pos) {
+    block_builder->Add(keys[pos], values[pos]);
+  }
+
+  BlockContents result;
+  result.data = block_builder->Finish();
+  result.cachable = false;
+  result.heap_allocated = false;
+  return result;
+}
+
+// Compares Seek() on a block that has a hash index with one that has none.
+// Two readers are created over the same contents; a hash index built with a
+// 6-byte fixed prefix extractor is attached to the first reader only.
+//  - Every entry of keys must be found through the hash-indexed iterator,
+//    with the matching entry of values.
+//  - For odd ids i in [1, max_key - 1), GenerateKey(i, 0, 0, nullptr) must
+//    leave the hash-indexed iterator invalid and the plain iterator valid.
+void CheckBlockContents(BlockContents contents, const int max_key,
+                        const std::vector<std::string> &keys,
+                        const std::vector<std::string> &values) {
+  const size_t prefix_size = 6;
+  // create block readers
+  Block reader1(contents);
+  Block reader2(contents);
+
+  std::unique_ptr<const SliceTransform> prefix_extractor(
+      NewFixedPrefixTransform(prefix_size));
+
+  {
+    // These two iterators are only needed while the hash index is built.
+    Iterator *first_iter = reader1.NewIterator(nullptr);
+    Iterator *second_iter = reader1.NewIterator(nullptr);
+    reader1.SetBlockHashIndex(
+        CreateBlockHashIndex(first_iter, second_iter, keys.size(),
+                             BytewiseComparator(), prefix_extractor.get()));
+
+    delete first_iter;
+    delete second_iter;
+  }
+
+  std::unique_ptr<Iterator> hash_iter(
+      reader1.NewIterator(BytewiseComparator()));
+
+  std::unique_ptr<Iterator> regular_iter(
+      reader2.NewIterator(BytewiseComparator()));
+
+  // Seek existent keys: each one must be found together with its value.
+  size_t pos = 0;
+  for (const auto &key : keys) {
+    hash_iter->Seek(key);
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    ASSERT_EQ(hash_iter->value().ToString().compare(values[pos]), 0);
+    ++pos;
+  }
+
+  // Seek non-existent keys.
+  // With the hash index, the iterator is expected to become invalid when the
+  // sought key is absent; the binary search based iterator is expected to
+  // stay valid.
+  for (int id = 1; id < max_key - 1; id += 2) {
+    const std::string missing = GenerateKey(id, 0, 0, nullptr);
+    hash_iter->Seek(missing);
+    ASSERT_TRUE(!hash_iter->Valid());
+
+    regular_iter->Seek(missing);
+    ASSERT_TRUE(regular_iter->Valid());
+  }
+}
+
+// Hash-index lookup where no two keys share the same prefix: one key per
+// primary id, ids 0, 2, 4, ... below kMaxKey.
+TEST(BlockTest, SimpleIndexHash) {
+  const int kMaxKey = 100000;
+  const int kStep = 2;
+  const int kPaddingSize = 8;  // passed to GenerateKey() as padding_size
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  GenerateRandomKVs(&keys, &values, 0 /* from */, kMaxKey /* len */, kStep,
+                    kPaddingSize);
+
+  std::unique_ptr<BlockBuilder> builder;
+  BlockContents block_contents = GetBlockContents(&builder, keys, values);
+
+  CheckBlockContents(block_contents, kMaxKey, keys, values);
+}
+
+TEST(BlockTest, IndexHashWithSharedPrefix) {
+  const int kMaxKey = 100000;
+  // number of keys generated for every primary id (i.e. per prefix)
+  const int kPrefixGroup = 5;
+  const int kStep = 2;
+  const int kPaddingSize = 10;  // passed to GenerateKey() as padding_size
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  // Generate groups of keys that start with the same primary id.
+  GenerateRandomKVs(&keys, &values, 0 /* from */, kMaxKey /* len */, kStep,
+                    kPaddingSize, kPrefixGroup);
+
+  std::unique_ptr<BlockBuilder> builder;
+  BlockContents block_contents =
+      GetBlockContents(&builder, keys, values, kPrefixGroup);
+
+  CheckBlockContents(block_contents, kMaxKey, keys, values);
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@ -244,6 +244,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
      metaindex_block.NewIterator(BytewiseComparator()));

  // -- Read property block
+  // This function is not used by BlockBasedTable, so we don't have to
+  // worry about old properties block name.
  meta_iter->Seek(kPropertiesBlock);
  TableProperties table_properties;
  if (meta_iter->Valid() &&
--- a/table/plain_table_builder.cc
+++ b/table/plain_table_builder.cc
@ -74,10 +74,12 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {

  if (!IsFixedLength()) {
    // Write key length
-    key_size_str_.clear();
-    PutVarint32(&key_size_str_, user_key_size);
-    file_->Append(key_size_str_);
-    offset_ += key_size_str_.length();
+    char key_size_buf[5];  // tmp buffer for key size as varint32
+    char* ptr = EncodeVarint32(key_size_buf, user_key_size);
+    assert(ptr <= key_size_buf + sizeof(key_size_buf));
+    auto len = ptr - key_size_buf;
+    file_->Append(Slice(key_size_buf, len));
+    offset_ += len;
  }

  // Write key
@ -86,25 +88,32 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
    status_ = Status::Corruption(Slice());
    return;
  }
+  // For value size as varint32 (up to 5 bytes).
+  // If the row is of value type with seqId 0, flush the special flag together
+  // in this buffer to save one file append call, which takes 1 byte.
+  char value_size_buf[6];
+  size_t value_size_buf_size = 0;
  if (parsed_key.sequence == 0 && parsed_key.type == kTypeValue) {
    file_->Append(Slice(key.data(), user_key_size));
-    char tmp_char = PlainTableFactory::kValueTypeSeqId0;
-    file_->Append(Slice(&tmp_char, 1));
-    offset_ += key.size() - 7;
+    offset_ += user_key_size;
+    value_size_buf[0] = PlainTableFactory::kValueTypeSeqId0;
+    value_size_buf_size = 1;
  } else {
    file_->Append(key);
    offset_ += key.size();
  }

  // Write value length
-  value_size_str_.clear();
  int value_size = value.size();
-  PutVarint32(&value_size_str_, value_size);
-  file_->Append(value_size_str_);
+  char* end_ptr =
+      EncodeVarint32(value_size_buf + value_size_buf_size, value_size);
+  assert(end_ptr <= value_size_buf + sizeof(value_size_buf));
+  value_size_buf_size = end_ptr - value_size_buf;
+  file_->Append(Slice(value_size_buf, value_size_buf_size));

  // Write value
  file_->Append(value);
-  offset_ += value_size + value_size_str_.length();
+  offset_ += value_size + value_size_buf_size;

  properties_.num_entries++;
  properties_.raw_key_size += key.size();
--- a/table/plain_table_builder.h
+++ b/table/plain_table_builder.h
@ -69,9 +69,6 @@ private:
  const size_t user_key_len_;
  bool closed_ = false;  // Either Finish() or Abandon() has been called.

-  std::string key_size_str_;
-  std::string value_size_str_;
-
  bool IsFixedLength() const {
    return user_key_len_ > 0;
  }
--- a/table/plain_table_reader.cc
+++ b/table/plain_table_reader.cc
@ -81,10 +81,9 @@ class PlainTableIterator : public Iterator {
  bool use_prefix_seek_;
  uint32_t offset_;
  uint32_t next_offset_;
-  Slice key_;
+  IterKey key_;
  Slice value_;
  Status status_;
-  std::string tmp_str_;
  // No copying allowed
  PlainTableIterator(const PlainTableIterator&) = delete;
  void operator=(const Iterator&) = delete;
@ -104,8 +103,8 @@ PlainTableReader::PlainTableReader(
      kHashTableRatio(hash_table_ratio),
      kBloomBitsPerKey(bloom_bits_per_key),
      kIndexIntervalForSamePrefixKeys(index_sparseness),
-      table_properties_(table_properties),
-      data_end_offset_(table_properties_->data_size),
+      table_properties_(nullptr),
+      data_end_offset_(table_properties->data_size),
      user_key_len_(table_properties->fixed_key_len) {
  assert(kHashTableRatio >= 0.0);
 }
@ -137,7 +136,7 @@ Status PlainTableReader::Open(
      bloom_bits_per_key, hash_table_ratio, index_sparseness, props));

  // -- Populate Index
-  s = new_reader->PopulateIndex();
+  s = new_reader->PopulateIndex(props);
  if (!s.ok()) {
    return s;
  }
@ -364,7 +363,10 @@ void PlainTableReader::FillIndexes(
      index_size_, kSubIndexSize);
 }

-Status PlainTableReader::PopulateIndex() {
+Status PlainTableReader::PopulateIndex(TableProperties* props) {
+  assert(props != nullptr);
+  table_properties_.reset(props);
+
  // options.prefix_extractor is requried for a hash-based look-up.
  if (options_.prefix_extractor.get() == nullptr && kHashTableRatio != 0) {
    return Status::NotSupported(
@ -409,6 +411,14 @@ Status PlainTableReader::PopulateIndex() {
  // From the temp data structure, populate indexes.
  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket);

+  // Fill two table properties.
+  // TODO(sdong): after we have the feature of storing index in file, these
+  // properties need to be populated to index_size instead.
+  props->user_collected_properties["plain_table_hash_table_size"] =
+      std::to_string(index_size_ * 4U);
+  props->user_collected_properties["plain_table_sub_index_size"] =
+      std::to_string(sub_index_size_needed);
+
  return Status::OK();
 }

@ -720,9 +730,7 @@ void PlainTableIterator::Next() {
    status_ = table_->Next(&next_offset_, &parsed_key, &value_);
    if (status_.ok()) {
      // Make a copy in this case. TODO optimize.
-      tmp_str_.clear();
-      AppendInternalKey(&tmp_str_, parsed_key);
-      key_ = Slice(tmp_str_);
+      key_.SetInternalKey(parsed_key);
    } else {
      offset_ = next_offset_ = table_->data_end_offset_;
    }
@ -735,7 +743,7 @@ void PlainTableIterator::Prev() {

 Slice PlainTableIterator::key() const {
  assert(Valid());
-  return key_;
+  return key_.GetKey();
 }

 Slice PlainTableIterator::value() const {
--- a/table/plain_table_reader.h
+++ b/table/plain_table_reader.h
@ -86,6 +86,9 @@ class PlainTableReader: public TableReader {
  // PopulateIndex() builds index of keys. It must be called before any query
  // to the table.
  //
+  // props: the table properties object that needs to be stored. Ownership of
+  //        the object will be passed.
+  //
  // index_ contains buckets size of index_size_, each is a
  // 32-bit integer. The lower 31 bits contain an offset value (explained below)
  // and the first bit of the integer indicates type of the offset.
@ -121,7 +124,7 @@ class PlainTableReader: public TableReader {
  //    ....
  //   record N file offset:  fixedint32
  // <end>
-  Status PopulateIndex();
+  Status PopulateIndex(TableProperties* props);

 private:
  struct IndexRecord;
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@ -91,5 +91,7 @@ const std::string TablePropertiesNames::kFixedKeyLen =
    "rocksdb.fixed.key.length";

 extern const std::string kPropertiesBlock = "rocksdb.properties";
+// Old property block name for backward compatibility
+extern const std::string kPropertiesBlockOldName = "rocksdb.stats";

 }  // namespace rocksdb
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@ -233,9 +233,9 @@ DEFINE_string(time_unit, "microsecond",
              "`microsecond` (default) or `nanosecond`");

 int main(int argc, char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+  GFLAGS::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  rocksdb::TableFactory* tf = new rocksdb::BlockBasedTableFactory();
  rocksdb::Options options;
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -1055,6 +1055,116 @@ static std::string RandomString(Random* rnd, int len) {
  return r;
 }

+// Adds one entry to the table under construction: an internal key (sequence
+// 0, kTypeValue) whose user key is `prefix` followed by the output of
+// RandomString(&rnd, suffix_len), mapped to the value "v".
+void AddInternalKey(TableConstructor* c, const std::string prefix,
+                    int suffix_len = 800) {
+  // static: the generator state persists across calls.
+  static Random rnd(1023);
+  // Honor suffix_len instead of a hard-coded 800 (same result for the
+  // default argument).
+  InternalKey k(prefix + RandomString(&rnd, suffix_len), 0, kTypeValue);
+  c->Add(k.Encode().ToString(), "v");
+}
+
+TEST(TableTest, HashIndexTest) {
+  TableConstructor c(BytewiseComparator());
+
+  // The first 3 bytes of each key act as its prefix (see prefix_extractor
+  // below); the long random suffix spreads the 10 keys over 5 data blocks.
+  AddInternalKey(&c, "0015");
+  AddInternalKey(&c, "0035");
+
+  AddInternalKey(&c, "0054");
+  AddInternalKey(&c, "0055");
+
+  AddInternalKey(&c, "0056");
+  AddInternalKey(&c, "0057");
+
+  AddInternalKey(&c, "0058");
+  AddInternalKey(&c, "0075");
+
+  AddInternalKey(&c, "0076");
+  AddInternalKey(&c, "0095");
+
+  std::vector<std::string> keys;
+  KVMap kvmap;
+  Options options;
+  BlockBasedTableOptions table_options;
+  table_options.index_type = BlockBasedTableOptions::kHashSearch;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.block_cache = NewLRUCache(1024);
+  options.block_size = 1700;
+
+  std::unique_ptr<InternalKeyComparator> comparator(
+      new InternalKeyComparator(BytewiseComparator()));
+  c.Finish(options, *comparator, &keys, &kvmap);
+  auto reader = c.table_reader();
+
+  auto props = c.table_reader()->GetTableProperties();
+  ASSERT_EQ(5u, props->num_data_blocks);
+
+  std::unique_ptr<Iterator> hash_iter(reader->NewIterator(ReadOptions()));
+
+  // -- Seek keys that do not exist, but share a prefix with existing keys.
+  std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
+  std::vector<std::string> lower_bound = {keys[0], keys[1], keys[2],
+                                          keys[7], keys[9], };
+
+  // find the lower bound of the prefix
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    hash_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    // seek the first element in the block
+    ASSERT_EQ(lower_bound[i], hash_iter->key().ToString());
+    ASSERT_EQ("v", hash_iter->value().ToString());
+  }
+
+  // expected results for the upper-bound seeks performed further below
+  std::vector<std::string> upper_bound = {keys[1], keys[2], keys[7], keys[9], };
+
+  // find existing keys
+  for (const auto& item : kvmap) {
+    auto ukey = ExtractUserKey(item.first).ToString();
+    hash_iter->Seek(ukey);
+
+    // ASSERT_OK(regular_iter->status());
+    ASSERT_OK(hash_iter->status());
+
+    // ASSERT_TRUE(regular_iter->Valid());
+    ASSERT_TRUE(hash_iter->Valid());
+
+    ASSERT_EQ(item.first, hash_iter->key().ToString());
+    ASSERT_EQ(item.second, hash_iter->value().ToString());
+  }
+
+  for (size_t i = 0; i < prefixes.size(); ++i) {
+    // the key is greater than any existing key with the same prefix.
+    auto key = prefixes[i] + "9";
+    hash_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
+
+    ASSERT_OK(hash_iter->status());
+    if (i == prefixes.size() - 1) {
+      // last key
+      ASSERT_TRUE(!hash_iter->Valid());
+    } else {
+      ASSERT_TRUE(hash_iter->Valid());
+      // seek the first element in the block
+      ASSERT_EQ(upper_bound[i], hash_iter->key().ToString());
+      ASSERT_EQ("v", hash_iter->value().ToString());
+    }
+  }
+
+  // find keys with prefix that don't match any of the existing prefixes.
+  std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
+  for (const auto& prefix : non_exist_prefixes) {
+    hash_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
+    // regular_iter->Seek(prefix);
+
+    ASSERT_OK(hash_iter->status());
+    ASSERT_TRUE(!hash_iter->Valid());
+  }
+}
+
 // It's very hard to figure out the index block size of a block accurately.
 // To make sure we get the index size, we just make sure as key number
 // grows, the filter block size also grows.
--- a/tools/db_repl_stress.cc
+++ b/tools/db_repl_stress.cc
@ -87,10 +87,10 @@ DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run"
              "(in MB)");

 int main(int argc, const char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+  GFLAGS::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
    " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
    " --wal_size_limit_MB=<WAL_size_limit_MB>");
-  google::ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+  GFLAGS::ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);

  Env* env = Env::Default();
  std::string default_db_path;
--- a/tools/db_stress.cc
+++ b/tools/db_stress.cc
@ -61,7 +61,7 @@ static bool ValidateUint32Range(const char* flagname, uint64_t value) {
 }
 DEFINE_uint64(seed, 2341234, "Seed for PRNG");
 static const bool FLAGS_seed_dummy =
-  google::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+  GFLAGS::RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);

 DEFINE_int64(max_key, 1 * KB * KB * KB,
             "Max number of key/values to place in database");
@ -171,7 +171,7 @@ static bool ValidateInt32Positive(const char* flagname, int32_t value) {
 }
 DEFINE_int32(reopen, 10, "Number of times database reopens");
 static const bool FLAGS_reopen_dummy =
-  google::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+  GFLAGS::RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);

 DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
             "Negative means use default settings.");
@ -199,7 +199,7 @@ DEFINE_int32(kill_random_test, 0,
             "If non-zero, kill at various points in source code with "
             "probability 1/this");
 static const bool FLAGS_kill_random_test_dummy =
-  google::RegisterFlagValidator(&FLAGS_kill_random_test,
+  GFLAGS::RegisterFlagValidator(&FLAGS_kill_random_test,
                                &ValidateInt32Positive);
 extern int rocksdb_kill_odds;

@ -227,32 +227,32 @@ static bool ValidateInt32Percent(const char* flagname, int32_t value) {
 DEFINE_int32(readpercent, 10,
             "Ratio of reads to total workload (expressed as a percentage)");
 static const bool FLAGS_readpercent_dummy =
-  google::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+  GFLAGS::RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);

 DEFINE_int32(prefixpercent, 20,
             "Ratio of prefix iterators to total workload (expressed as a"
             " percentage)");
 static const bool FLAGS_prefixpercent_dummy =
-  google::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+  GFLAGS::RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);

 DEFINE_int32(writepercent, 45,
             " Ratio of deletes to total workload (expressed as a percentage)");
 static const bool FLAGS_writepercent_dummy =
-  google::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+  GFLAGS::RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);

 DEFINE_int32(delpercent, 15,
             "Ratio of deletes to total workload (expressed as a percentage)");
 static const bool FLAGS_delpercent_dummy =
-  google::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+  GFLAGS::RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);

 DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
             " (expressed as a percentage)");
 static const bool FLAGS_iterpercent_dummy =
-  google::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+  GFLAGS::RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);

 DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
 static const bool FLAGS_num_iterations_dummy =
-  google::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+  GFLAGS::RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);

 DEFINE_bool(disable_seek_compaction, false,
            "Option to disable compation triggered by read.");
@ -292,18 +292,18 @@ static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();

 DEFINE_uint64(ops_per_thread, 600000, "Number of operations per thread.");
 static const bool FLAGS_ops_per_thread_dummy =
-  google::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+  GFLAGS::RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);

 DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
 static const bool FLAGS_log2_keys_per_lock_dummy =
-  google::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
+  GFLAGS::RegisterFlagValidator(&FLAGS_log2_keys_per_lock,
                                &ValidateUint32Range);

 DEFINE_int32(purge_redundant_percent, 50,
             "Percentage of times we want to purge redundant keys in memory "
             "before flushing");
 static const bool FLAGS_purge_redundant_percent_dummy =
-  google::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
+  GFLAGS::RegisterFlagValidator(&FLAGS_purge_redundant_percent,
                                &ValidateInt32Percent);

 DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
@ -340,7 +340,7 @@ static bool ValidatePrefixSize(const char* flagname, int32_t value) {
 }
 DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
 static const bool FLAGS_prefix_size_dummy =
-  google::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+  GFLAGS::RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);

 DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
            "that behaves like a Put");
@ -1502,9 +1502,9 @@ class StressTest {


 int main(int argc, char** argv) {
-  google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+  GFLAGS::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                          " [OPTIONS]...");
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  if (FLAGS_statistics) {
    dbstats = rocksdb::CreateDBStatistics();
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@ -164,7 +164,7 @@ TEST(BloomTest, VaryingLengths) {
 }  // namespace rocksdb

 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  return rocksdb::test::RunAllTests();
 }
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@ -196,7 +196,7 @@ TEST(DynamicBloomTest, perf) {
 }  // namespace rocksdb

 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);

  return rocksdb::test::RunAllTests();
 }
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@ -761,6 +761,10 @@ class PosixWritableFile : public WritableFile {
  }

  virtual Status Sync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    if (pending_sync_ && fdatasync(fd_) < 0) {
      return IOError(filename_, errno);
@ -771,6 +775,10 @@ class PosixWritableFile : public WritableFile {
  }

  virtual Status Fsync() {
+    Status s = Flush();
+    if (!s.ok()) {
+      return s;
+    }
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    if (pending_fsync_ && fsync(fd_) < 0) {
      return IOError(filename_, errno);
--- a/util/env_test.cc
+++ b/util/env_test.cc
@ -290,7 +290,6 @@ TEST(EnvPosixTest, AllocateTest) {
  // allocate 100 MB
  size_t kPreallocateSize = 100 * 1024 * 1024;
  size_t kBlockSize = 512;
-  size_t kPageSize = 4096;
  std::string data = "test";
  wfile->SetPreallocationBlockSize(kPreallocateSize);
  ASSERT_OK(wfile->Append(Slice(data)));
@ -299,8 +298,9 @@ TEST(EnvPosixTest, AllocateTest) {
  struct stat f_stat;
  stat(fname.c_str(), &f_stat);
  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
+  auto st_blocks = f_stat.st_blocks;
  // verify that blocks are preallocated
-  ASSERT_EQ((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks);
+  ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks);

  // close the file, should deallocate the blocks
  wfile.reset();
@ -308,8 +308,7 @@ TEST(EnvPosixTest, AllocateTest) {
  stat(fname.c_str(), &f_stat);
  ASSERT_EQ((unsigned int)data.size(), f_stat.st_size);
  // verify that preallocated blocks were deallocated on file close
-  size_t data_blocks_pages = ((data.size() + kPageSize - 1) / kPageSize);
-  ASSERT_EQ((unsigned int)(data_blocks_pages * kPageSize / kBlockSize), f_stat.st_blocks);
+  ASSERT_LT(f_stat.st_blocks, st_blocks);
 }
 #endif

--- a/util/thread_local.h
+++ b/util/thread_local.h
@ -10,6 +10,7 @@
 #pragma once

 #include <atomic>
+#include <functional>
 #include <memory>
 #include <unordered_map>
 #include <vector>
Author	SHA1	Message	Date
sdong	7a8cee8fa3	Disable warning as error	2019-10-31 15:17:49 -07:00
sdong	b142adf735	[FB Internal] Point to the latest tool chain.	2019-10-31 15:17:49 -07:00
sdong	97b4abe820	One extra include<functional>	2019-10-31 15:16:56 -07:00
sdong	eba2962e51	An odd fix for GCC 7	2019-10-31 15:16:56 -07:00
sdong	b133eb7234	Disable warning as error	2019-10-31 15:16:50 -07:00
sdong	a8a4ad4f03	Add some include<functional>	2019-10-31 15:16:33 -07:00
sdong	59fb551386	[FB Internal] Point to the latest tool chain.	2019-10-31 15:16:33 -07:00
sdong	30922aac96	fb internal tool chain upgrade to gcc-5	2018-06-06 16:14:21 -07:00
sdong	47a8228881	[FB Internal] use gcc-5	2017-07-17 15:31:03 -07:00
sdong	0f1692cdad	fb internal: Should also use GCC 4.8.1 for CentOS 7	2016-10-13 13:48:00 -07:00
sdong	eb96dc003a	Fix a bug in IterKey Summary: IterKey set buffer_size_ to a wrong initial value, causing it to always allocate values from heap instead of stack if the key size is smaller. Fix it. Test Plan: make all check Reviewers: haobo, ljin Reviewed By: haobo CC: igor, dhruba, yhchiang, leveldb Differential Revision: https://reviews.facebook.net/D18279	2014-04-23 19:45:58 -07:00
sdong	8db376d494	Fix a signed/unsigned comparison in plain_table_db_test Summary: Test Plan: Reviewers: CC: Task ID: # Blame Rev:	2014-04-23 16:09:45 -07:00
sdong	73895c9478	Expose number of entries in mem tables to users Summary: In this patch, two new DB properties are defined: rocksdb.num-immutable-mem-table and rocksdb.num-entries-imm-mem-tables, from where number of entries in mem tables can be exposed to users Test Plan: Cover the codes in db_test make all check Reviewers: haobo, ljin, igor Reviewed By: igor CC: nkg-, igor, yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D18207 Conflicts: db/db_test.cc	2014-04-22 22:38:44 -07:00
sdong	7b37f0d5af	PlainTableReader to expose index size to users Summary: This is a temporary solution to expose index sizes to users from PlainTableReader before we persist them to files. In this patch, the memory consumption of indexes used by PlainTableReader will be reported as two user-defined properties, so that users can monitor them. Test Plan: Add a unit test. make all check Reviewers: haobo, ljin Reviewed By: haobo CC: nkg-, yhchiang, igor, ljin, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D18195	2014-04-22 19:31:12 -07:00
Igor Canadi	8cd08fdca4	Fix allocate test in 2.8.fb.trunk. Merging: `c7076a7a05` and `05c168658e` into 2.8.fb.trunk	2014-04-21 18:26:53 -07:00
Igor Canadi	ce068c09dd	Flush before Sync()	2014-04-21 17:47:48 -07:00
Igor Canadi	4b7b1949d4	Remove occurrences of kBlockBasedTableWithWholeKeyHashIndex	2014-04-21 12:00:31 -07:00
Igor Canadi	9e04ce7645	Don't execute WholeKeyPrefix test	2014-04-21 11:23:47 -07:00
sdong	13dc9c7f56	Use a different approach to make sure BlockBasedTableReader can use hash index on older files Summary: A recent commit `e37dd216f9` makes sure hash index can be used when reading existing files. This patch uses another way to achieve the approach: (1) Currently, always writing kBinarySearch to files, despite of BlockBasedTableOptions.IndexType setting. (2) When reading a file, read out the field, and make sure it is kBinarySearch, while always use index type by users. The reason for doing it is, to reserve kHashSearch property on disk to future. If now we write out binary index for both of kHashSearch and kBinarySearch. We have to use a new flag in the future for hash index on disk, otherwise compatibility would break. Also, we want the real index type and type shown in properties block to be consistent. Test Plan: make all check Reviewers: haobo, kailiu Reviewed By: kailiu CC: igor, ljin, yhchiang, xjin, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D18009 Conflicts: table/block_based_table_reader.cc	2014-04-18 14:46:44 -07:00
sdong	034b494774	Fix bugs introduced by D17961 Summary: D17961 has two bugs: (1) two level iterator fails to populate FileMetaData.table_reader, causing performance regression. (2) table cache handle the !status.ok() case in the wrong place, causing seg fault which shouldn't happen. Test Plan: make all check Reviewers: ljin, igor, haobo Reviewed By: ljin CC: yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D17991 Conflicts: db/version_set.cc	2014-04-18 14:40:58 -07:00
sdong	d705755e52	Minimize accessing multiple objects in Version::Get() Summary: One of our profiles shows that Version::Get() is sometimes slow when getting pointers to user comparators or other global objects. In this patch: (1) we keep pointers to immutable objects in Version to avoid accessing them through option objects or cfd objects (2) table_reader is directly cached in FileMetaData so that the table cache doesn't have to go through a handle first to fetch it (3) If level 0 has less than 3 files, skip the filtering logic based on SST tables' key range. The smallest and largest keys are stored in separate memory locations, which can cause cache misses Test Plan: make all check Reviewers: haobo, ljin Reviewed By: haobo CC: igor, yhchiang, nkg-, leveldb Differential Revision: https://reviews.facebook.net/D17739 Conflicts: db/db_impl.cc db/db_impl_readonly.cc db/table_cache.cc db/version_edit.h db/version_set.cc db/version_set.h	2014-04-18 14:27:06 -07:00
sdong	26f36347d0	RocksDB 2.8 to be able to read files generated by 2.6 Summary: From 2.6 to 2.7, property block name is renamed from rocksdb.stats to rocksdb.properties. Older properties were not able to be loaded. In 2.8, we seem to have added some logic that uses property block without checking null pointers, which create segment faults. In this patch, we fix it by: (1) try rocksdb.stats if rocksdb.properties is not found (2) add some null checking before consuming rep->table_properties Test Plan: make sure a file generated in 2.7 couldn't be opened now can be opened. Reviewers: haobo, igor, yhchiang Reviewed By: igor CC: ljin, xjin, dhruba, kailiu, leveldb Differential Revision: https://reviews.facebook.net/D17961	2014-04-17 10:17:12 -07:00
sdong	0f7daf5fb4	Miss one file from previous cherry-pick Summary: Test Plan: Reviewers: CC: Task ID: # Blame Rev:	2014-04-15 20:20:01 -07:00
Kai Liu	22f396798e	Enable hash index for block-based table Summary: Based on previous patches, this diff eventually provides the end-to-end mechanism for users to specify the hash-index. Test Plan: Wrote several new unit tests. Reviewers: sdong, haobo, dhruba Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D16539 Use shortened index key for hash-index Summary: I was wrong about the "index builder": right now, since we create the index by scanning both the whole table and the index, there is no need to preserve the whole key as the index key. I switched back to the original way of indexing, which is both space-efficient and able to support on-the-fly construction of the hash index. In this patch, I made minimal changes since I'm not sure if we still need the "pluggable index builder"; under current circumstances it is of no use and kind of over-engineered. But I'm not sure if we can still exploit its usefulness in the future; otherwise I think I can just burn them with great vengeance. Test Plan: unit tests Reviewers: sdong, haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D17745 Conflicts: table/block_based_table_reader.cc table/block_based_table_reader.h	2014-04-15 20:13:34 -07:00
sdong	258eac1772	Polish IterKey and use it in DBImpl::ProcessKeyValueCompaction() Summary: 1. Polish IterKey a little bit. 2. Turn to use it in local parameter of current_user_key in DBImpl::ProcessKeyValueCompaction(). Our profile showing that DBImpl::ProcessKeyValueCompaction() has about 14% costs in std::string (the base including reading and writing data but excluding compaction filtering), which is higher than it should be. There are two std::string used in DBImpl::ProcessKeyValueCompaction(), compaction_filter_value and current_user_key and it's hard to distinguish the two. Test Plan: make all check Reviewers: haobo, ljin Reviewed By: haobo CC: igor, yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D17613 Conflicts: db/db_impl.cc	2014-04-09 20:54:16 -07:00
kailiu	c80c3f3b05	Make the block-based table's index pluggable Summary: This patch introduces a new table option that allows us to make the block-based table's index pluggable. To support that new feature: * Code has been refactored to be more flexible and to support this option well. * More documentation is added for the existing obscure functionality. * Major surgery on DataBlockReader(), where the logic was really convoluted. * Other small code cleanups. The pluggability will mostly affect development of internal modules and won't change frequently; as a result I intentionally avoid heavy-weight patterns (like factory) and try to make it simple. Test Plan: make all check Reviewers: haobo, sdong Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D16395 Conflicts: table/block_based_table_reader.cc table/block_based_table_reader.h	2014-04-09 11:34:13 -07:00
sdong	14534f0d7b	PlainTableBuilder::Add() to use local char array instead of reused std::string as tmp buffer Summary: Our profile shows that in one of the applications, 5% of the CPU costs of PlainTableBuilder::Add() are spent on std::string stacks. By this simple change, we avoid this global reusable string. Also, we avoid another call of file appending, which probably gives another 2%. Test Plan: make all check Reviewers: haobo, ljin Reviewed By: haobo CC: igor, yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D17601	2014-04-09 10:33:15 -07:00
sdong	b405cb886b	PlainTableIterator not to store copied key in std::string Summary: Move PlainTableIterator's copied key from std::string local buffer to avoid paying the extra costs in std::string related to sharing. Reuse the same buffer class in DbIter. Move the class to dbformat.h. This patch improves iterator performance significantly. Running this benchmark: ./table_reader_bench --num_keys2=17 --iterator --plain_table --time_unit=nanosecond The average latency is improved to about 750 nanoseconds from 1100 nanoseconds. Test Plan: Add a unit test. make all check Reviewers: haobo, ljin Reviewed By: haobo CC: igor, yhchiang, dhruba, leveldb Differential Revision: https://reviews.facebook.net/D17547	2014-04-07 22:18:09 -07:00