Resolving merge conflict

2014-09-23 16:00:54 -07:00 · 2014-09-23 16:00:54 -07:00 · ba6d660f6d
commit ba6d660f6d
parent 51eeaf65e2 0a29ce5393
191 changed files with 10589 additions and 4492 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -14,7 +14,6 @@ before_install:
 - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
 # Lousy hack to disable use and testing of fallocate, which doesn't behave quite
 # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
- - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform
-script: make check -j8
+script: OPT=-DTRAVIS make check -j8
 notifications:
    email: false
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,10 +1,34 @@
 # Rocksdb Change Log

-### Unreleased
+## Unreleased (will be released with 3.6)
+### Disk format changes
+* If you're using RocksDB on ARM platforms with the default bloom filter, there is a disk format change you need to be aware of. When you upgrade to the new release, follow these three steps: 1. turn off the filter policy, 2. compact the whole database, 3. turn the filter policy back on.
+
+### Behavior changes
+* We have refactored our system of stalling writes. The meanings of all stall-related statistics have changed: instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
+* When disableDataSync=true, we no longer sync the MANIFEST file.
+* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
+* Change target_file_size_base type to uint64_t from int.
+
+----- Past Releases -----
+
+## 3.5.0 (9/3/2014)
+### New Features
+* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it.
+* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
+  no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer.
+* Remove deprecated options: disable_seek_compaction and db_stats_log_interval
+* OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache.
+
+### Public API changes
+* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
+
+## 3.4.0 (8/18/2014)
 ### New Features
 * Support Multiple DB paths in universal style compactions
 * Add feature of storing plain table index and bloom filter in SST file.
 * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
+* Added iterate_upper_bound to define the extent up to which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out of the iterator anyway. This may improve performance in case there are a large number of delete markers or overwritten entries.

 ### Public API changes
 * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size
--- a/Makefile
+++ b/Makefile
@ -3,7 +3,6 @@
 # found in the LICENSE file. See the AUTHORS file for names of contributors.

 # Inherit some settings from environment variables, if available
-INSTALL_PATH ?= $(CURDIR)

 #-----------------------------------------------

@ -49,6 +48,27 @@ else
 	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
 endif

+#-------------------------------------------------
+# make install related stuff
+INSTALL_PATH ?= /usr/local
+
+# Remove what `make install` placed under $(INSTALL_PATH). Each library
+# removal is skipped when its variable expands to nothing, because
+# `rm -rf $(INSTALL_PATH)/lib/` would otherwise delete the whole lib directory.
+uninstall:
+	@rm -rf $(INSTALL_PATH)/include/rocksdb
+	@[ -z "$(LIBRARY)" ] || rm -rf $(INSTALL_PATH)/lib/$(LIBRARY)
+	@[ -z "$(SHARED)" ] || rm -rf $(INSTALL_PATH)/lib/$(SHARED)
+
+# Copy the public headers to $(INSTALL_PATH)/include/rocksdb and whichever of
+# the static/shared libraries already exist to $(INSTALL_PATH)/lib.
+# The find pattern is quoted so the shell passes it to find unexpanded (an
+# unquoted *.h is globbed against the current directory first). The library
+# names are quoted so an empty variable makes the -e test fail cleanly instead
+# of running install without a source operand.
+install:
+	@install -d $(INSTALL_PATH)/lib
+	@for header_dir in `find "include/rocksdb" -type d`; do \
+		install -d $(INSTALL_PATH)/$$header_dir; \
+	done
+	@for header in `find "include/rocksdb" -type f -name '*.h'`; do \
+		install -C -m 644 $$header $(INSTALL_PATH)/$$header; \
+	done
+	@[ ! -e "$(LIBRARY)" ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib
+	@[ ! -e "$(SHARED)" ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib
+#-------------------------------------------------
+
 WARNING_FLAGS = -Wall -Werror -Wsign-compare
 CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
 CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
@ -90,12 +110,14 @@ TESTS = \
 	blob_store_test \
 	filelock_test \
 	filename_test \
-	filter_block_test \
+	block_based_filter_block_test \
+	full_filter_block_test \
 	histogram_test \
 	log_test \
 	manual_compaction_test \
 	memenv_test \
 	merge_test \
+	merger_test \
 	redis_test \
 	reduce_levels_test \
 	plain_table_db_test \
@ -111,17 +133,18 @@ TESTS = \
 	version_edit_test \
 	version_set_test \
 	file_indexer_test \
-	write_batch_test\
+	write_batch_test \
+	write_controller_test\
 	deletefile_test \
 	table_test \
 	thread_local_test \
 	geodb_test \
 	rate_limiter_test \
-	cuckoo_table_builder_test \
 	options_test \
 	cuckoo_table_builder_test \
 	cuckoo_table_reader_test \
-	cuckoo_table_db_test
+	cuckoo_table_db_test \
+	write_batch_with_index_test

 TOOLS = \
        sst_dump \
@ -132,7 +155,7 @@ TOOLS = \
  options_test \
 	blob_store_bench

-PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
+PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS)

 # The library name is configurable since we are maintaining libraries of both
 # debug/release mode.
@ -175,7 +198,7 @@ endif  # PLATFORM_SHARED_EXT

 .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
 	release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
-	dbg rocksdbjavastatic rocksdbjava
+	dbg rocksdbjavastatic rocksdbjava install uninstall

 all: $(LIBRARY) $(PROGRAMS) $(TESTS)

@ -264,6 +287,9 @@ $(LIBRARY): $(LIBOBJECTS)
 db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
 	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)

+cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
 block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	 $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -375,6 +401,9 @@ spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNE
 ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)

+write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
 dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

@ -387,8 +416,11 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
 filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

-filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
-	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

 log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -417,9 +449,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
 write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)

+merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)

 RocksDB is developed and maintained by Facebook Database Engineering Team.
-It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
+It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
 and Jeff Dean (jeff@google.com)

 This code is a library that forms the core building block for a fast
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@ -46,7 +46,7 @@ PLATFORM_CXXFLAGS="-std=c++11"
 COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"

 # Default to fbcode gcc on internal fb machines
-if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
    FBCODE_BUILD="true"
    if [ -z "$USE_CLANG" ]; then
        CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \
    --threads=32 \
    --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram

+# measure fillseq with bunch of column families
+./db_bench \
+    --benchmarks=fillseq \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$NUM \
+    --writes=$NUM \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq_lots_column_families
+
+# measure overwrite performance with bunch of column families
+./db_bench \
+    --benchmarks=overwrite \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite_lots_column_families

 # send data to ods
 function send_to_ods {
@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr
 send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
 send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
 send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
+send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
+send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families
--- a/db/builder.cc
+++ b/db/builder.cc
@ -26,21 +26,24 @@ namespace rocksdb {

 class TableFactory;

-TableBuilder* NewTableBuilder(const Options& options,
+TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions,
                              const InternalKeyComparator& internal_comparator,
                              WritableFile* file,
-                              CompressionType compression_type) {
-  return options.table_factory->NewTableBuilder(options, internal_comparator,
-                                                file, compression_type);
+                              const CompressionType compression_type,
+                              const CompressionOptions& compression_opts) {
+  return ioptions.table_factory->NewTableBuilder(
+      ioptions, internal_comparator, file, compression_type, compression_opts);
 }

-Status BuildTable(const std::string& dbname, Env* env, const Options& options,
-                  const EnvOptions& soptions, TableCache* table_cache,
+Status BuildTable(const std::string& dbname, Env* env,
+                  const ImmutableCFOptions& ioptions,
+                  const EnvOptions& env_options, TableCache* table_cache,
                  Iterator* iter, FileMetaData* meta,
                  const InternalKeyComparator& internal_comparator,
                  const SequenceNumber newest_snapshot,
                  const SequenceNumber earliest_seqno_in_memtable,
                  const CompressionType compression,
+                  const CompressionOptions& compression_opts,
                  const Env::IOPriority io_priority) {
  Status s;
  meta->fd.file_size = 0;
@ -50,23 +53,24 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
  // If the sequence number of the smallest entry in the memtable is
  // smaller than the most recent snapshot, then we do not trigger
  // removal of duplicate/deleted keys as part of this builder.
-  bool purge = options.purge_redundant_kvs_while_flush;
+  bool purge = ioptions.purge_redundant_kvs_while_flush;
  if (earliest_seqno_in_memtable <= newest_snapshot) {
    purge = false;
  }

-  std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(),
+  std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
                                    meta->fd.GetPathId());
  if (iter->Valid()) {
    unique_ptr<WritableFile> file;
-    s = env->NewWritableFile(fname, &file, soptions);
+    s = env->NewWritableFile(fname, &file, env_options);
    if (!s.ok()) {
      return s;
    }
    file->SetIOPriority(io_priority);

-    TableBuilder* builder =
-        NewTableBuilder(options, internal_comparator, file.get(), compression);
+    TableBuilder* builder = NewTableBuilder(
+        ioptions, internal_comparator, file.get(),
+        compression, compression_opts);

    // the first key is the smallest key
    Slice key = iter->key();
@ -75,8 +79,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
    meta->largest_seqno = meta->smallest_seqno;

    MergeHelper merge(internal_comparator.user_comparator(),
-                      options.merge_operator.get(), options.info_log.get(),
-                      options.min_partial_merge_operands,
+                      ioptions.merge_operator, ioptions.info_log,
+                      ioptions.min_partial_merge_operands,
                      true /* internal key corruption is not ok */);

    if (purge) {
@ -196,12 +200,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
    delete builder;

    // Finish and check for file errors
-    if (s.ok() && !options.disableDataSync) {
-      if (options.use_fsync) {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+    if (s.ok() && !ioptions.disable_data_sync) {
+      if (ioptions.use_fsync) {
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
        s = file->Fsync();
      } else {
-        StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS);
+        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
        s = file->Sync();
      }
    }
@ -211,7 +215,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,

    if (s.ok()) {
      // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), soptions,
+      Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
                                              internal_comparator, meta->fd);
      s = it->status();
      delete it;
--- a/db/builder.h
+++ b/db/builder.h
@ -11,6 +11,7 @@
 #include "rocksdb/status.h"
 #include "rocksdb/types.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"

 namespace rocksdb {

@ -26,8 +27,10 @@ class TableBuilder;
 class WritableFile;

 extern TableBuilder* NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type);
+    const ImmutableCFOptions& options,
+    const InternalKeyComparator& internal_comparator,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts);

 // Build a Table file from the contents of *iter.  The generated file
 // will be named according to number specified in meta. On success, the rest of
@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder(
 // If no data is present in *iter, meta->file_size will be set to
 // zero, and no Table file will be produced.
 extern Status BuildTable(const std::string& dbname, Env* env,
-                         const Options& options, const EnvOptions& soptions,
+                         const ImmutableCFOptions& options,
+                         const EnvOptions& env_options,
                         TableCache* table_cache, Iterator* iter,
                         FileMetaData* meta,
                         const InternalKeyComparator& internal_comparator,
                         const SequenceNumber newest_snapshot,
                         const SequenceNumber earliest_seqno_in_memtable,
                         const CompressionType compression,
+                         const CompressionOptions& compression_opts,
                         const Env::IOPriority io_priority = Env::IO_HIGH);

 }  // namespace rocksdb
--- a/db/c.cc
+++ b/db/c.cc
@ -55,6 +55,7 @@ using rocksdb::MergeOperator;
 using rocksdb::NewBloomFilterPolicy;
 using rocksdb::NewLRUCache;
 using rocksdb::Options;
+using rocksdb::BlockBasedTableOptions;
 using rocksdb::RandomAccessFile;
 using rocksdb::Range;
 using rocksdb::ReadOptions;
@ -81,6 +82,7 @@ struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
 struct rocksdb_readoptions_t     { ReadOptions       rep; };
 struct rocksdb_writeoptions_t    { WriteOptions      rep; };
 struct rocksdb_options_t         { Options           rep; };
+struct rocksdb_block_based_table_options_t  { BlockBasedTableOptions rep; };
 struct rocksdb_seqfile_t         { SequentialFile*   rep; };
 struct rocksdb_randomfile_t      { RandomAccessFile* rep; };
 struct rocksdb_writablefile_t    { WritableFile*     rep; };
@ -116,7 +118,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter {
      const Slice& existing_value,
      std::string* new_value,
      bool* value_changed) const {
-    char* c_new_value = NULL;
+    char* c_new_value = nullptr;
    size_t new_value_length = 0;
    unsigned char c_value_changed = 0;
    unsigned char result = (*filter_)(
@ -1053,6 +1055,74 @@ const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
  return b->rep.Data().c_str();
 }

+rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create() {
+  return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options) {
+  delete options;
+}
+
+void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size) {
+  options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+  options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+  options->rep.block_restart_interval = block_restart_interval;
+}
+
+// Installs filter_policy on the table options, or clears the current policy
+// when NULL is passed. The pointer is handed to filter_policy.reset(), so the
+// options object takes over ownership; callers should not destroy it again.
+void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy) {
+  options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache) {
+  options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache) {
+  if (block_cache) {
+    options->rep.block_cache = block_cache->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed) {
+  if (block_cache_compressed) {
+    options->rep.block_cache_compressed = block_cache_compressed->rep;
+  }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t *opt,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (table_options) {
+    opt->rep.table_factory.reset(
+        rocksdb::NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+
 rocksdb_options_t* rocksdb_options_create() {
  return new rocksdb_options_t;
 }
@ -1067,8 +1137,8 @@ void rocksdb_options_increase_parallelism(
 }

 void rocksdb_options_optimize_for_point_lookup(
-    rocksdb_options_t* opt) {
-  opt->rep.OptimizeForPointLookup();
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
+  opt->rep.OptimizeForPointLookup(block_cache_size_mb);
 }

 void rocksdb_options_optimize_level_style_compaction(
@ -1111,12 +1181,6 @@ void rocksdb_options_set_compaction_filter_factory_v2(
  opt->rep.compaction_filter_factory_v2 = std::shared_ptr<CompactionFilterFactoryV2>(compaction_filter_factory_v2);
 }

-void rocksdb_options_set_filter_policy(
-    rocksdb_options_t* opt,
-    rocksdb_filterpolicy_t* policy) {
-  opt->rep.filter_policy = policy;
-}
-
 void rocksdb_options_set_create_if_missing(
    rocksdb_options_t* opt, unsigned char v) {
  opt->rep.create_if_missing = v;
@ -1160,26 +1224,6 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
  opt->rep.max_open_files = n;
 }

-void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
-  if (c) {
-    opt->rep.block_cache = c->rep;
-  }
-}
-
-void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) {
-  if (c) {
-    opt->rep.block_cache_compressed = c->rep;
-  }
-}
-
-void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
-  opt->rep.block_size = s;
-}
-
-void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
-  opt->rep.block_restart_interval = n;
-}
-
 void rocksdb_options_set_target_file_size_base(
    rocksdb_options_t* opt, uint64_t n) {
  opt->rep.target_file_size_base = n;
@ -1272,11 +1316,6 @@ void rocksdb_options_set_prefix_extractor(
  opt->rep.prefix_extractor.reset(prefix_extractor);
 }

-void rocksdb_options_set_whole_key_filtering(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.whole_key_filtering = v;
-}
-
 void rocksdb_options_set_disable_data_sync(
    rocksdb_options_t* opt, int disable_data_sync) {
  opt->rep.disableDataSync = disable_data_sync;
@ -1287,11 +1326,6 @@ void rocksdb_options_set_use_fsync(
  opt->rep.use_fsync = use_fsync;
 }

-void rocksdb_options_set_db_stats_log_interval(
-    rocksdb_options_t* opt, int db_stats_log_interval) {
-  opt->rep.db_stats_log_interval = db_stats_log_interval;
-}
-
 void rocksdb_options_set_db_log_dir(
    rocksdb_options_t* opt, const char* db_log_dir) {
  opt->rep.db_log_dir = db_log_dir;
@ -1351,11 +1385,6 @@ void rocksdb_options_set_stats_dump_period_sec(
  opt->rep.stats_dump_period_sec = v;
 }

-void rocksdb_options_set_block_size_deviation(
-    rocksdb_options_t* opt, int v) {
-  opt->rep.block_size_deviation = v;
-}
-
 void rocksdb_options_set_advise_random_on_open(
    rocksdb_options_t* opt, unsigned char v) {
  opt->rep.advise_random_on_open = v;
@ -1450,11 +1479,6 @@ void rocksdb_options_set_max_manifest_file_size(
  opt->rep.max_manifest_file_size = v;
 }

-void rocksdb_options_set_no_block_cache(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.no_block_cache = v;
-}
-
 void rocksdb_options_set_table_cache_numshardbits(
    rocksdb_options_t* opt, int v) {
  opt->rep.table_cache_numshardbits = v;
@ -1474,10 +1498,6 @@ void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int di
  opt->rep.disable_auto_compactions = disable;
 }

-void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
-  opt->rep.disable_seek_compaction = disable;
-}
-
 void rocksdb_options_set_delete_obsolete_files_period_micros(
    rocksdb_options_t* opt, uint64_t v) {
  opt->rep.delete_obsolete_files_period_micros = v;
@ -1824,6 +1844,13 @@ void rocksdb_readoptions_set_snapshot(
  opt->rep.snapshot = (snap ? snap->rep : nullptr);
 }

+// Sets the upper bound for forward iteration to the keylen bytes at key, or
+// clears the bound when key is NULL. The key bytes are referenced through a
+// Slice rather than copied, so the caller must keep them alive while the read
+// options are in use.
+// NOTE(review): rocksdb_readoptions_destroy is not visible in this hunk; it
+// should also delete iterate_upper_bound, otherwise the last bound set here
+// is leaked.
+void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t* opt,
+    const char* key, size_t keylen) {
+  // ReadOptions keeps only a pointer to the bound, so the Slice has to outlive
+  // this call; the address of a local Slice would dangle as soon as we return.
+  // Release the bound installed by a previous call before replacing it.
+  delete opt->rep.iterate_upper_bound;
+  opt->rep.iterate_upper_bound =
+      (key != nullptr) ? new Slice(key, keylen) : nullptr;
+}
+
 void rocksdb_readoptions_set_read_tier(
    rocksdb_readoptions_t* opt, int v) {
  opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);
--- a/db/c_test.c
+++ b/db/c_test.c
@ -335,6 +335,7 @@ int main(int argc, char** argv) {
  rocksdb_cache_t* cache;
  rocksdb_env_t* env;
  rocksdb_options_t* options;
+  rocksdb_block_based_table_options_t* table_options;
  rocksdb_readoptions_t* roptions;
  rocksdb_writeoptions_t* woptions;
  char* err = NULL;
@ -353,14 +354,15 @@ int main(int argc, char** argv) {
  options = rocksdb_options_create();
  rocksdb_options_set_comparator(options, cmp);
  rocksdb_options_set_error_if_exists(options, 1);
-  rocksdb_options_set_cache(options, cache);
  rocksdb_options_set_env(options, env);
  rocksdb_options_set_info_log(options, NULL);
  rocksdb_options_set_write_buffer_size(options, 100000);
  rocksdb_options_set_paranoid_checks(options, 1);
  rocksdb_options_set_max_open_files(options, 10);
-  rocksdb_options_set_block_size(options, 1024);
-  rocksdb_options_set_block_restart_interval(options, 8);
+  table_options = rocksdb_block_based_options_create();
+  rocksdb_block_based_options_set_block_cache(table_options, cache);
+  rocksdb_options_set_block_based_table_factory(options, table_options);
+
  rocksdb_options_set_compression(options, rocksdb_no_compression);
  rocksdb_options_set_compression_options(options, -14, -1, 0);
  int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
@ -540,10 +542,12 @@ int main(int argc, char** argv) {
      policy = rocksdb_filterpolicy_create_bloom(10);
    }

+    rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
    // Create new database
    rocksdb_close(db);
    rocksdb_destroy_db(options, dbname, &err);
-    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
    db = rocksdb_open(options, dbname, &err);
    CheckNoError(err);
    rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
@ -565,8 +569,9 @@ int main(int argc, char** argv) {
      CheckGet(db, roptions, "foo", "foovalue");
      CheckGet(db, roptions, "bar", "barvalue");
    }
-    rocksdb_options_set_filter_policy(options, NULL);
-    rocksdb_filterpolicy_destroy(policy);
+    // Reset the policy
+    rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+    rocksdb_options_set_block_based_table_factory(options, table_options);
  }

  StartPhase("compaction_filter");
@ -757,8 +762,7 @@ int main(int argc, char** argv) {
  StartPhase("prefix");
  {
    // Create new database
-    rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10);
-    rocksdb_options_set_filter_policy(options, policy);
+    rocksdb_options_set_allow_mmap_reads(options, 1);
    rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
@ -795,13 +799,13 @@ int main(int argc, char** argv) {
    rocksdb_iter_get_error(iter, &err);
    CheckNoError(err);
    rocksdb_iter_destroy(iter);
-    rocksdb_filterpolicy_destroy(policy);
  }


  StartPhase("cleanup");
  rocksdb_close(db);
  rocksdb_options_destroy(options);
+  rocksdb_block_based_options_destroy(table_options);
  rocksdb_readoptions_destroy(roptions);
  rocksdb_writeoptions_destroy(woptions);
  rocksdb_cache_destroy(cache);
--- a/db/column_family.cc
+++ b/db/column_family.cc
@ -9,6 +9,11 @@

 #include "db/column_family.h"

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <vector>
 #include <string>
 #include <algorithm>
@ -19,11 +24,43 @@
 #include "db/internal_stats.h"
 #include "db/compaction_picker.h"
 #include "db/table_properties_collector.h"
+#include "db/write_controller.h"
 #include "util/autovector.h"
 #include "util/hash_skiplist_rep.h"
+#include "util/options_helper.h"

 namespace rocksdb {

+namespace {
+// This function computes the amount of time in microseconds by which a write
+// should be delayed based on the number of level-0 files according to the
+// following formula:
+// if n >= top, return 1000;
+// if n < bottom, return 0;
+// otherwise, let r = (n - bottom) /
+//                    (top - bottom)
+//  and return max(r^2 * 1000, 100), i.e. never less than 100 once n >= bottom.
+// The goal of this formula is to gradually increase the rate at which writes
+// are slowed. We also tried linear delay (r * 1000), but it seemed to do
+// slightly worse. There is no other particular reason for choosing quadratic.
+uint64_t SlowdownAmount(int n, double bottom, double top) {
+  uint64_t delay;
+  if (n >= top) {
+    delay = 1000;
+  } else if (n < bottom) {
+    delay = 0;
+  } else {
+    // If we are here, we know that:
+    //   bottom <= n < top
+    // since the previous two conditions are false.
+    double how_much = static_cast<double>(n - bottom) / (top - bottom);
+    delay = std::max(how_much * how_much * 1000, 100.0);
+  }
+  assert(delay <= 1000);
+  return delay;
+}
+}  // namespace
+
 ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
                                               DBImpl* db, port::Mutex* mutex)
    : cfd_(cfd), db_(db), mutex_(mutex) {
@ -49,12 +86,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {

 uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }

+const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
+  return cfd()->user_comparator();
+}
+
 ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
-                                    const InternalFilterPolicy* ipolicy,
                                    const ColumnFamilyOptions& src) {
  ColumnFamilyOptions result = src;
  result.comparator = icmp;
-  result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
 #ifdef OS_MACOSX
  // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
  ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
@ -70,13 +109,7 @@ ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
  result.min_write_buffer_number_to_merge =
      std::min(result.min_write_buffer_number_to_merge,
               result.max_write_buffer_number - 1);
-  if (result.block_cache == nullptr && !result.no_block_cache) {
-    result.block_cache = NewLRUCache(8 << 20);
-  }
  result.compression_per_level = src.compression_per_level;
-  if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
-    result.block_size_deviation = 0;
-  }
  if (result.max_mem_compaction_level >= result.num_levels) {
    result.max_mem_compaction_level = result.num_levels - 1;
  }
@ -184,9 +217,9 @@ void SuperVersionUnrefHandle(void* ptr) {

 ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
                                   Version* dummy_versions, Cache* table_cache,
-                                   const ColumnFamilyOptions& options,
+                                   const ColumnFamilyOptions& cf_options,
                                   const DBOptions* db_options,
-                                   const EnvOptions& storage_options,
+                                   const EnvOptions& env_options,
                                   ColumnFamilySet* column_family_set)
    : id_(id),
      name_(name),
@ -194,10 +227,10 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
      current_(nullptr),
      refs_(0),
      dropped_(false),
-      internal_comparator_(options.comparator),
-      internal_filter_policy_(options.filter_policy),
-      options_(*db_options, SanitizeOptions(&internal_comparator_,
-                                            &internal_filter_policy_, options)),
+      internal_comparator_(cf_options.comparator),
+      options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)),
+      ioptions_(options_),
+      mutable_cf_options_(options_),
      mem_(nullptr),
      imm_(options_.min_write_buffer_number_to_merge),
      super_version_(nullptr),
@ -206,7 +239,6 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
-      need_slowdown_for_num_level0_files_(false),
      column_family_set_(column_family_set) {
  Ref();

@ -214,7 +246,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
  if (dummy_versions != nullptr) {
    internal_stats_.reset(
        new InternalStats(options_.num_levels, db_options->env, this));
-    table_cache_.reset(new TableCache(&options_, storage_options, table_cache));
+    table_cache_.reset(new TableCache(ioptions_, env_options, table_cache));
    if (options_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(&options_, &internal_comparator_));
@ -287,57 +319,82 @@ ColumnFamilyData::~ColumnFamilyData() {
 }

+// Re-evaluates whether writes must be stopped or delayed because of this
+// column family and stores the resulting token in write_controller_token_.
+// Does nothing when there is no current version. Otherwise the conditions are
+// checked in the following order and only the first match takes effect:
+//   1. number of immutable memtables equals max_write_buffer_number -> stop
+//   2. level-0 file count >= level0_stop_writes_trigger             -> stop
+//   3. level-0 file count >= level0_slowdown_writes_trigger         -> delay
+//   4. max compaction score > hard_rate_limit (if > 1.0)   -> 1000us delay
+//   5. max compaction score > soft_rate_limit (if > 0.0)            -> delay
+// Each matching branch also records a statistic and writes an info log line.
+// If no condition matches, any token held so far is released.
 void ColumnFamilyData::RecalculateWriteStallConditions() {
-  need_wait_for_num_memtables_ =
-    (imm()->size() == options()->max_write_buffer_number - 1);
-
   if (current_ != nullptr) {
-    need_wait_for_num_level0_files_ =
-      (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger);
-  } else {
-    need_wait_for_num_level0_files_ = false;
-  }
+    const double score = current_->MaxCompactionScore();
+    const int max_level = current_->MaxCompactionScoreLevel();

-  RecalculateWriteStallRateLimitsConditions();
-}
+    auto write_controller = column_family_set_->write_controller_;

-void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() {
-  if (current_ != nullptr) {
-    exceeds_hard_rate_limit_ =
-        (options()->hard_rate_limit > 1.0 &&
-         current_->MaxCompactionScore() > options()->hard_rate_limit);
-
-    exceeds_soft_rate_limit_ =
-        (options()->soft_rate_limit > 0.0 &&
-         current_->MaxCompactionScore() > options()->soft_rate_limit);
-  } else {
-    exceeds_hard_rate_limit_ = false;
-    exceeds_soft_rate_limit_ = false;
+    if (imm()->size() == options_.max_write_buffer_number) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
+      Log(options_.info_log,
+          "[%s] Stopping writes because we have %d immutable memtables "
+          "(waiting for flush)",
+          name_.c_str(), imm()->size());
+    } else if (current_->NumLevelFiles(0) >=
+               options_.level0_stop_writes_trigger) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
+      Log(options_.info_log,
+          "[%s] Stopping writes because we have %d level-0 files",
+          name_.c_str(), current_->NumLevelFiles(0));
+    } else if (options_.level0_slowdown_writes_trigger >= 0 &&
+               current_->NumLevelFiles(0) >=
+                   options_.level0_slowdown_writes_trigger) {
+      uint64_t slowdown = SlowdownAmount(
+          current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger,
+          options_.level0_stop_writes_trigger);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
+      Log(options_.info_log,
+          "[%s] Stalling writes because we have %d level-0 files (%" PRIu64
+          "us)",
+          name_.c_str(), current_->NumLevelFiles(0), slowdown);
+    } else if (options_.hard_rate_limit > 1.0 &&
+               score > options_.hard_rate_limit) {
+      uint64_t kHardLimitSlowdown = 1000;
+      write_controller_token_ =
+          write_controller->GetDelayToken(kHardLimitSlowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown,
+                                            false);
+      Log(options_.info_log,
+          "[%s] Stalling writes because we hit hard limit on level %d. "
+          "(%" PRIu64 "us)",
+          name_.c_str(), max_level, kHardLimitSlowdown);
+    } else if (options_.soft_rate_limit > 0.0 &&
+               score > options_.soft_rate_limit) {
+      uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit,
+                                         options_.hard_rate_limit);
+      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true);
+      Log(options_.info_log,
+          "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
+          "us)",
+          name_.c_str(), max_level, slowdown);
+    } else {
+      write_controller_token_.reset();
+    }
   }
 }

 const EnvOptions* ColumnFamilyData::soptions() const {
-  return &(column_family_set_->storage_options_);
+  return &(column_family_set_->env_options_);
 }

-void ColumnFamilyData::SetCurrent(Version* current) {
-  current_ = current;
-  need_slowdown_for_num_level0_files_ =
-      (options_.level0_slowdown_writes_trigger >= 0 &&
-       current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
-}
+void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; }

-void ColumnFamilyData::CreateNewMemtable() {
+void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) {
  assert(current_ != nullptr);
  if (mem_ != nullptr) {
    delete mem_->Unref();
  }
-  mem_ = new MemTable(internal_comparator_, options_);
+  mem_ = new MemTable(internal_comparator_, ioptions_, moptions);
  mem_->Ref();
 }

 Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
  auto result = compaction_picker_->PickCompaction(current_, log_buffer);
-  RecalculateWriteStallRateLimitsConditions();
  return result;
 }

@ -434,7 +491,15 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {

 SuperVersion* ColumnFamilyData::InstallSuperVersion(
    SuperVersion* new_superversion, port::Mutex* db_mutex) {
+  db_mutex->AssertHeld();
+  return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
+}
+
+SuperVersion* ColumnFamilyData::InstallSuperVersion(
+    SuperVersion* new_superversion, port::Mutex* db_mutex,
+    const MutableCFOptions& mutable_cf_options) {
  new_superversion->db_mutex = db_mutex;
+  new_superversion->mutable_cf_options = mutable_cf_options;
  new_superversion->Init(mem_, imm_.current(), current_);
  SuperVersion* old_superversion = super_version_;
  super_version_ = new_superversion;
@ -470,19 +535,32 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
  }
 }

+// Asks GetMutableOptionsFromStrings to build a new MutableCFOptions from the
+// current mutable_cf_options_ and options_map. When that succeeds the result
+// replaces mutable_cf_options_ and true is returned; otherwise
+// mutable_cf_options_ is left as it was and false is returned.
+bool ColumnFamilyData::SetOptions(
+    const std::unordered_map<std::string, std::string>& options_map) {
+  MutableCFOptions updated_options;
+  if (!GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
+                                    &updated_options)) {
+    return false;
+  }
+  mutable_cf_options_ = updated_options;
+  return true;
+}
+
 ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
                                 const DBOptions* db_options,
-                                 const EnvOptions& storage_options,
-                                 Cache* table_cache)
+                                 const EnvOptions& env_options,
+                                 Cache* table_cache,
+                                 WriteController* write_controller)
    : max_column_family_(0),
      dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr,
                                      ColumnFamilyOptions(), db_options,
-                                      storage_options_, nullptr)),
+                                      env_options, nullptr)),
      default_cfd_cache_(nullptr),
      db_name_(dbname),
      db_options_(db_options),
-      storage_options_(storage_options),
+      env_options_(env_options),
      table_cache_(table_cache),
+      write_controller_(write_controller),
      spin_lock_(ATOMIC_FLAG_INIT) {
  // initialize linked list
  dummy_cfd_->prev_ = dummy_cfd_;
@ -547,7 +625,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
  assert(column_families_.find(name) == column_families_.end());
  ColumnFamilyData* new_cfd =
      new ColumnFamilyData(id, name, dummy_versions, table_cache_, options,
-                           db_options_, storage_options_, this);
+                           db_options_, env_options_, this);
  Lock();
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
@ -606,6 +684,11 @@ bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
    column_family_set_->Lock();
    current_ = column_family_set_->GetColumnFamily(column_family_id);
    column_family_set_->Unlock();
+    // TODO(icanadi) Maybe remove column family from the hash table when it's
+    // dropped?
+    if (current_ != nullptr && current_->IsDropped()) {
+      current_ = nullptr;
+    }
  }
  handle_.SetCFD(current_);
  return current_ != nullptr;
@ -631,4 +714,29 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
  return &handle_;
 }

+// If a column family is currently selected and its memtable reports that a
+// flush should be scheduled, hands the column family to the flush scheduler
+// and then marks the memtable's flush as scheduled.
+void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
+  if (current_ == nullptr) {
+    return;
+  }
+  if (!current_->mem()->ShouldScheduleFlush()) {
+    return;
+  }
+  flush_scheduler_->ScheduleFlush(current_);
+  current_->mem()->MarkFlushScheduled();
+}
+
+// Returns the ID of the column family behind the handle, or 0 when the handle
+// is null.
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+  if (column_family == nullptr) {
+    return 0;
+  }
+  auto impl = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return impl->GetID();
+}
+
+// Returns the user comparator of the column family behind the handle, or
+// nullptr when the handle is null.
+const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family) {
+  if (column_family == nullptr) {
+    return nullptr;
+  }
+  auto impl = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  return impl->user_comparator();
+}
+
 }  // namespace rocksdb
--- a/db/column_family.h
+++ b/db/column_family.h
@ -19,8 +19,11 @@
 #include "rocksdb/env.h"
 #include "db/memtable_list.h"
 #include "db/write_batch_internal.h"
+#include "db/write_controller.h"
 #include "db/table_cache.h"
 #include "util/thread_local.h"
+#include "db/flush_scheduler.h"
+#include "util/mutable_cf_options.h"

 namespace rocksdb {

@ -46,6 +49,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
  // destroy without mutex
  virtual ~ColumnFamilyHandleImpl();
  virtual ColumnFamilyData* cfd() const { return cfd_; }
+  virtual const Comparator* user_comparator() const;

  virtual uint32_t GetID() const;

@ -78,6 +82,7 @@ struct SuperVersion {
  MemTable* mem;
  MemTableListVersion* imm;
  Version* current;
+  MutableCFOptions mutable_cf_options;
  std::atomic<uint32_t> refs;
  // We need to_delete because during Cleanup(), imm->Unref() returns
  // all memtables that we need to free through this vector. We then
@ -113,7 +118,6 @@ struct SuperVersion {
 };

 extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
-                                           const InternalFilterPolicy* ipolicy,
                                           const ColumnFamilyOptions& src);

 class ColumnFamilySet;
@ -133,7 +137,7 @@ class ColumnFamilyData {
  void Ref() { ++refs_; }
  // will just decrease reference count to 0, but will not delete it. returns
  // true if the ref count was decreased to zero. in that case, it can be
-  // deleted by the caller immediatelly, or later, by calling
+  // deleted by the caller immediately, or later, by calling
  // FreeDeadColumnFamilies()
  bool Unref() {
    assert(refs_ > 0);
@ -157,6 +161,7 @@ class ColumnFamilyData {
    // can't drop default CF
    assert(id_ != 0);
    dropped_ = true;
+    write_controller_token_.reset();
  }
  bool IsDropped() const { return dropped_; }

@ -169,6 +174,21 @@ class ColumnFamilyData {
  // thread-safe
  const Options* options() const { return &options_; }
  const EnvOptions* soptions() const;
+  const ImmutableCFOptions* ioptions() const { return &ioptions_; }
+  // REQUIRES: DB mutex held
+  // This returns the MutableCFOptions used by current SuperVersion
+  // You should use this API to reference MutableCFOptions most of the time.
+  // NOTE: super_version_ is dereferenced unconditionally, so this must not be
+  // called while super_version_ is still nullptr.
+  const MutableCFOptions* mutable_cf_options() const {
+    return &(super_version_->mutable_cf_options);
+  }
+  // REQUIRES: DB mutex held
+  // This returns the latest MutableCFOptions, which may be not in effect yet.
+  const MutableCFOptions* GetLatestMutableCFOptions() const {
+    return &mutable_cf_options_;
+  }
+  // REQUIRES: DB mutex held
+  bool SetOptions(
+      const std::unordered_map<std::string, std::string>& options_map);

  InternalStats* internal_stats() { return internal_stats_.get(); }

@ -178,7 +198,7 @@ class ColumnFamilyData {
  Version* dummy_versions() { return dummy_versions_; }
  void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
  void SetCurrent(Version* current);
-  void CreateNewMemtable();
+  void CreateNewMemtable(const MemTableOptions& moptions);

  TableCache* table_cache() const { return table_cache_.get(); }

@ -219,40 +239,20 @@ class ColumnFamilyData {
  // if its reference count is zero and needs deletion or nullptr if not
  // As argument takes a pointer to allocated SuperVersion to enable
  // the clients to allocate SuperVersion outside of mutex.
+  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
+                                    port::Mutex* db_mutex,
+                                    const MutableCFOptions& mutable_cf_options);
  SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
                                    port::Mutex* db_mutex);

  void ResetThreadLocalSuperVersions();

-  // A Flag indicating whether write needs to slowdown because of there are
-  // too many number of level0 files.
-  bool NeedSlowdownForNumLevel0Files() const {
-    return need_slowdown_for_num_level0_files_;
-  }
-
-  bool NeedWaitForNumLevel0Files() const {
-    return need_wait_for_num_level0_files_;
-  }
-
-  bool NeedWaitForNumMemtables() const {
-    return need_wait_for_num_memtables_;
-  }
-
-  bool ExceedsSoftRateLimit() const {
-    return exceeds_soft_rate_limit_;
-  }
-
-  bool ExceedsHardRateLimit() const {
-    return exceeds_hard_rate_limit_;
-  }
-
 private:
  friend class ColumnFamilySet;
  ColumnFamilyData(uint32_t id, const std::string& name,
                   Version* dummy_versions, Cache* table_cache,
                   const ColumnFamilyOptions& options,
-                   const DBOptions* db_options,
-                   const EnvOptions& storage_options,
+                   const DBOptions* db_options, const EnvOptions& env_options,
                   ColumnFamilySet* column_family_set);

  // Recalculate some small conditions, which are changed only during
@ -261,7 +261,6 @@ class ColumnFamilyData {
  // DBImpl::MakeRoomForWrite function to decide, if it need to make
  // a write stall
  void RecalculateWriteStallConditions();
-  void RecalculateWriteStallRateLimitsConditions();

  uint32_t id_;
  const std::string name_;
@ -272,9 +271,10 @@ class ColumnFamilyData {
  bool dropped_;               // true if client dropped it

  const InternalKeyComparator internal_comparator_;
-  const InternalFilterPolicy internal_filter_policy_;

-  Options const options_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
+  MutableCFOptions mutable_cf_options_;

  std::unique_ptr<TableCache> table_cache_;

@ -303,31 +303,13 @@ class ColumnFamilyData {
  // recovered from
  uint64_t log_number_;

-  // A flag indicating whether we should delay writes because
-  // we have too many level 0 files
-  bool need_slowdown_for_num_level0_files_;
-
-  // These 4 variables are updated only after compaction,
-  // adding new memtable, flushing memtables to files
-  // and/or add recalculation of compaction score.
-  // That's why theirs values are cached in ColumnFamilyData.
-  // Recalculation is made by RecalculateWriteStallConditions and
-  // RecalculateWriteStallRateLimitsConditions function. They are used
-  // in DBImpl::MakeRoomForWrite function to decide, if it need
-  // to sleep during write operation
-  bool need_wait_for_num_memtables_;
-
-  bool need_wait_for_num_level0_files_;
-
-  bool exceeds_hard_rate_limit_;
-
-  bool exceeds_soft_rate_limit_;
-
  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;

  ColumnFamilySet* column_family_set_;
+
+  std::unique_ptr<WriteControllerToken> write_controller_token_;
 };

 // ColumnFamilySet has interesting thread-safety requirements
@ -369,7 +351,8 @@ class ColumnFamilySet {
  };

  ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
-                  const EnvOptions& storage_options, Cache* table_cache);
+                  const EnvOptions& env_options, Cache* table_cache,
+                  WriteController* write_controller);
  ~ColumnFamilySet();

  ColumnFamilyData* GetDefault() const;
@ -422,8 +405,9 @@ class ColumnFamilySet {

  const std::string db_name_;
  const DBOptions* const db_options_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;
  Cache* table_cache_;
+  WriteController* write_controller_;
  std::atomic_flag spin_lock_;
 };

@ -431,8 +415,11 @@ class ColumnFamilySet {
 // memtables of different column families (specified by ID in the write batch)
 class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
 public:
-  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
-      : column_family_set_(column_family_set), current_(nullptr) {}
+  explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set,
+                                     FlushScheduler* flush_scheduler)
+      : column_family_set_(column_family_set),
+        current_(nullptr),
+        flush_scheduler_(flush_scheduler) {}

  // sets current_ to ColumnFamilyData with column_family_id
  // returns false if column family doesn't exist
@ -451,10 +438,18 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
  // Returns column family handle for the selected column family
  virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;

+  virtual void CheckMemtableFull() override;
+
 private:
  ColumnFamilySet* column_family_set_;
  ColumnFamilyData* current_;
+  FlushScheduler* flush_scheduler_;
  ColumnFamilyHandleInternal handle_;
 };

+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+    ColumnFamilyHandle* column_family);
+
 }  // namespace rocksdb
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
  WriteBatch batch;
+  batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
  batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
  ASSERT_OK(db_->Write(WriteOptions(), &batch));
  DropColumnFamilies({1});
+  WriteOptions woptions_ignore_missing_cf;
+  woptions_ignore_missing_cf.ignore_missing_column_families = true;
+  batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
+  ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+  ASSERT_EQ("column-family", Get(0, "still here"));
  Status s = db_->Write(WriteOptions(), &batch);
  ASSERT_TRUE(s.IsInvalidArgument());
  Close();
@ -746,9 +752,10 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
  default_cf.target_file_size_base = 30 << 10;
-  default_cf.filter_policy = nullptr;
-  default_cf.no_block_cache = true;
  default_cf.source_compaction_factor = 100;
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));

  one.compaction_style = kCompactionStyleUniversal;
  // trigger compaction if there are >= 4 files
--- a/db/compaction.cc
+++ b/db/compaction.cc
@ -9,7 +9,10 @@

 #include "db/compaction.h"

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <vector>

@ -110,8 +113,8 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
 }

 bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
-  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+  assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
    return bottommost_level_;
  }
  // Maybe use binary search to find right entry instead of linear search?
@ -174,8 +177,8 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {

 // Is this compaction producing files at the bottommost level?
 void Compaction::SetupBottomMostLevel(bool is_manual) {
-  assert(cfd_->options()->compaction_style != kCompactionStyleFIFO);
-  if (cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+  assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
    // If universal compaction style is used and manual
     // compaction is occurring, then we are guaranteed that
    // all files will be picked in a single compaction
@ -267,7 +270,7 @@ void Compaction::Summary(char* output, int len) {
 uint64_t Compaction::OutputFilePreallocationSize() {
  uint64_t preallocation_size = 0;

-  if (cfd_->options()->compaction_style == kCompactionStyleLevel) {
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
    preallocation_size =
        cfd_->compaction_picker()->MaxFileSizeForLevel(output_level());
  } else {
--- a/db/compaction_picker.cc
+++ b/db/compaction_picker.cc
@ -9,7 +9,10 @@

 #include "db/compaction_picker.h"

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <limits>
 #include "db/filename.h"
@ -39,13 +42,13 @@ CompressionType GetCompressionType(const Options& options, int level,
    return kNoCompression;
  }
   // If the user has specified a different compression level for each level,
-  // then pick the compresison for that level.
+  // then pick the compression for that level.
  if (!options.compression_per_level.empty()) {
    const int n = options.compression_per_level.size() - 1;
    // It is possible for level_ to be -1; in that case, we use level
    // 0's compression.  This occurs mostly in backwards compatibility
    // situations when the builder doesn't know what level the file
-    // belongs to.  Likewise, if level_ is beyond the end of the
+    // belongs to.  Likewise, if level is beyond the end of the
    // specified compression levels, use the last value.
    return options.compression_per_level[std::max(0, std::min(level, n))];
  } else {
@ -173,9 +176,12 @@ void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
 }

 bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
+  assert(c != nullptr);
  // If inputs are empty then there is nothing to expand.
-  if (!c || c->inputs_[0].empty()) {
-    return true;
+  if (c->inputs_[0].empty()) {
+    assert(c->inputs_[1].empty());
+    // This isn't a good compaction
+    return false;
  }

  // GetOverlappingInputs will always do the right thing for level-0.
@ -427,7 +433,7 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version,
    level = version->compaction_level_[i];
    if ((version->compaction_score_[i] >= 1)) {
      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
-      if (ExpandWhileOverlapping(c) == false) {
+      if (c == nullptr || ExpandWhileOverlapping(c) == false) {
        delete c;
        c = nullptr;
      } else {
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@ -45,7 +45,9 @@ class CorruptionTest {

    db_ = nullptr;
    options_.create_if_missing = true;
-    options_.block_size_deviation = 0; // make unit test pass for now
+    BlockBasedTableOptions table_options;
+    table_options.block_size_deviation = 0;  // make unit test pass for now
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
    Reopen();
    options_.create_if_missing = false;
  }
@ -60,9 +62,11 @@ class CorruptionTest {
    db_ = nullptr;
    Options opt = (options ? *options : options_);
    opt.env = &env_;
-    opt.block_cache = tiny_cache_;
-    opt.block_size_deviation = 0;
    opt.arena_block_size = 4096;
+    BlockBasedTableOptions table_options;
+    table_options.block_cache = tiny_cache_;
+    table_options.block_size_deviation = 0;
+    opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
    return DB::Open(opt, dbname_, &db_);
  }

@ -328,6 +332,9 @@ TEST(CorruptionTest, CorruptedDescriptor) {
 }

 TEST(CorruptionTest, CompactionInputError) {
+  Options options;
+  options.max_background_flushes = 0;
+  Reopen(&options);
  Build(10);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
  dbi->TEST_FlushMemTable();
@ -347,6 +354,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
  options.paranoid_checks = true;
  options.write_buffer_size = 131072;
  options.max_write_buffer_number = 2;
+  options.max_background_flushes = 0;
  Reopen(&options);
  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

--- a/db/cuckoo_table_db_test.cc
+++ b/db/cuckoo_table_db_test.cc
@ -131,8 +131,6 @@ TEST(CuckooTableDBTest, Flush) {
  ASSERT_EQ("v2", Get("key2"));
  ASSERT_EQ("v3", Get("key3"));
  ASSERT_EQ("NOT_FOUND", Get("key4"));
-  ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("somelongkey"));
-  ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("s"));

  // Now add more keys and flush.
  ASSERT_OK(Put("key4", "v4"));
@ -195,6 +193,38 @@ static std::string Key(int i) {
  snprintf(buf, sizeof(buf), "key_______%06d", i);
  return std::string(buf);
 }
+// Returns an 8-byte key holding the in-memory bytes of i (host byte order).
+static std::string Uint64Key(uint64_t i) {
+  const char* const raw_bytes = reinterpret_cast<const char*>(&i);
+  return std::string(raw_bytes, 8);
+}
+}  // namespace.
+
+// Exercises Put/Get/Delete with 8-byte Uint64Key keys on a DB reopened with
+// test::Uint64Comparator(), checking the visible values after each of two
+// memtable flushes.
+TEST(CuckooTableDBTest, Uint64Comparator) {
+  Options options = CurrentOptions();
+  options.comparator = test::Uint64Comparator();
+  Reopen(&options);
+
+  ASSERT_OK(Put(Uint64Key(1), "v1"));
+  ASSERT_OK(Put(Uint64Key(2), "v2"));
+  ASSERT_OK(Put(Uint64Key(3), "v3"));
+  dbfull()->TEST_FlushMemTable();
+
+  // Keys 1-3 were written before the first flush; key 4 was never written.
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("v2", Get(Uint64Key(2)));
+  ASSERT_EQ("v3", Get(Uint64Key(3)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+  // Add more keys.
+  ASSERT_OK(Delete(Uint64Key(2)));  // Delete.
+  ASSERT_OK(Put(Uint64Key(3), "v0"));  // Update.
+  ASSERT_OK(Put(Uint64Key(4), "v4"));
+  dbfull()->TEST_FlushMemTable();
+  // After the second flush the delete, the update and the new key must all be
+  // visible, while the untouched key 1 keeps its value.
+  ASSERT_EQ("v1", Get(Uint64Key(1)));
+  ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+  ASSERT_EQ("v0", Get(Uint64Key(3)));
+  ASSERT_EQ("v4", Get(Uint64Key(4)));
 }

 TEST(CuckooTableDBTest, CompactionTrigger) {
@ -215,14 +245,38 @@ TEST(CuckooTableDBTest, CompactionTrigger) {
    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
  }
  dbfull()->TEST_WaitForFlushMemTable();
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_EQ("2", FilesPerLevel());

+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
  ASSERT_EQ("0,2", FilesPerLevel());
  for (int idx = 0; idx < 22; ++idx) {
    ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
  }
 }

+TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+  // Create a big L0 file and check it compacts into multiple files in L1.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 270 << 10;
+  // Two SST files should be created, each containing 14 keys.
+  // Number of buckets will be 16. Total size ~156 KB.
+  options.target_file_size_base = 160 << 10;
+  Reopen(&options);
+
+  // Write 28 values, each 10016 B ~ 10KB
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  // Expect exactly one file in level 0 before the manual compaction.
+  ASSERT_EQ("1", FilesPerLevel());
+
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  // Expect level 0 to be empty and level 1 to hold two files afterwards.
+  ASSERT_EQ("0,2", FilesPerLevel());
+  // Every value written above must still be readable after the compaction.
+  for (int idx = 0; idx < 28; ++idx) {
+    ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
+  }
+}
+
 TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
  // Insert same key twice so that they go to different SST files. Then wait for
  // compaction and check if the latest value is stored and old value removed.
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -7,7 +7,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif

 #ifndef GFLAGS
 #include <cstdio>
@ -37,8 +39,8 @@ int main() {
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/slice.h"
+#include "rocksdb/filter_policy.h"
 #include "rocksdb/slice_transform.h"
-#include "rocksdb/statistics.h"
 #include "rocksdb/perf_context.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
@ -146,6 +148,7 @@ DEFINE_int64(merge_keys, -1,
             "Number of distinct keys to use for MergeRandom and "
             "ReadRandomMergeRandom. "
             "If negative, there will be FLAGS_num keys.");
+DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");

 DEFINE_int64(reads, -1, "Number of read operations to do.  "
             "If negative, do FLAGS_num reads.");
@ -162,6 +165,7 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."

 DEFINE_int32(value_size, 100, "Size of each value");

+DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");

 static bool ValidateKeySize(const char* flagname, int32_t value) {
  return true;
@ -238,10 +242,11 @@ DEFINE_int32(universal_compression_size_percent, -1,
 DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
             "data. Negative means use default settings.");

-DEFINE_int32(block_size, rocksdb::Options().block_size,
+DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size,
             "Number of bytes in a block.");

-DEFINE_int32(block_restart_interval, rocksdb::Options().block_restart_interval,
+DEFINE_int32(block_restart_interval,
+             rocksdb::BlockBasedTableOptions().block_restart_interval,
             "Number of keys between restart points "
             "for delta encoding of keys.");

@ -302,7 +307,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");

 DEFINE_int32(num_levels, 7, "The total number of levels");

-DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1");
+DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1");

 DEFINE_int32(target_file_size_multiplier, 1,
             "A multiplier to compute target level-N file size (N >= 2)");
@ -509,6 +514,9 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
             "i.e. use the prefix comes with the generated random number.");
 DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
            "threads' IO priority");
+DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
+            "table becomes an identity function. This is only valid when key "
+            "is 8 bytes");

 enum RepFactory {
  kSkipList,
@ -548,7 +556,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
 DEFINE_bool(use_hash_search, false, "if use kHashSearch "
            "instead of kBinarySearch. "
            "This is valid if only we use BlockTable");
-
+DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
+            "instead of kFullFilter for filter block. "
+            "This is valid if only we use BlockTable");
 DEFINE_string(merge_operator, "", "The merge operator to use with the database."
              "If a new merge operator is specified, be sure to use fresh"
              " database The possible merge operators are defined in"
@ -843,12 +853,19 @@ class Duration {

 class Benchmark {
 private:
-  shared_ptr<Cache> cache_;
-  shared_ptr<Cache> compressed_cache_;
-  const FilterPolicy* filter_policy_;
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> compressed_cache_;
+  std::shared_ptr<const FilterPolicy> filter_policy_;
  const SliceTransform* prefix_extractor_;
-  DB* db_;
-  std::vector<DB*> multi_dbs_;
+  // Pairs a DB pointer with the column family handles that belong to it.
+  // `cfh` is only populated when the database is opened with more than one
+  // column family; otherwise it stays empty and only `db` is used.
+  struct DBWithColumnFamilies {
+    std::vector<ColumnFamilyHandle*> cfh;
+    DB* db;
+    // Starts with no handles and no database.
+    DBWithColumnFamilies() : cfh(), db(nullptr) {}
+  };
+  DBWithColumnFamilies db_;
+  std::vector<DBWithColumnFamilies> multi_dbs_;
  int64_t num_;
  int value_size_;
  int key_size_;
@ -1064,11 +1081,10 @@ class Benchmark {
           (FLAGS_cache_numshardbits >= 1 ?
            NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
            NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
-    filter_policy_(FLAGS_bloom_bits >= 0
-                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
-                   : nullptr),
+    filter_policy_(FLAGS_bloom_bits >= 0 ?
+        NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter)
+        : nullptr),
    prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
-    db_(nullptr),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
    key_size_(FLAGS_key_size),
@ -1099,8 +1115,9 @@ class Benchmark {
  }

  ~Benchmark() {
-    delete db_;
-    delete filter_policy_;
+    std::for_each(db_.cfh.begin(), db_.cfh.end(),
+                  [](ColumnFamilyHandle* cfh) { delete cfh; });
+    delete db_.db;
    delete prefix_extractor_;
  }

@ -1159,6 +1176,16 @@ class Benchmark {
    return base_name + std::to_string(id);
  }

+  // Returns the name used for the i-th column family. Index 0 maps to
+  // kDefaultColumnFamilyName; every other index maps to
+  // "column_family_name_" followed by the index zero-padded to six digits.
+  std::string ColumnFamilyName(int i) {
+    if (i == 0) {
+      return kDefaultColumnFamilyName;
+    }
+    char buf[100];
+    snprintf(buf, sizeof(buf), "column_family_name_%06d", i);
+    return std::string(buf);
+  }
+
  void Run() {
    if (!SanityCheck()) {
      exit(1);
@ -1313,13 +1340,16 @@ class Benchmark {
                  name.ToString().c_str());
          method = nullptr;
        } else {
-          if (db_ != nullptr) {
-            delete db_;
-            db_ = nullptr;
+          if (db_.db != nullptr) {
+            std::for_each(db_.cfh.begin(), db_.cfh.end(),
+                          [](ColumnFamilyHandle* cfh) { delete cfh; });
+            delete db_.db;
+            db_.db = nullptr;
+            db_.cfh.clear();
            DestroyDB(FLAGS_db, Options());
          }
          for (size_t i = 0; i < multi_dbs_.size(); i++) {
-            delete multi_dbs_[i];
+            delete multi_dbs_[i].db;
            DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options());
          }
          multi_dbs_.clear();
@ -1491,7 +1521,7 @@ class Benchmark {

  void Compress(ThreadState *thread) {
    RandomGenerator gen;
-    Slice input = gen.Generate(Options().block_size);
+    Slice input = gen.Generate(FLAGS_block_size);
    int64_t bytes = 0;
    int64_t produced = 0;
    bool ok = true;
@ -1541,7 +1571,7 @@ class Benchmark {

  void Uncompress(ThreadState *thread) {
    RandomGenerator gen;
-    Slice input = gen.Generate(Options().block_size);
+    Slice input = gen.Generate(FLAGS_block_size);
    std::string compressed;

    bool ok;
@ -1617,14 +1647,10 @@ class Benchmark {
  }

  void Open() {
-    assert(db_ == nullptr);
+    assert(db_.db == nullptr);
    Options options;
    options.create_if_missing = !FLAGS_use_existing_db;
-    options.block_cache = cache_;
-    options.block_cache_compressed = compressed_cache_;
-    if (cache_ == nullptr) {
-      options.no_block_cache = true;
-    }
+    options.create_missing_column_families = FLAGS_num_column_families > 1;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
    options.min_write_buffer_number_to_merge =
@ -1632,13 +1658,17 @@ class Benchmark {
    options.max_background_compactions = FLAGS_max_background_compactions;
    options.max_background_flushes = FLAGS_max_background_flushes;
    options.compaction_style = FLAGS_compaction_style_e;
-    options.block_size = FLAGS_block_size;
-    options.block_restart_interval = FLAGS_block_restart_interval;
-    options.filter_policy = filter_policy_;
    if (FLAGS_prefix_size != 0) {
      options.prefix_extractor.reset(
          NewFixedPrefixTransform(FLAGS_prefix_size));
    }
+    if (FLAGS_use_uint64_comparator) {
+      options.comparator = test::Uint64Comparator();
+      if (FLAGS_key_size != 8) {
+        fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
+        exit(1);
+      }
+    }
    options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
    options.bloom_locality = FLAGS_bloom_locality;
    options.max_open_files = FLAGS_open_files;
@ -1712,8 +1742,11 @@ class Benchmark {
        fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
        exit(1);
      }
+      rocksdb::CuckooTableOptions table_options;
+      table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
+      table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
      options.table_factory = std::shared_ptr<TableFactory>(
-          NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio));
+          NewCuckooTableFactory(table_options));
    } else {
      BlockBasedTableOptions block_based_options;
      if (FLAGS_use_hash_search) {
@ -1726,6 +1759,14 @@ class Benchmark {
      } else {
        block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
      }
+      if (cache_ == nullptr) {
+        block_based_options.no_block_cache = true;
+      }
+      block_based_options.block_cache = cache_;
+      block_based_options.block_cache_compressed = compressed_cache_;
+      block_based_options.block_size = FLAGS_block_size;
+      block_based_options.block_restart_interval = FLAGS_block_restart_interval;
+      block_based_options.filter_policy = filter_policy_;
      options.table_factory.reset(
          NewBlockBasedTableFactory(block_based_options));
    }
@ -1816,10 +1857,9 @@ class Benchmark {
      OpenDb(options, FLAGS_db, &db_);
    } else {
      multi_dbs_.clear();
+      multi_dbs_.resize(FLAGS_num_multi_db);
      for (int i = 0; i < FLAGS_num_multi_db; i++) {
-        DB* db;
-        OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db);
-        multi_dbs_.push_back(db);
+        OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &multi_dbs_[i]);
      }
    }
    if (FLAGS_min_level_to_compress >= 0) {
@ -1827,12 +1867,27 @@ class Benchmark {
    }
  }

-  void OpenDb(Options options, std::string db_name, DB** db) {
+  void OpenDb(const Options& options, const std::string& db_name,
+      DBWithColumnFamilies* db) {
    Status s;
-    if(FLAGS_readonly) {
-      s = DB::OpenForReadOnly(options, db_name, db);
+    // Open with column families if necessary.
+    if (FLAGS_num_column_families > 1) {
+      db->cfh.resize(FLAGS_num_column_families);
+      std::vector<ColumnFamilyDescriptor> column_families;
+      for (int i = 0; i < FLAGS_num_column_families; i++) {
+        column_families.push_back(ColumnFamilyDescriptor(
+              ColumnFamilyName(i), ColumnFamilyOptions(options)));
+      }
+      if (FLAGS_readonly) {
+        s = DB::OpenForReadOnly(options, db_name, column_families,
+            &db->cfh, &db->db);
+      } else {
+        s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
+      }
+    } else if (FLAGS_readonly) {
+      s = DB::OpenForReadOnly(options, db_name, &db->db);
    } else {
-      s = DB::Open(options, db_name, db);
+      s = DB::Open(options, db_name, &db->db);
    }
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -1900,10 +1955,18 @@ class Benchmark {
  };

  DB* SelectDB(ThreadState* thread) {
-    if (db_ != nullptr) {
-      return db_;
-    } else {
-      return multi_dbs_[thread->rand.Next() % multi_dbs_.size()];
+    return SelectDBWithCfh(thread)->db;
+  }
+
+  // Draws the next value from the thread's random generator and uses it to
+  // pick a database via the integer overload. The draw happens on every
+  // call, regardless of which database ends up being returned.
+  DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
+    const uint64_t selector = thread->rand.Next();
+    return SelectDBWithCfh(selector);
+  }
+
+  DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
+    if (db_.db != nullptr) {
+      return &db_;
+    } else  {
+      return &multi_dbs_[rand_int % multi_dbs_.size()];
    }
  }

@ -1912,7 +1975,7 @@ class Benchmark {
    const int64_t num_ops = writes_ == 0 ? num_ : writes_;

    size_t num_key_gens = 1;
-    if (db_ == nullptr) {
+    if (db_.db == nullptr) {
      num_key_gens = multi_dbs_.size();
    }
    std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
@ -1935,20 +1998,25 @@ class Benchmark {
    Slice key = AllocateKey();
    std::unique_ptr<const char[]> key_guard(key.data());
    while (!duration.Done(entries_per_batch_)) {
-      size_t id = 0;
-      DB* db_to_write = db_;
-      if (db_to_write == nullptr) {
-        id = thread->rand.Next() % num_key_gens;
-        db_to_write = multi_dbs_[id];
-      }
+      size_t id = thread->rand.Next() % num_key_gens;
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
      batch.Clear();
      for (int64_t j = 0; j < entries_per_batch_; j++) {
-        GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key);
-        batch.Put(key, gen.Generate(value_size_));
+        int64_t rand_num = key_gens[id]->Next();
+        GenerateKeyFromInt(rand_num, FLAGS_num, &key);
+        if (FLAGS_num_column_families <= 1) {
+          batch.Put(key, gen.Generate(value_size_));
+        } else {
+          // We use same rand_num as seed for key and column family so that we
+          // can deterministically find the cfh corresponding to a particular
+          // key while reading the key.
+          batch.Put(db_with_cfh->cfh[rand_num % db_with_cfh->cfh.size()],
+              key, gen.Generate(value_size_));
+        }
        bytes += value_size_ + key_size_;
      }
-      s = db_to_write->Write(write_options_, &batch);
-      thread->stats.FinishedOps(db_to_write, entries_per_batch_);
+      s = db_with_cfh->db->Write(write_options_, &batch);
+      thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_);
      if (!s.ok()) {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
@ -1958,11 +2026,11 @@ class Benchmark {
  }

  void ReadSequential(ThreadState* thread) {
-    if (db_ != nullptr) {
-      ReadSequential(thread, db_);
+    if (db_.db != nullptr) {
+      ReadSequential(thread, db_.db);
    } else {
-      for (DB* db : multi_dbs_) {
-        ReadSequential(thread, db);
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadSequential(thread, db_with_cfh.db);
      }
    }
  }
@ -1981,11 +2049,11 @@ class Benchmark {
  }

  void ReadReverse(ThreadState* thread) {
-    if (db_ != nullptr) {
-      ReadReverse(thread, db_);
+    if (db_.db != nullptr) {
+      ReadReverse(thread, db_.db);
    } else {
-      for (DB* db : multi_dbs_) {
-        ReadReverse(thread, db);
+      for (const auto& db_with_cfh : multi_dbs_) {
+        ReadReverse(thread, db_with_cfh.db);
      }
    }
  }
@ -1996,7 +2064,7 @@ class Benchmark {
    int64_t bytes = 0;
    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
      bytes += iter->key().size() + iter->value().size();
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db, 1);
      ++i;
    }
    delete iter;
@ -2013,13 +2081,24 @@ class Benchmark {

    Duration duration(FLAGS_duration, reads_);
    while (!duration.Done(1)) {
-      DB* db = SelectDB(thread);
-      GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key);
+      DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
+      // We use same key_rand as seed for key and column family so that we can
+      // deterministically find the cfh corresponding to a particular key, as it
+      // is done in DoWrite method.
+      int64_t key_rand = thread->rand.Next() % FLAGS_num;
+      GenerateKeyFromInt(key_rand, FLAGS_num, &key);
      read++;
-      if (db->Get(options, key, &value).ok()) {
+      Status s;
+      if (FLAGS_num_column_families > 1) {
+        s = db_with_cfh->db->Get(options,
+            db_with_cfh->cfh[key_rand % db_with_cfh->cfh.size()], key, &value);
+      } else {
+        s = db_with_cfh->db->Get(options, key, &value);
+      }
+      if (s.ok()) {
        found++;
      }
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db_with_cfh->db, 1);
    }

    char msg[100];
@ -2061,6 +2140,7 @@ class Benchmark {
          ++found;
        }
      }
+      thread->stats.FinishedOps(db, entries_per_batch_);
    }
    for (auto& k : keys) {
      delete k.data();
@ -2099,11 +2179,11 @@ class Benchmark {

    Iterator* single_iter = nullptr;
    std::vector<Iterator*> multi_iters;
-    if (db_ != nullptr) {
-      single_iter = db_->NewIterator(options);
+    if (db_.db != nullptr) {
+      single_iter = db_.db->NewIterator(options);
    } else {
-      for (DB* db : multi_dbs_) {
-        multi_iters.push_back(db->NewIterator(options));
+      for (const auto& db_with_cfh : multi_dbs_) {
+        multi_iters.push_back(db_with_cfh.db->NewIterator(options));
      }
    }
    uint64_t last_refresh = FLAGS_env->NowMicros();
@ -2116,16 +2196,16 @@ class Benchmark {
      if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) {
        uint64_t now = FLAGS_env->NowMicros();
        if (now - last_refresh > (uint64_t)FLAGS_iter_refresh_interval_us) {
-          if (db_ != nullptr) {
+          if (db_.db != nullptr) {
            delete single_iter;
-            single_iter = db_->NewIterator(options);
+            single_iter = db_.db->NewIterator(options);
          } else {
            for (auto iter : multi_iters) {
              delete iter;
            }
            multi_iters.clear();
-            for (DB* db : multi_dbs_) {
-              multi_iters.push_back(db->NewIterator(options));
+            for (const auto& db_with_cfh : multi_dbs_) {
+              multi_iters.push_back(db_with_cfh.db->NewIterator(options));
            }
          }
        }
@ -2143,7 +2223,7 @@ class Benchmark {
      if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
        found++;
      }
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db_.db, 1);
    }
    delete single_iter;
    for (auto iter : multi_iters) {
@ -2243,7 +2323,7 @@ class Benchmark {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db_.db, 1);

      ++num_writes;
      if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
@ -2403,7 +2483,7 @@ class Benchmark {
        deletes_done++;
      }

-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db_.db, 1);
    }
    char msg[100];
    snprintf(msg, sizeof(msg),
@ -2542,7 +2622,7 @@ class Benchmark {
        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
        exit(1);
      }
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db, 1);
    }

    char msg[100];
@ -2578,7 +2658,7 @@ class Benchmark {
        fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
        exit(1);
      }
-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db, 1);
    }

    // Print some statistics
@ -2639,7 +2719,7 @@ class Benchmark {

      }

-      thread->stats.FinishedOps(db_, 1);
+      thread->stats.FinishedOps(db, 1);
    }

    char msg[100];
@ -2656,11 +2736,11 @@ class Benchmark {
  }

  void PrintStats(const char* key) {
-    if (db_ != nullptr) {
-      PrintStats(db_, key, false);
+    if (db_.db != nullptr) {
+      PrintStats(db_.db, key, false);
    }
-    for (DB* db : multi_dbs_) {
-      PrintStats(db, key, true);
+    for (const auto& db_with_cfh : multi_dbs_) {
+      PrintStats(db_with_cfh.db, key, true);
    }
  }

--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@ -9,7 +9,10 @@

 #ifndef ROCKSDB_LITE

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <string>
@ -29,9 +32,9 @@ Status DBImpl::DisableFileDeletions() {
  MutexLock l(&mutex_);
  ++disable_delete_obsolete_files_;
  if (disable_delete_obsolete_files_ == 1) {
-    Log(options_.info_log, "File Deletions Disabled");
+    Log(db_options_.info_log, "File Deletions Disabled");
  } else {
-    Log(options_.info_log,
+    Log(db_options_.info_log,
        "File Deletions Disabled, but already disabled. Counter: %d",
        disable_delete_obsolete_files_);
  }
@ -50,11 +53,11 @@ Status DBImpl::EnableFileDeletions(bool force) {
      --disable_delete_obsolete_files_;
    }
    if (disable_delete_obsolete_files_ == 0)  {
-      Log(options_.info_log, "File Deletions Enabled");
+      Log(db_options_.info_log, "File Deletions Enabled");
      should_purge_files = true;
      FindObsoleteFiles(deletion_state, true);
    } else {
-      Log(options_.info_log,
+      Log(db_options_.info_log,
          "File Deletions Enable, but not really enabled. Counter: %d",
          disable_delete_obsolete_files_);
    }
@ -62,10 +65,14 @@ Status DBImpl::EnableFileDeletions(bool force) {
  if (should_purge_files)  {
    PurgeObsoleteFiles(deletion_state);
  }
-  LogFlush(options_.info_log);
+  LogFlush(db_options_.info_log);
  return Status::OK();
 }

+// Returns the current value of disable_delete_obsolete_files_, the counter
+// that DisableFileDeletions() increments and EnableFileDeletions()
+// decrements. The result is therefore 0 when file deletions are enabled and
+// non-zero while they are disabled.
+// NOTE(review): the name reads like a boolean "is enabled", but the value is
+// the disable count -- confirm what callers expect before relying on it.
+// NOTE(review): the counter is read here without taking mutex_, unlike
+// DisableFileDeletions(); confirm that is intended.
+int DBImpl::IsFileDeletionsEnabled() const {
+  return disable_delete_obsolete_files_;
+}
+
 Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
                            uint64_t* manifest_file_size,
                            bool flush_memtable) {
@ -91,7 +98,7 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,

    if (!status.ok()) {
      mutex_.Unlock();
-      Log(options_.info_log, "Cannot Flush data %s\n",
+      Log(db_options_.info_log, "Cannot Flush data %s\n",
          status.ToString().c_str());
      return status;
    }
@ -129,7 +136,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
  Status s;
  // list wal files in main db dir.
  VectorLogPtr logs;
-  s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile);
+  s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
  if (!s.ok()) {
    return s;
  }
@ -142,7 +149,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {

  files.clear();
  // list wal files in archive dir.
-  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
  if (env_->FileExists(archivedir)) {
    s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
    if (!s.ok()) {
@ -153,7 +160,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
  uint64_t latest_archived_log_number = 0;
  if (!files.empty()) {
    latest_archived_log_number = files.back()->LogNumber();
-    Log(options_.info_log, "Latest Archived log: %" PRIu64,
+    Log(db_options_.info_log, "Latest Archived log: %" PRIu64,
        latest_archived_log_number);
  }

@ -166,7 +173,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
      // same log in both db dir and archived dir. Simply
      // ignore the one in db dir. Note that, if we read
      // archived dir first, we would have missed the log file.
-      Log(options_.info_log, "%s already moved to archive",
+      Log(db_options_.info_log, "%s already moved to archive",
          log->PathName().c_str());
    }
  }
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -30,7 +30,11 @@
 #include "util/autovector.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
+#include "util/scoped_arena_iterator.h"
 #include "db/internal_stats.h"
+#include "db/write_controller.h"
+#include "db/flush_scheduler.h"
+#include "db/write_thread.h"

 namespace rocksdb {

@ -108,6 +112,10 @@ class DBImpl : public DB {
                              bool reduce_level = false, int target_level = -1,
                              uint32_t target_path_id = 0);

+  using DB::SetOptions;
+  bool SetOptions(ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& options_map);
+
  using DB::NumberLevels;
  virtual int NumberLevels(ColumnFamilyHandle* column_family);
  using DB::MaxMemCompactionLevel;
@ -127,6 +135,7 @@ class DBImpl : public DB {
 #ifndef ROCKSDB_LITE
  virtual Status DisableFileDeletions();
  virtual Status EnableFileDeletions(bool force);
+  virtual int IsFileDeletionsEnabled() const;
  // All the returned filenames start with "/"
  virtual Status GetLiveFiles(std::vector<std::string>&,
                              uint64_t* manifest_file_size,
@ -172,8 +181,8 @@ class DBImpl : public DB {
  // Return an internal iterator over the current state of the database.
  // The keys of this iterator are internal keys (see format.h).
  // The returned iterator should be deleted when no longer needed.
-  Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family =
-                                         nullptr);
+  Iterator* TEST_NewInternalIterator(
+      Arena* arena, ColumnFamilyHandle* column_family = nullptr);

  // Return the maximum overlapping data (in bytes) at next level for any
  // file at a level >= 1.
@ -201,6 +210,17 @@ class DBImpl : public DB {
                              SequenceNumber* sequence);

  Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
+
+  void TEST_LockMutex();
+
+  void TEST_UnlockMutex();
+
+  // REQUIRES: mutex locked
+  void* TEST_BeginWrite();
+
+  // REQUIRES: mutex locked
+  // pass the pointer that you got from TEST_BeginWrite()
+  void TEST_EndWrite(void* w);
 #endif  // NDEBUG

  // Structure to store information for candidate files to delete.
@ -274,7 +294,7 @@ class DBImpl : public DB {
  // Returns the list of live files in 'live' and the list
  // of all files in the filesystem in 'candidate_files'.
  // If force == false and the last call was less than
-  // options_.delete_obsolete_files_period_micros microseconds ago,
+  // db_options_.delete_obsolete_files_period_micros microseconds ago,
  // it will not fill up the deletion_state
  void FindObsoleteFiles(DeletionState& deletion_state,
                         bool force,
@ -292,23 +312,21 @@ class DBImpl : public DB {
  Env* const env_;
  const std::string dbname_;
  unique_ptr<VersionSet> versions_;
-  const DBOptions options_;
+  const DBOptions db_options_;
  Statistics* stats_;

  Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
-                                SuperVersion* super_version,
-                                Arena* arena = nullptr);
+                                SuperVersion* super_version, Arena* arena);

 private:
  friend class DB;
  friend class InternalStats;
 #ifndef ROCKSDB_LITE
-  friend class TailingIterator;
  friend class ForwardIterator;
 #endif
  friend struct SuperVersion;
  struct CompactionState;
-  struct Writer;
+
  struct WriteContext;

  Status NewDB();
@ -332,8 +350,9 @@ class DBImpl : public DB {
                                   DeletionState& deletion_state,
                                   LogBuffer* log_buffer);

-  Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
-                        bool read_only);
+  // REQUIRES: log_numbers are sorted in ascending order
+  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+                         SequenceNumber* max_sequence, bool read_only);

  // The following two methods are used to flush a memtable to
   // storage. The first one is used at database recovery time (when the
@ -346,43 +365,13 @@ class DBImpl : public DB {
                          VersionEdit* edit, uint64_t* filenumber,
                          LogBuffer* log_buffer);

-  uint64_t SlowdownAmount(int n, double bottom, double top);
+  void DelayWrite(uint64_t expiration_time);

-  // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
-  // thread should grab the mutex_ and be the first on writers queue.
-  // BeginWrite is used for it.
-  // Be aware! Writer's job can be done by other thread (see DBImpl::Write
-  // for examples), so check it via w.done before applying changes.
-  //
-  // Writer* w:                writer to be placed in the queue
-  // uint64_t expiration_time: maximum time to be in the queue
-  // See also: EndWrite
-  Status BeginWrite(Writer* w, uint64_t expiration_time);
-
-  // After doing write job, we need to remove already used writers from
-  // writers_ queue and notify head of the queue about it.
-  // EndWrite is used for this.
-  //
-  // Writer* w:           Writer, that was added by BeginWrite function
-  // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
-  //                      does)
-  //                      we should pass last_writer as a parameter to
-  //                      EndWrite
-  //                      (if you don't touch other writers, just pass w)
-  // Status status:       Status of write operation
-  // See also: BeginWrite
-  void EndWrite(Writer* w, Writer* last_writer, Status status);
-
-  Status MakeRoomForWrite(ColumnFamilyData* cfd,
-                          WriteContext* context,
-                          uint64_t expiration_time);
+  Status ScheduleFlushes(WriteContext* context);

  Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
                                     WriteContext* context);

-  void BuildBatchGroup(Writer** last_writer,
-                       autovector<WriteBatch*>* write_batch_group);
-
  // Force current memtable contents to be flushed.
  Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);

@ -527,10 +516,13 @@ class DBImpl : public DB {

  std::unique_ptr<Directory> db_directory_;

-  // Queue of writers.
-  std::deque<Writer*> writers_;
+  WriteThread write_thread_;
+
  WriteBatch tmp_batch_;

+  WriteController write_controller_;
+  FlushScheduler flush_scheduler_;
+
  SnapshotList snapshots_;

  // cache for ReadFirstRecord() calls
@ -599,14 +591,10 @@ class DBImpl : public DB {
  bool flush_on_destroy_; // Used when disableWAL is true.

  static const int KEEP_LOG_FILE_NUM = 1000;
-  static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
  std::string db_absolute_path_;

-  // count of the number of contiguous delaying writes
-  int delayed_writes_;
-
  // The options to access storage files
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;

  // A value of true temporarily disables scheduling of background work
  bool bg_work_gate_closed_;
@ -621,9 +609,6 @@ class DBImpl : public DB {
  DBImpl(const DBImpl&);
  void operator=(const DBImpl&);

-  // dump the delayed_writes_ to the log file and reset counter.
-  void DelayLoggingAndReset();
-
  // Return the earliest snapshot where seqno is visible.
  // Store the snapshot right before that, if any, in prev_snapshot
  inline SequenceNumber findEarliestVisibleSnapshot(
@ -669,7 +654,6 @@ class DBImpl : public DB {
 // it is not equal to src.info_log.
 extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
-                               const InternalFilterPolicy* ipolicy,
                               const Options& src);
 extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);

--- a/db/db_impl_debug.cc
+++ b/db/db_impl_debug.cc
@ -20,7 +20,8 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() {
  return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
 }

-Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
+Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena,
+                                           ColumnFamilyHandle* column_family) {
  ColumnFamilyData* cfd;
  if (column_family == nullptr) {
    cfd = default_cf_handle_->cfd();
@ -33,7 +34,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
  mutex_.Unlock();
  ReadOptions roptions;
-  return NewInternalIterator(roptions, cfd, super_version);
+  return NewInternalIterator(roptions, cfd, super_version, arena);
 }

 int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
@ -129,5 +130,27 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
                                  SequenceNumber* sequence) {
  return ReadFirstLine(fname, sequence);
 }
+
+// Test hook: acquires the DB mutex (mutex_). Must be paired with
+// TEST_UnlockMutex().
+void DBImpl::TEST_LockMutex() {
+  mutex_.Lock();
+}
+
+// Test hook: releases the DB mutex (mutex_) previously taken with
+// TEST_LockMutex().
+void DBImpl::TEST_UnlockMutex() {
+  mutex_.Unlock();
+}
+
+// Test hook: heap-allocates a WriteThread::Writer constructed with &mutex_,
+// enters the write thread with it, and returns it as an opaque pointer. The
+// caller must hand that pointer back to TEST_EndWrite(), which deletes it.
+// REQUIRES: mutex locked
+void* DBImpl::TEST_BeginWrite() {
+  WriteThread::Writer* writer = new WriteThread::Writer(&mutex_);
+  Status enter_status = write_thread_.EnterWriteThread(writer, 0);
+  // No timeout and nobody should do our job
+  assert(enter_status.ok() && !writer->done);
+  return static_cast<void*>(writer);
+}
+
+// Test hook: leaves the write thread using the writer that TEST_BeginWrite()
+// returned, then frees that writer.
+// REQUIRES: mutex locked
+void DBImpl::TEST_EndWrite(void* w) {
+  WriteThread::Writer* const self = static_cast<WriteThread::Writer*>(w);
+  // The same writer is passed for both pointer arguments, with an OK status.
+  write_thread_.ExitWriteThread(self, self, Status::OK());
+  delete self;
+}
+
 }  // namespace rocksdb
 #endif  // ROCKSDB_LITE
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@ -16,7 +16,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
-#include <algorithm>
 #include "db/db_iter.h"
 #include "db/dbformat.h"
 #include "db/filename.h"
@ -42,17 +41,17 @@

 namespace rocksdb {

-DBImplReadOnly::DBImplReadOnly(const DBOptions& options,
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
                               const std::string& dbname)
-    : DBImpl(options, dbname) {
-  Log(options_.info_log, "Opening the db in read only mode");
+    : DBImpl(db_options, dbname) {
+  Log(db_options_.info_log, "Opening the db in read only mode");
 }

 DBImplReadOnly::~DBImplReadOnly() {
 }

 // Implementations of the DB interface
-Status DBImplReadOnly::Get(const ReadOptions& options,
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
                           ColumnFamilyHandle* column_family, const Slice& key,
                           std::string* value) {
  Status s;
@ -62,33 +61,34 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
  SuperVersion* super_version = cfd->GetSuperVersion();
  MergeContext merge_context;
  LookupKey lkey(key, snapshot);
-  if (super_version->mem->Get(lkey, value, &s, merge_context,
-                              *cfd->options())) {
+  if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
  } else {
-    super_version->current->Get(options, lkey, value, &s, &merge_context);
+    super_version->current->Get(read_options, lkey, value, &s, &merge_context);
  }
  return s;
 }

-Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options,
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
                                      ColumnFamilyHandle* column_family) {
  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
  auto cfd = cfh->cfd();
  SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
  SequenceNumber latest_snapshot = versions_->LastSequence();
  auto db_iter = NewArenaWrappedDbIterator(
-      env_, *cfd->options(), cfd->user_comparator(),
-      (options.snapshot != nullptr
-           ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-           : latest_snapshot));
-  auto internal_iter =
-      NewInternalIterator(options, cfd, super_version, db_iter->GetArena());
+      env_, *cfd->ioptions(), cfd->user_comparator(),
+      (read_options.snapshot != nullptr
+           ? reinterpret_cast<const SnapshotImpl*>(
+                read_options.snapshot)->number_
+           : latest_snapshot),
+      cfd->options()->max_sequential_skip_in_iterations);
+  auto internal_iter = NewInternalIterator(
+      read_options, cfd, super_version, db_iter->GetArena());
  db_iter->SetIterUnderDBIter(internal_iter);
  return db_iter;
 }

 Status DBImplReadOnly::NewIterators(
-    const ReadOptions& options,
+    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_families,
    std::vector<Iterator*>* iterators) {
  if (iterators == nullptr) {
@ -101,12 +101,14 @@ Status DBImplReadOnly::NewIterators(
  for (auto cfh : column_families) {
    auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
    auto db_iter = NewArenaWrappedDbIterator(
-        env_, *cfd->options(), cfd->user_comparator(),
-        options.snapshot != nullptr
-            ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
-            : latest_snapshot);
+        env_, *cfd->ioptions(), cfd->user_comparator(),
+        (read_options.snapshot != nullptr
+            ? reinterpret_cast<const SnapshotImpl*>(
+                  read_options.snapshot)->number_
+            : latest_snapshot),
+        cfd->options()->max_sequential_skip_in_iterations);
    auto internal_iter = NewInternalIterator(
-        options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena());
+        read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena());
    db_iter->SetIterUnderDBIter(internal_iter);
    iterators->push_back(db_iter);
  }
--- a/db/db_impl_readonly.h
+++ b/db/db_impl_readonly.h
@ -74,6 +74,8 @@ class DBImplReadOnly : public DBImpl {
                              uint32_t target_path_id = 0) override {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
+
+#ifndef ROCKSDB_LITE
  virtual Status DisableFileDeletions() override {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
@ -85,6 +87,8 @@ class DBImplReadOnly : public DBImpl {
                              bool flush_memtable = true) override {
    return Status::NotSupported("Not supported operation in read only mode.");
  }
+#endif  // ROCKSDB_LITE
+
  using DBImpl::Flush;
  virtual Status Flush(const FlushOptions& options,
                       ColumnFamilyHandle* column_family) override {
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@ -58,22 +58,25 @@ class DBIter: public Iterator {
    kReverse
  };

-  DBIter(Env* env, const Options& options, const Comparator* cmp,
-         Iterator* iter, SequenceNumber s, bool arena_mode)
+  DBIter(Env* env, const ImmutableCFOptions& ioptions,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s,
+         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+         const Slice* iterate_upper_bound = nullptr)
      : arena_mode_(arena_mode),
        env_(env),
-        logger_(options.info_log.get()),
+        logger_(ioptions.info_log),
        user_comparator_(cmp),
-        user_merge_operator_(options.merge_operator.get()),
+        user_merge_operator_(ioptions.merge_operator),
        iter_(iter),
        sequence_(s),
        direction_(kForward),
        valid_(false),
        current_entry_is_merged_(false),
-        statistics_(options.statistics.get()) {
+        statistics_(ioptions.statistics),
+        iterate_upper_bound_(iterate_upper_bound) {
    RecordTick(statistics_, NO_ITERATORS);
-    has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr);
-    max_skip_ = options.max_sequential_skip_in_iterations;
+    prefix_extractor_ = ioptions.prefix_extractor;
+    max_skip_ = max_sequential_skip_in_iterations;
  }
  virtual ~DBIter() {
    RecordTick(statistics_, NO_ITERATORS, -1);
@ -132,7 +135,7 @@ class DBIter: public Iterator {
    }
  }

-  bool has_prefix_extractor_;
+  const SliceTransform* prefix_extractor_;
  bool arena_mode_;
  Env* const env_;
  Logger* logger_;
@ -149,6 +152,7 @@ class DBIter: public Iterator {
  bool current_entry_is_merged_;
  Statistics* statistics_;
  uint64_t max_skip_;
+  const Slice* iterate_upper_bound_;

  // No copying allowed
  DBIter(const DBIter&);
@ -194,9 +198,8 @@ void DBIter::Next() {
 // NOTE: In between, saved_key_ can point to a user key that has
 //       a delete marker
 inline void DBIter::FindNextUserEntry(bool skipping) {
-  PERF_TIMER_AUTO(find_next_user_entry_time);
+  PERF_TIMER_GUARD(find_next_user_entry_time);
  FindNextUserEntryInternal(skipping);
-  PERF_TIMER_STOP(find_next_user_entry_time);
 }

 // Actual implementation of DBIter::FindNextUserEntry()
@ -208,36 +211,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
  uint64_t num_skipped = 0;
  do {
    ParsedInternalKey ikey;
-    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
-      if (skipping &&
-          user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
-        num_skipped++; // skip this entry
-        PERF_COUNTER_ADD(internal_key_skipped_count, 1);
-      } else {
-        skipping = false;
-        switch (ikey.type) {
-          case kTypeDeletion:
-            // Arrange to skip all upcoming entries for this key since
-            // they are hidden by this deletion.
-            saved_key_.SetKey(ikey.user_key);
-            skipping = true;
-            num_skipped = 0;
-            PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-            break;
-          case kTypeValue:
-            valid_ = true;
-            saved_key_.SetKey(ikey.user_key);
-            return;
-          case kTypeMerge:
-            // By now, we are sure the current ikey is going to yield a value
-            saved_key_.SetKey(ikey.user_key);
-            current_entry_is_merged_ = true;
-            valid_ = true;
-            MergeValuesNewToOld();  // Go to a different state machine
-            return;
-          default:
-            assert(false);
-            break;
+
+    if (ParseKey(&ikey)) {
+      if (iterate_upper_bound_ != nullptr &&
+          ikey.user_key.compare(*iterate_upper_bound_) >= 0) {
+        break;
+      }
+
+      if (ikey.sequence <= sequence_) {
+        if (skipping &&
+           user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
+          num_skipped++;  // skip this entry
+          PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+        } else {
+          skipping = false;
+          switch (ikey.type) {
+            case kTypeDeletion:
+              // Arrange to skip all upcoming entries for this key since
+              // they are hidden by this deletion.
+              saved_key_.SetKey(ikey.user_key);
+              skipping = true;
+              num_skipped = 0;
+              PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+              break;
+            case kTypeValue:
+              valid_ = true;
+              saved_key_.SetKey(ikey.user_key);
+              return;
+            case kTypeMerge:
+              // By now, we are sure the current ikey is going to yield a value
+              saved_key_.SetKey(ikey.user_key);
+              current_entry_is_merged_ = true;
+              valid_ = true;
+              MergeValuesNewToOld();  // Go to a different state machine
+              return;
+            default:
+              assert(false);
+              break;
+          }
        }
      }
    }
@ -399,6 +410,7 @@ bool DBIter::FindValueForCurrentKey() {
      case kTypeDeletion:
        operands.clear();
        last_not_merge_type = kTypeDeletion;
+        PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
        break;
      case kTypeMerge:
        assert(user_merge_operator_ != nullptr);
@ -408,6 +420,7 @@ bool DBIter::FindValueForCurrentKey() {
        assert(false);
    }

+    PERF_COUNTER_ADD(internal_key_skipped_count, 1);
    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
    iter_->Prev();
    ++num_skipped;
@ -554,12 +567,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
 void DBIter::Seek(const Slice& target) {
  StopWatch sw(env_, statistics_, DB_SEEK);

+  // total ordering is not guaranteed if prefix_extractor is set
+  // hence prefix based seeks will not give correct results
+  if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
+    if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
+        !prefix_extractor_->InDomain(target) ||
+        prefix_extractor_->Transform(*iterate_upper_bound_).compare(
+          prefix_extractor_->Transform(target)) != 0) {
+      status_ = Status::InvalidArgument("read_options.iterate_*_bound "
+                  " and seek target need to have the same prefix.");
+      valid_ = false;
+      return;
+    }
+  }
+
  saved_key_.Clear();
  // now savved_key is used to store internal key.
  saved_key_.SetInternalKey(target, sequence_);
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->Seek(saved_key_.GetKey());
-  PERF_TIMER_STOP(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->Seek(saved_key_.GetKey());
+  }
+
  if (iter_->Valid()) {
    direction_ = kForward;
    ClearSavedValue();
@ -572,14 +602,17 @@ void DBIter::Seek(const Slice& target) {
 void DBIter::SeekToFirst() {
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek wiil be used.
-  if (has_prefix_extractor_) {
+  if (prefix_extractor_ != nullptr) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  direction_ = kForward;
  ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->SeekToFirst();
-  PERF_TIMER_STOP(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->SeekToFirst();
+  }
+
  if (iter_->Valid()) {
    FindNextUserEntry(false /* not skipping */);
  } else {
@ -590,24 +623,29 @@ void DBIter::SeekToFirst() {
 void DBIter::SeekToLast() {
  // Don't use iter_::Seek() if we set a prefix extractor
  // because prefix seek wiil be used.
-  if (has_prefix_extractor_) {
+  if (prefix_extractor_ != nullptr) {
    max_skip_ = std::numeric_limits<uint64_t>::max();
  }
  direction_ = kReverse;
  ClearSavedValue();
-  PERF_TIMER_AUTO(seek_internal_seek_time);
-  iter_->SeekToLast();
-  PERF_TIMER_STOP(seek_internal_seek_time);
+
+  {
+    PERF_TIMER_GUARD(seek_internal_seek_time);
+    iter_->SeekToLast();
+  }

  PrevInternal();
 }

-Iterator* NewDBIterator(Env* env, const Options& options,
+Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
                        const Comparator* user_key_comparator,
                        Iterator* internal_iter,
-                        const SequenceNumber& sequence) {
-  return new DBIter(env, options, user_key_comparator, internal_iter, sequence,
-                    false);
+                        const SequenceNumber& sequence,
+                        uint64_t max_sequential_skip_in_iterations,
+                        const Slice* iterate_upper_bound) {
+  return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
+                    false, max_sequential_skip_in_iterations,
+                    iterate_upper_bound);
 }

 ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
@ -635,14 +673,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
 }

 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const Options& options, const Comparator* user_key_comparator,
-    const SequenceNumber& sequence) {
+    Env* env, const ImmutableCFOptions& ioptions,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound) {
  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
  Arena* arena = iter->GetArena();
  auto mem = arena->AllocateAligned(sizeof(DBIter));
-  DBIter* db_iter = new (mem)
-      DBIter(env, options, user_key_comparator, nullptr, sequence, true);
+  DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
+      nullptr, sequence, true, max_sequential_skip_in_iterations,
+      iterate_upper_bound);
+
  iter->SetDBIter(db_iter);
+
  return iter;
 }

--- a/db/db_iter.h
+++ b/db/db_iter.h
@ -24,10 +24,12 @@ class DBIter;
 // into appropriate user keys.
 extern Iterator* NewDBIterator(
    Env* env,
-    const Options& options,
+    const ImmutableCFOptions& options,
    const Comparator *user_key_comparator,
    Iterator* internal_iter,
-    const SequenceNumber& sequence);
+    const SequenceNumber& sequence,
+    uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);

 // A wrapper iterator which wraps DB Iterator and the arena, with which the DB
 // iterator is supposed be allocated. This class is used as an entry point of
@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator {

 // Generate the arena wrapped iterator class.
 extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const Options& options, const Comparator* user_key_comparator,
-    const SequenceNumber& sequence);
+    Env* env, const ImmutableCFOptions& options,
+    const Comparator* user_key_comparator,
+    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+    const Slice* iterate_upper_bound = nullptr);

 }  // namespace rocksdb
--- a/db/db_iter_test.cc
+++ b/db/db_iter_test.cc
@ -158,7 +158,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));

    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
@ -191,7 +193,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));

    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
@ -232,7 +236,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 2,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "b");
@ -262,7 +268,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "c");
@ -288,7 +296,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(!db_iter->Valid());
  }
@ -298,7 +308,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(!db_iter->Valid());
  }
@ -318,7 +330,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) {
  internal_iter->Finish();

  std::unique_ptr<Iterator> db_iter(
-      NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+      NewDBIterator(env_, ImmutableCFOptions(options),
+                    BytewiseComparator(), internal_iter, 2,
+                    options.max_sequential_skip_in_iterations));
  db_iter->SeekToLast();
  ASSERT_TRUE(db_iter->Valid());
  ASSERT_EQ(db_iter->key().ToString(), "c");
@ -357,7 +371,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {

      options.statistics = rocksdb::CreateDBStatistics();
      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -391,7 +407,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -418,7 +436,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 202));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, 202,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -449,7 +469,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->AddPut("c", "200");
      internal_iter->Finish();
      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, i,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(!db_iter->Valid());

@ -464,7 +486,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
    internal_iter->AddPut("c", "200");
    internal_iter->Finish();
    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 200,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "c");
@ -497,7 +521,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -530,7 +556,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, i + 2));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, i + 2,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -570,7 +598,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 1,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -590,7 +620,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 0,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -609,7 +641,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 2,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -628,7 +662,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 4,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToFirst();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "a");
@ -654,7 +690,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 0,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -675,7 +713,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 1,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -696,7 +736,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 2,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -717,7 +759,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 3,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -738,7 +782,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 4,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -759,7 +805,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 5,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -780,7 +828,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 6,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -803,7 +853,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 0,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -824,7 +876,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 1,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -845,7 +899,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 2,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -866,7 +922,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 3,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(!db_iter->Valid());
    }
@ -883,7 +941,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 4,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -904,7 +964,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 5,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -925,7 +987,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 6,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -960,7 +1024,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 0,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_EQ(db_iter->key().ToString(), "a");
@ -993,7 +1059,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 2,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1032,7 +1100,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 4,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1071,7 +1141,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 5,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1115,7 +1187,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 6,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1160,7 +1234,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 7,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1199,7 +1275,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(
-          NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9));
+          NewDBIterator(env_, ImmutableCFOptions(options),
+                        BytewiseComparator(), internal_iter, 9,
+                        options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1244,7 +1322,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 13));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, 13,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1290,7 +1370,9 @@ TEST(DBIteratorTest, DBIterator) {
      internal_iter->Finish();

      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, options, BytewiseComparator(), internal_iter, 14));
+          env_, ImmutableCFOptions(options),
+          BytewiseComparator(), internal_iter, 14,
+          options.max_sequential_skip_in_iterations));
      db_iter->SeekToLast();
      ASSERT_TRUE(db_iter->Valid());

@ -1316,7 +1398,9 @@ TEST(DBIteratorTest, DBIterator) {
    internal_iter->Finish();

    std::unique_ptr<Iterator> db_iter(
-        NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10));
+        NewDBIterator(env_, ImmutableCFOptions(options),
+                      BytewiseComparator(), internal_iter, 10,
+                      options.max_sequential_skip_in_iterations));
    db_iter->SeekToLast();
    ASSERT_TRUE(db_iter->Valid());
    ASSERT_EQ(db_iter->key().ToString(), "b");
--- a/db/db_test.cc
+++ b/db/db_test.cc
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@ -127,26 +127,6 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  }
 }

-const char* InternalFilterPolicy::Name() const {
-  return user_policy_->Name();
-}
-
-void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
-                                        std::string* dst) const {
-  // We rely on the fact that the code in table.cc does not mind us
-  // adjusting keys[].
-  Slice* mkey = const_cast<Slice*>(keys);
-  for (int i = 0; i < n; i++) {
-    mkey[i] = ExtractUserKey(keys[i]);
-    // TODO(sanjay): Suppress dups?
-  }
-  user_policy_->CreateFilter(keys, n, dst);
-}
-
-bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
-  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
-}
-
 LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
  size_t usize = user_key.size();
  size_t needed = usize + 13;  // A conservative estimate
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -124,17 +124,6 @@ class InternalKeyComparator : public Comparator {
  int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
 };

-// Filter policy wrapper that converts from internal keys to user keys
-class InternalFilterPolicy : public FilterPolicy {
- private:
-  const FilterPolicy* const user_policy_;
- public:
-  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
-  virtual const char* Name() const;
-  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
-  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
-};
-
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.
@ -255,7 +244,7 @@ class IterKey {

  Slice GetKey() const { return Slice(key_, key_size_); }

-  const size_t Size() { return key_size_; }
+  size_t Size() { return key_size_; }

  void Clear() { key_size_ = 0; }

@ -401,4 +390,12 @@ class InternalKeySliceTransform : public SliceTransform {
  const SliceTransform* const transform_;
 };

+// Reads one record of a write batch from input.
+// tag, column_family, key, value and blob are output parameters. Callers own
+// the Slices they point to.
+// tag is defined as ValueType.
+// input is advanced past the record.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                       uint32_t* column_family, Slice* key,
+                                       Slice* value, Slice* blob);
 }  // namespace rocksdb
--- a/db/deletefile_test.cc
+++ b/db/deletefile_test.cc
@ -34,6 +34,7 @@ class DeleteFileTest {
  DeleteFileTest() {
    db_ = nullptr;
    env_ = Env::Default();
+    options_.max_background_flushes = 0;
    options_.write_buffer_size = 1024*1024*1000;
    options_.target_file_size_base = 1024*1024*1000;
    options_.max_bytes_for_level_base = 1024*1024*1000;
--- a/db/filename.cc
+++ b/db/filename.cc
@ -6,7 +6,10 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include "db/filename.h"
 #include <inttypes.h>

--- a/db/flush_scheduler.cc
+++ b/db/flush_scheduler.cc
@ -0,0 +1,62 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace rocksdb {
+
+// Queues `cfd` for a flush. A reference is taken on `cfd` here and travels
+// with the queue entry until the entry is handed out or discarded.
+void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+  // Debug builds only: a column family must not be queued twice.
+  const bool newly_tracked = column_families_set_.insert(cfd).second;
+  assert(newly_tracked);
+#endif  // NDEBUG
+  cfd->Ref();
+  column_families_.push_back(cfd);
+}
+
+// Pops queued column families until one that has not been dropped is found
+// and returns it. The caller inherits the reference taken by ScheduleFlush()
+// and must Unref() it. Returns nullptr when no live column family is queued.
+//
+// Dropped column families are released on the way (and deleted when Unref()
+// returns true). They must never be returned: once unreferenced here the
+// pointer may already have been deleted.
+ColumnFamilyData* FlushScheduler::GetNextColumnFamily() {
+  while (!column_families_.empty()) {
+    ColumnFamilyData* cfd = column_families_.front();
+    column_families_.pop_front();
+#ifndef NDEBUG
+    // Keep the debug-only duplicate tracker in step with the queue for every
+    // popped entry, so it never retains a pointer to a deleted column family.
+    auto itr = column_families_set_.find(cfd);
+    assert(itr != column_families_set_.end());
+    column_families_set_.erase(itr);
+#endif  // NDEBUG
+    if (!cfd->IsDropped()) {
+      return cfd;
+    }
+    if (cfd->Unref()) {
+      delete cfd;
+    }
+  }
+  return nullptr;
+}
+
+// Reports whether no column family is currently queued for flushing.
+bool FlushScheduler::Empty() {
+  return column_families_.size() == 0;
+}
+
+// Discards every queued column family, releasing the reference that
+// ScheduleFlush() took on each one and deleting those for which Unref()
+// returns true.
+void FlushScheduler::Clear() {
+  while (!column_families_.empty()) {
+    ColumnFamilyData* pending = column_families_.front();
+    column_families_.pop_front();
+#ifndef NDEBUG
+    // Debug builds only: every queued entry must also be in the tracker set.
+    const auto erased = column_families_set_.erase(pending);
+    assert(erased == 1);
+#endif  // NDEBUG
+    if (pending->Unref()) {
+      delete pending;
+    }
+  }
+}
+
+}  // namespace rocksdb
--- a/db/flush_scheduler.h
+++ b/db/flush_scheduler.h
@ -0,0 +1,39 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+#include <deque>
+#include <set>
+#include <vector>
+
+namespace rocksdb {
+
+class ColumnFamilyData;
+
+// FIFO queue of column families waiting to be flushed.
+//
+// This class is thread-compatible. It should only be accessed from a single
+// write thread (between BeginWrite() and EndWrite()).
+//
+// Every queued pointer holds a reference taken in ScheduleFlush(). Note that
+// the defaulted destructor does not release queued entries; Clear() does.
+class FlushScheduler {
+ public:
+  FlushScheduler() = default;
+  ~FlushScheduler() = default;
+
+  // Ref()s cfd and appends it to the queue. Debug builds assert that cfd is
+  // not already queued.
+  void ScheduleFlush(ColumnFamilyData* cfd);
+  // Returns Ref()-ed column family. Client needs to Unref()
+  ColumnFamilyData* GetNextColumnFamily();
+
+  // True when the queue holds no entries.
+  bool Empty();
+
+  // Unref()s every queued column family (deleting those whose Unref() returns
+  // true) and empties the queue.
+  void Clear();
+
+ private:
+  // Pending column families in scheduling order.
+  std::deque<ColumnFamilyData*> column_families_;
+#ifndef NDEBUG
+  // Debug-only mirror of the queue, used to assert against double scheduling.
+  std::set<ColumnFamilyData*> column_families_set_;
+#endif  // NDEBUG
+};
+
+}  // namespace rocksdb
--- a/db/forward_iterator.cc
+++ b/db/forward_iterator.cc
@ -6,9 +6,10 @@
 #ifndef ROCKSDB_LITE
 #include "db/forward_iterator.h"

+#include <limits>
 #include <string>
 #include <utility>
-#include <limits>
+
 #include "db/db_impl.h"
 #include "db/db_iter.h"
 #include "db/column_family.h"
@ -37,12 +38,16 @@ class LevelIterator : public Iterator {
    assert(file_index < files_.size());
    if (file_index != file_index_) {
      file_index_ = file_index;
-      file_iter_.reset(cfd_->table_cache()->NewIterator(
-          read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
-          files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
+      Reset();
    }
    valid_ = false;
  }
+  // Re-creates file_iter_ for the file at file_index_ through the column
+  // family's table cache, replacing whatever iterator was held before.
+  // file_index_ must be a valid index into files_.
+  void Reset() {
+    assert(file_index_ < files_.size());
+    file_iter_.reset(cfd_->table_cache()->NewIterator(
+        read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+        files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
+  }
  void SeekToLast() override {
    status_ = Status::NotSupported("LevelIterator::SeekToLast()");
    valid_ = false;
@ -63,12 +68,15 @@ class LevelIterator : public Iterator {
    assert(file_iter_ != nullptr);
    file_iter_->Seek(internal_key);
    valid_ = file_iter_->Valid();
-    assert(valid_);
  }
  void Next() override {
    assert(valid_);
    file_iter_->Next();
-    while (!file_iter_->Valid()) {
+    for (;;) {
+      if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) {
+        valid_ = !file_iter_->status().IsIncomplete();
+        return;
+      }
      if (file_index_ + 1 >= files_.size()) {
        valid_ = false;
        return;
@ -76,7 +84,6 @@ class LevelIterator : public Iterator {
      SetFileIndex(file_index_ + 1);
      file_iter_->SeekToFirst();
    }
-    valid_ = file_iter_->Valid();
  }
  Slice key() const override {
    assert(valid_);
@ -125,9 +132,11 @@ ForwardIterator::~ForwardIterator() {
 }

 void ForwardIterator::Cleanup() {
-  delete mutable_iter_;
+  if (mutable_iter_ != nullptr) {
+    mutable_iter_->~Iterator();
+  }
  for (auto* m : imm_iters_) {
-    delete m;
+    m->~Iterator();
  }
  imm_iters_.clear();
  for (auto* f : l0_iters_) {
@ -160,6 +169,8 @@ void ForwardIterator::SeekToFirst() {
  if (sv_ == nullptr ||
      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
    RebuildIterators();
+  } else if (status_.IsIncomplete()) {
+    ResetIncompleteIterators();
  }
  SeekInternal(Slice(), true);
 }
@ -168,6 +179,8 @@ void ForwardIterator::Seek(const Slice& internal_key) {
  if (sv_ == nullptr ||
      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
    RebuildIterators();
+  } else if (status_.IsIncomplete()) {
+    ResetIncompleteIterators();
  }
  SeekInternal(internal_key, false);
 }
@ -211,7 +224,15 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
        }
        l0_iters_[i]->Seek(internal_key);
      }
-      if (l0_iters_[i]->Valid()) {
+
+      if (l0_iters_[i]->status().IsIncomplete()) {
+        // if any of the immutable iterators is incomplete (no-io option was
+        // used), we are unable to reliably find the smallest key
+        assert(read_options_.read_tier == kBlockCacheTier);
+        status_ = l0_iters_[i]->status();
+        valid_ = false;
+        return;
+      } else if (l0_iters_[i]->Valid()) {
        immutable_min_heap_.push(l0_iters_[i]);
      }
    }
@ -280,7 +301,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
        level_iters_[level - 1]->SetFileIndex(f_idx);
        seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
                        level_iters_[level - 1]->Seek(internal_key);
-        if (level_iters_[level - 1]->Valid()) {
+
+        if (level_iters_[level - 1]->status().IsIncomplete()) {
+          // see above
+          assert(read_options_.read_tier == kBlockCacheTier);
+          status_ = level_iters_[level - 1]->status();
+          valid_ = false;
+          return;
+        } else if (level_iters_[level - 1]->Valid()) {
          immutable_min_heap_.push(level_iters_[level - 1]);
        }
      }
@ -304,7 +332,7 @@ void ForwardIterator::Next() {
  assert(valid_);

  if (sv_ == nullptr ||
-      sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
+      sv_->version_number != cfd_->GetSuperVersionNumber()) {
    std::string current_key = key().ToString();
    Slice old_key(current_key.data(), current_key.size());

@ -320,9 +348,17 @@ void ForwardIterator::Next() {
  }

  current_->Next();
-  if (current_->Valid() && current_ != mutable_iter_) {
-    immutable_min_heap_.push(current_);
+  if (current_ != mutable_iter_) {
+    if (current_->status().IsIncomplete()) {
+      assert(read_options_.read_tier == kBlockCacheTier);
+      status_ = current_->status();
+      valid_ = false;
+      return;
+    } else if (current_->Valid()) {
+      immutable_min_heap_.push(current_);
+    }
  }
+
  UpdateCurrent();
 }

@ -367,8 +403,8 @@ void ForwardIterator::RebuildIterators() {
  Cleanup();
  // New
  sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
-  mutable_iter_ = sv_->mem->NewIterator(read_options_);
-  sv_->imm->AddIterators(read_options_, &imm_iters_);
+  mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+  sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
  const auto& l0_files = sv_->current->files_[0];
  l0_iters_.reserve(l0_files.size());
  for (const auto* l0 : l0_files) {
@ -389,6 +425,29 @@ void ForwardIterator::RebuildIterators() {
  is_prev_set_ = false;
 }

+// Replaces only those child iterators whose status is Incomplete, leaving the
+// others untouched, and then forgets the current position and previous key.
+void ForwardIterator::ResetIncompleteIterators() {
+  const auto& level0_files = sv_->current->files_[0];
+  for (size_t idx = 0; idx < l0_iters_.size(); ++idx) {
+    assert(idx < level0_files.size());
+    if (l0_iters_[idx]->status().IsIncomplete()) {
+      // Swap the incomplete iterator for a fresh one over the same L0 file.
+      delete l0_iters_[idx];
+      l0_iters_[idx] = cfd_->table_cache()->NewIterator(
+          read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+          level0_files[idx]->fd);
+    }
+  }
+
+  for (auto* level_iter : level_iters_) {
+    if (level_iter != nullptr && level_iter->status().IsIncomplete()) {
+      level_iter->Reset();
+    }
+  }
+
+  is_prev_set_ = false;
+  current_ = nullptr;
+}
+
 void ForwardIterator::UpdateCurrent() {
  if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
    current_ = nullptr;
@ -417,7 +476,7 @@ void ForwardIterator::UpdateCurrent() {
 }

 bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
-  if (!is_prev_set_) {
+  if (!valid_ || !is_prev_set_) {
    return true;
  }
  Slice prev_key = prev_key_.GetKey();
--- a/db/forward_iterator.h
+++ b/db/forward_iterator.h
@ -14,6 +14,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "db/dbformat.h"
+#include "util/arena.h"

 namespace rocksdb {

@ -73,6 +74,7 @@ class ForwardIterator : public Iterator {
 private:
  void Cleanup();
  void RebuildIterators();
+  void ResetIncompleteIterators();
  void SeekInternal(const Slice& internal_key, bool seek_to_first);
  void UpdateCurrent();
  bool NeedToSeekImmutable(const Slice& internal_key);
@ -99,6 +101,7 @@ class ForwardIterator : public Iterator {

  IterKey prev_key_;
  bool is_prev_set_;
+  Arena arena_;
 };

 }  // namespace rocksdb
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@ -7,10 +7,15 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/internal_stats.h"
+
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <vector>
 #include "db/column_family.h"
+#include "db/db_impl.h"

 namespace rocksdb {

@ -133,6 +138,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
  } else if (in == "estimate-table-readers-mem") {
    *need_out_of_mutex = true;
    return kEstimatedUsageByTableReaders;
+  } else if (in == "is-file-deletions-enabled") {
+    return kIsFileDeletionEnabled;
  }
  return kUnknown;
 }
@ -215,7 +222,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
 }

 bool InternalStats::GetIntProperty(DBPropertyType property_type,
-                                   uint64_t* value) const {
+                                   uint64_t* value, DBImpl* db) const {
  Version* current = cfd_->current();

  switch (property_type) {
@ -254,6 +261,11 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
               cfd_->imm()->current()->GetTotalNumEntries() +
               current->GetEstimatedActiveKeys();
      return true;
+#ifndef ROCKSDB_LITE
+    case kIsFileDeletionEnabled:
+      *value = db->IsFileDeletionsEnabled();
+      return true;
+#endif
    default:
      return false;
  }
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@ -42,6 +42,8 @@ enum DBPropertyType : uint32_t {
                                   // the immutable mem tables.
  kEstimatedNumKeys,  // Estimated total number of keys in the database.
  kEstimatedUsageByTableReaders,  // Estimated memory by table readers.
+  kIsFileDeletionEnabled,         // Equals disable_delete_obsolete_files_,
+                                  // 0 means file deletions enabled
 };

 extern DBPropertyType GetPropertyType(const Slice& property,
@ -197,7 +199,8 @@ class InternalStats {
  bool GetStringProperty(DBPropertyType property_type, const Slice& property,
                         std::string* value);

-  bool GetIntProperty(DBPropertyType property_type, uint64_t* value) const;
+  bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
+                      DBImpl* db) const;

  bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
                                uint64_t* value) const;
--- a/db/log_and_apply_bench.cc
+++ b/db/log_and_apply_bench.cc
@ -9,6 +9,7 @@
 #include "util/testharness.h"
 #include "util/benchharness.h"
 #include "db/version_set.h"
+#include "db/write_controller.h"
 #include "util/mutexlock.h"

 namespace rocksdb {
@ -21,6 +22,7 @@ std::string MakeKey(unsigned int num) {

 void BM_LogAndApply(int iters, int num_base_files) {
  VersionSet* vset;
+  WriteController wc;
  ColumnFamilyData* default_cfd;
  uint64_t fnum = 1;
  port::Mutex mu;
@ -47,7 +49,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
    options.db_paths.emplace_back(dbname, 0);
    // The parameter of table cache is passed in as null, so any file I/O
    // operation is likely to fail.
-    vset = new VersionSet(dbname, &options, sopt, nullptr);
+    vset = new VersionSet(dbname, &options, sopt, nullptr, &wc);
    std::vector<ColumnFamilyDescriptor> dummy;
    dummy.push_back(ColumnFamilyDescriptor());
    ASSERT_OK(vset->Recover(dummy));
@ -69,6 +71,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
    vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
    vset->LogAndApply(default_cfd, &vedit, &mu);
  }
+  delete vset;
 }

 BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -31,41 +31,57 @@

 namespace rocksdb {

-MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
+// Copies the memtable-related settings into plain members. The in-place
+// update fields (support flag, lock count, callback) are read from `options`;
+// every other field is read from `mutable_cf_options`.
+MemTableOptions::MemTableOptions(
+    const MutableCFOptions& mutable_cf_options, const Options& options)
+  : write_buffer_size(mutable_cf_options.write_buffer_size),
+    arena_block_size(mutable_cf_options.arena_block_size),
+    memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits),
+    memtable_prefix_bloom_probes(
+        mutable_cf_options.memtable_prefix_bloom_probes),
+    memtable_prefix_bloom_huge_page_tlb_size(
+        mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
+    inplace_update_support(options.inplace_update_support),
+    inplace_update_num_locks(options.inplace_update_num_locks),
+    inplace_callback(options.inplace_callback),
+    max_successive_merges(mutable_cf_options.max_successive_merges),
+    filter_deletes(mutable_cf_options.filter_deletes) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   const ImmutableCFOptions& ioptions,
+                   const MemTableOptions& moptions)
    : comparator_(cmp),
+      ioptions_(ioptions),
+      moptions_(moptions),
      refs_(0),
-      kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)),
-      kWriteBufferSize(options.write_buffer_size),
-      arena_(options.arena_block_size),
-      table_(options.memtable_factory->CreateMemTableRep(
-          comparator_, &arena_, options.prefix_extractor.get(),
-          options.info_log.get())),
+      kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)),
+      arena_(moptions.arena_block_size),
+      table_(ioptions.memtable_factory->CreateMemTableRep(
+          comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)),
      num_entries_(0),
      flush_in_progress_(false),
      flush_completed_(false),
      file_number_(0),
      first_seqno_(0),
      mem_next_logfile_number_(0),
-      locks_(options.inplace_update_support ? options.inplace_update_num_locks
-                                            : 0),
-      prefix_extractor_(options.prefix_extractor.get()),
-      should_flush_(ShouldFlushNow()) {
+      locks_(moptions.inplace_update_support ? moptions.inplace_update_num_locks
+                                             : 0),
+      prefix_extractor_(ioptions.prefix_extractor),
+      should_flush_(ShouldFlushNow()),
+      flush_scheduled_(false) {
  // if should_flush_ == true without an entry inserted, something must have
  // gone wrong already.
  assert(!should_flush_);
-  if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) {
+  if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) {
    prefix_bloom_.reset(new DynamicBloom(
        &arena_,
-        options.memtable_prefix_bloom_bits, options.bloom_locality,
-        options.memtable_prefix_bloom_probes, nullptr,
-        options.memtable_prefix_bloom_huge_page_tlb_size,
-        options.info_log.get()));
+        moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality,
+        moptions.memtable_prefix_bloom_probes, nullptr,
+        moptions.memtable_prefix_bloom_huge_page_tlb_size,
+        ioptions.info_log));
  }
 }

-MemTable::~MemTable() {
-  assert(refs_ == 0);
-}
+MemTable::~MemTable() { assert(refs_ == 0); }

 size_t MemTable::ApproximateMemoryUsage() {
  size_t arena_usage = arena_.ApproximateMemoryUsage();
@ -97,14 +113,16 @@ bool MemTable::ShouldFlushNow() const {
  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+      moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
    return false;
  }

-  // if user keeps adding entries that exceeds kWriteBufferSize, we need to
-  // flush earlier even though we still have much available memory left.
-  if (allocated_memory >
-      kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) {
+  // if user keeps adding entries that exceed moptions_.write_buffer_size,
+  // we need to flush earlier even though we still have much available
+  // memory left.
+  if (allocated_memory > moptions_.write_buffer_size +
+      kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }

@ -174,13 +192,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {

 class MemTableIterator: public Iterator {
 public:
-  MemTableIterator(const MemTable& mem, const ReadOptions& options,
-                   bool enforce_total_order, Arena* arena)
+  MemTableIterator(
+      const MemTable& mem, const ReadOptions& read_options, Arena* arena)
      : bloom_(nullptr),
        prefix_extractor_(mem.prefix_extractor_),
        valid_(false),
        arena_mode_(arena != nullptr) {
-    if (prefix_extractor_ != nullptr && !enforce_total_order) {
+    if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
      bloom_ = mem.prefix_bloom_.get();
      iter_ = mem.table_->GetDynamicPrefixIterator(arena);
    } else {
@ -248,15 +266,10 @@ class MemTableIterator: public Iterator {
  void operator=(const MemTableIterator&);
 };

-Iterator* MemTable::NewIterator(const ReadOptions& options,
-                                bool enforce_total_order, Arena* arena) {
-  if (arena == nullptr) {
-    return new MemTableIterator(*this, options, enforce_total_order, nullptr);
-  } else {
-    auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
-    return new (mem)
-        MemTableIterator(*this, options, enforce_total_order, arena);
-  }
+Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
+  assert(arena != nullptr);
+  auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+  return new (mem) MemTableIterator(*this, read_options, arena);
 }

 port::RWMutex* MemTable::GetLock(const Slice& key) {
@ -417,8 +430,13 @@ static bool SaveValue(void* arg, const char* entry) {
 }

 bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
-                   MergeContext& merge_context, const Options& options) {
-  PERF_TIMER_AUTO(get_from_memtable_time);
+                   MergeContext* merge_context) {
+  // The sequence number is updated synchronously in version_set.h
+  if (IsEmpty()) {
+    // Avoiding recording stats for speed.
+    return false;
+  }
+  PERF_TIMER_GUARD(get_from_memtable_time);

  Slice user_key = key.user_key();
  bool found_final_value = false;
@ -436,11 +454,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
    saver.value = value;
    saver.status = s;
    saver.mem = this;
-    saver.merge_context = &merge_context;
-    saver.merge_operator = options.merge_operator.get();
-    saver.logger = options.info_log.get();
-    saver.inplace_update_support = options.inplace_update_support;
-    saver.statistics = options.statistics.get();
+    saver.merge_context = merge_context;
+    saver.merge_operator = ioptions_.merge_operator;
+    saver.logger = ioptions_.info_log;
+    saver.inplace_update_support = moptions_.inplace_update_support;
+    saver.statistics = ioptions_.statistics;
    table_->Get(key, &saver, SaveValue);
  }

@ -448,7 +466,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
  if (!found_final_value && merge_in_progress) {
    *s = Status::MergeInProgress("");
  }
-  PERF_TIMER_STOP(get_from_memtable_time);
  PERF_COUNTER_ADD(get_from_memtable_count, 1);
  return found_final_value;
 }
@ -513,8 +530,7 @@ void MemTable::Update(SequenceNumber seq,

 bool MemTable::UpdateCallback(SequenceNumber seq,
                              const Slice& key,
-                              const Slice& delta,
-                              const Options& options) {
+                              const Slice& delta) {
  LookupKey lkey(key, seq);
  Slice memkey = lkey.memtable_key();

@ -549,8 +565,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq,

          std::string str_value;
          WriteLock wl(GetLock(lkey.user_key()));
-          auto status = options.inplace_callback(prev_buffer, &new_prev_size,
-                                                    delta, &str_value);
+          auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+                                                   delta, &str_value);
          if (status == UpdateStatus::UPDATED_INPLACE) {
            // Value already updated by callback.
            assert(new_prev_size <= prev_size);
@ -563,12 +579,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
                memcpy(p, prev_buffer, new_prev_size);
              }
            }
-            RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED);
+            RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATED) {
            Add(seq, kTypeValue, key, Slice(str_value));
-            RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN);
+            RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN);
            should_flush_ = ShouldFlushNow();
            return true;
          } else if (status == UpdateStatus::UPDATE_FAILED) {
--- a/db/memtable.h
+++ b/db/memtable.h
@ -10,14 +10,18 @@
 #pragma once
 #include <string>
 #include <memory>
+#include <functional>
 #include <deque>
+#include <vector>
 #include "db/dbformat.h"
 #include "db/skiplist.h"
 #include "db/version_edit.h"
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/immutable_options.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
+#include "util/mutable_cf_options.h"

 namespace rocksdb {

@ -26,6 +30,25 @@ class Mutex;
 class MemTableIterator;
 class MergeContext;

+// Plain bundle of the option values a MemTable reads. The constructor copies
+// them out of a MutableCFOptions and an Options object.
+struct MemTableOptions {
+  explicit MemTableOptions(
+      const MutableCFOptions& mutable_cf_options,
+      const Options& options);
+  size_t write_buffer_size;  // from MutableCFOptions
+  size_t arena_block_size;  // from MutableCFOptions
+  uint32_t memtable_prefix_bloom_bits;  // from MutableCFOptions
+  uint32_t memtable_prefix_bloom_probes;  // from MutableCFOptions
+  size_t memtable_prefix_bloom_huge_page_tlb_size;  // from MutableCFOptions
+  bool inplace_update_support;  // from Options
+  size_t inplace_update_num_locks;  // from Options
+  // from Options; invoked by MemTable::UpdateCallback()
+  UpdateStatus (*inplace_callback)(char* existing_value,
+                                   uint32_t* existing_value_size,
+                                   Slice delta_value,
+                                   std::string* merged_value);
+  size_t max_successive_merges;  // from MutableCFOptions
+  bool filter_deletes;  // from MutableCFOptions
+};
+
 class MemTable {
 public:
  struct KeyComparator : public MemTableRep::KeyComparator {
@ -40,7 +63,8 @@ class MemTable {
  // MemTables are reference counted.  The initial reference count
  // is zero and the caller must call Ref() at least once.
  explicit MemTable(const InternalKeyComparator& comparator,
-                    const Options& options);
+                    const ImmutableCFOptions& ioptions,
+                    const MemTableOptions& moptions);

  ~MemTable();

@ -67,7 +91,11 @@ class MemTable {

  // This method heuristically determines if the memtable should continue to
  // host more data.
-  bool ShouldFlush() const { return should_flush_; }
+  bool ShouldScheduleFlush() const {
+    return flush_scheduled_ == false && should_flush_;
+  }
+
+  void MarkFlushScheduled() { flush_scheduled_ = true; }

  // Return an iterator that yields the contents of the memtable.
  //
@ -81,9 +109,7 @@ class MemTable {
  // arena: If not null, the arena needs to be used to allocate the Iterator.
  //        Calling ~Iterator of the iterator will destroy all the states but
  //        those allocated in arena.
-  Iterator* NewIterator(const ReadOptions& options,
-                        bool enforce_total_order = false,
-                        Arena* arena = nullptr);
+  Iterator* NewIterator(const ReadOptions& read_options, Arena* arena);

  // Add an entry into memtable that maps key to value at the
  // specified sequence number and with the specified type.
@ -101,7 +127,7 @@ class MemTable {
  //   store MergeInProgress in s, and return false.
  // Else, return false.
  bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);

  // Attempts to update the new_value inplace, else does normal Add
  // Pseudocode
@ -125,8 +151,7 @@ class MemTable {
  //   else return false
  bool UpdateCallback(SequenceNumber seq,
                      const Slice& key,
-                      const Slice& delta,
-                      const Options& options);
+                      const Slice& delta);

  // Returns the number of successive merge entries starting from the newest
  // entry for the key up to the last non-merge entry or last entry for the
@ -139,6 +164,9 @@ class MemTable {
  // Returns the edits area that is needed for flushing the memtable
  VersionEdit* GetEdits() { return &edit_; }

+  // Returns if there is no entry inserted to the mem table.
+  bool IsEmpty() const { return first_seqno_ == 0; }
+
  // Returns the sequence number of the first element that was inserted
  // into the memtable
  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
@ -171,8 +199,11 @@ class MemTable {

  const Arena& TEST_GetArena() const { return arena_; }

+  const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; }
+  const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
+
 private:
-  // Dynamically check if we can add more incoming entries.
+  // Dynamically check if we can add more incoming entries
  bool ShouldFlushNow() const;

  friend class MemTableIterator;
@ -180,9 +211,10 @@ class MemTable {
  friend class MemTableList;

  KeyComparator comparator_;
+  const ImmutableCFOptions& ioptions_;
+  const MemTableOptions moptions_;
  int refs_;
  const size_t kArenaBlockSize;
-  const size_t kWriteBufferSize;
  Arena arena_;
  unique_ptr<MemTableRep> table_;

@ -215,6 +247,9 @@ class MemTable {

  // a flag indicating if a memtable has met the criteria to flush
  bool should_flush_;
+
+  // a flag indicating if flush has been scheduled
+  bool flush_scheduled_;
 };

 extern const char* EncodeKey(std::string* scratch, const Slice& target);
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@ -62,10 +62,9 @@ int MemTableList::size() const {
 // Return the most recent value found, if any.
 // Operands stores the list of merge operations to apply, so far.
 bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
-                              Status* s, MergeContext& merge_context,
-                              const Options& options) {
+                              Status* s, MergeContext* merge_context) {
  for (auto& memtable : memlist_) {
-    if (memtable->Get(key, value, s, merge_context, options)) {
+    if (memtable->Get(key, value, s, merge_context)) {
      return true;
    }
  }
@ -73,9 +72,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
 }

 void MemTableListVersion::AddIterators(const ReadOptions& options,
-                                       std::vector<Iterator*>* iterator_list) {
+                                       std::vector<Iterator*>* iterator_list,
+                                       Arena* arena) {
  for (auto& m : memlist_) {
-    iterator_list->push_back(m->NewIterator(options));
+    iterator_list->push_back(m->NewIterator(options, arena));
  }
 }

--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@ -46,10 +46,10 @@ class MemTableListVersion {
  // Search all the memtables starting from the most recent one.
  // Return the most recent value found, if any.
  bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext& merge_context, const Options& options);
+           MergeContext* merge_context);

  void AddIterators(const ReadOptions& options,
-                    std::vector<Iterator*>* iterator_list);
+                    std::vector<Iterator*>* iterator_list, Arena* arena);

  void AddIterators(const ReadOptions& options,
                    MergeIteratorBuilder* merge_iter_builder);
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber;

 class TestPlainTableReader : public PlainTableReader {
 public:
-  TestPlainTableReader(const EnvOptions& storage_options,
+  TestPlainTableReader(const EnvOptions& env_options,
                       const InternalKeyComparator& icomparator,
                       EncodingType encoding_type, uint64_t file_size,
                       int bloom_bits_per_key, double hash_table_ratio,
                       size_t index_sparseness,
                       const TableProperties* table_properties,
                       unique_ptr<RandomAccessFile>&& file,
-                       const Options& options, bool* expect_bloom_not_match,
+                       const ImmutableCFOptions& ioptions,
+                       bool* expect_bloom_not_match,
                       bool store_index_in_file)
-      : PlainTableReader(options, std::move(file), storage_options, icomparator,
+      : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
                         encoding_type, file_size, table_properties),
        expect_bloom_not_match_(expect_bloom_not_match) {
    Status s = MmapDataFile();
@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader {
          PlainTablePropertyNames::kBloomVersion);
      ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
      ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
-      if (options.bloom_locality > 0) {
+      if (ioptions.bloom_locality > 0) {
        auto num_blocks_ptr = props->user_collected_properties.find(
            PlainTablePropertyNames::kNumBloomBlocks);
        ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory {
        store_index_in_file_(options.store_index_in_file),
        expect_bloom_not_match_(expect_bloom_not_match) {}

-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& env_options,
                        const InternalKeyComparator& internal_comparator,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table) const override {
    TableProperties* props = nullptr;
    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
-                                 options.env, options.info_log.get(), &props);
+                                 ioptions.env, ioptions.info_log, &props);
    ASSERT_TRUE(s.ok());

    if (store_index_in_file_) {
      BlockHandle bloom_block_handle;
      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
-                        options.env, BloomBlockBuilder::kBloomBlock,
+                        ioptions.env, BloomBlockBuilder::kBloomBlock,
                        &bloom_block_handle);
      ASSERT_TRUE(s.ok());

      BlockHandle index_block_handle;
      s = FindMetaBlock(
-          file.get(), file_size, kPlainTableMagicNumber, options.env,
+          file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
          PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
      ASSERT_TRUE(s.ok());
    }
@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory {
        DecodeFixed32(encoding_type_prop->second.c_str()));

    std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
-        soptions, internal_comparator, encoding_type, file_size,
+        env_options, internal_comparator, encoding_type, file_size,
        bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
-        std::move(file), options, expect_bloom_not_match_,
+        std::move(file), ioptions, expect_bloom_not_match_,
        store_index_in_file_));

    *table = std::move(new_reader);
--- a/db/repair.cc
+++ b/db/repair.cc
@ -31,7 +31,10 @@

 #ifndef ROCKSDB_LITE

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include "db/builder.h"
 #include "db/db_impl.h"
@ -46,6 +49,9 @@
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
+#include "util/scoped_arena_iterator.h"

 namespace rocksdb {

@ -57,8 +63,8 @@ class Repairer {
      : dbname_(dbname),
        env_(options.env),
        icmp_(options.comparator),
-        ipolicy_(options.filter_policy),
-        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        options_(SanitizeOptions(dbname, &icmp_, options)),
+        ioptions_(options_),
        raw_table_cache_(
            // TableCache can be small since we expect each table to be opened
            // once.
@ -66,7 +72,7 @@ class Repairer {
                        options_.table_cache_remove_scan_count_limit)),
        next_file_number_(1) {
    table_cache_ =
-        new TableCache(&options_, storage_options_, raw_table_cache_.get());
+        new TableCache(ioptions_, env_options_, raw_table_cache_.get());
    edit_ = new VersionEdit();
  }

@ -108,9 +114,9 @@ class Repairer {

  std::string const dbname_;
  Env* const env_;
-  InternalKeyComparator const icmp_;
-  InternalFilterPolicy const ipolicy_;
-  Options const options_;
+  const InternalKeyComparator icmp_;
+  const Options options_;
+  const ImmutableCFOptions ioptions_;
  std::shared_ptr<Cache> raw_table_cache_;
  TableCache* table_cache_;
  VersionEdit* edit_;
@ -120,7 +126,7 @@ class Repairer {
  std::vector<uint64_t> logs_;
  std::vector<TableInfo> tables_;
  uint64_t next_file_number_;
-  const EnvOptions storage_options_;
+  const EnvOptions env_options_;

  Status FindFiles() {
    std::vector<std::string> filenames;
@ -192,7 +198,7 @@ class Repairer {
    // Open the log file
    std::string logname = LogFileName(dbname_, log);
    unique_ptr<SequentialFile> lfile;
-    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
    if (!status.ok()) {
      return status;
    }
@ -213,7 +219,8 @@ class Repairer {
    std::string scratch;
    Slice record;
    WriteBatch batch;
-    MemTable* mem = new MemTable(icmp_, options_);
+    MemTable* mem = new MemTable(icmp_, ioptions_,
+        MemTableOptions(MutableCFOptions(options_), options_));
    auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
    mem->Ref();
    int counter = 0;
@ -238,11 +245,15 @@ class Repairer {
    // since ExtractMetaData() will also generate edits.
    FileMetaData meta;
    meta.fd = FileDescriptor(next_file_number_++, 0, 0);
-    ReadOptions ro;
-    Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */);
-    status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_,
-                        iter, &meta, icmp_, 0, 0, kNoCompression);
-    delete iter;
+    {
+      ReadOptions ro;
+      ro.total_order_seek = true;
+      Arena arena;
+      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+      status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
+                          iter.get(), &meta, icmp_, 0, 0, kNoCompression,
+                          CompressionOptions());
+    }
    delete mem->Unref();
    delete cf_mems_default;
    mem = nullptr;
@ -287,7 +298,7 @@ class Repairer {
                                file_size);
    if (status.ok()) {
      Iterator* iter = table_cache_->NewIterator(
-          ReadOptions(), storage_options_, icmp_, t->meta.fd);
+          ReadOptions(), env_options_, icmp_, t->meta.fd);
      bool empty = true;
      ParsedInternalKey parsed;
      t->min_sequence = 0;
@ -327,7 +338,7 @@ class Repairer {
    std::string tmp = TempFileName(dbname_, 1);
    unique_ptr<WritableFile> file;
    Status status = env_->NewWritableFile(
-        tmp, &file, env_->OptimizeForManifestWrite(storage_options_));
+        tmp, &file, env_->OptimizeForManifestWrite(env_options_));
    if (!status.ok()) {
      return status;
    }
--- a/db/simple_table_db_test.cc
+++ b/db/simple_table_db_test.cc
@ -79,7 +79,8 @@ public:
  // for the duration of the returned table's lifetime.
  //
  // *file must remain live while this Table is in use.
-  static Status Open(const Options& options, const EnvOptions& soptions,
+  static Status Open(const ImmutableCFOptions& options,
+                     const EnvOptions& env_options,
                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
                     unique_ptr<TableReader>* table_reader);

@ -160,14 +161,14 @@ private:
 struct SimpleTableReader::Rep {
  ~Rep() {
  }
-  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
-      int num_entries) :
-      soptions(storage_options), index_start_offset(index_start_offset),
-      num_entries(num_entries) {
+  Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+      uint64_t index_start_offset, int num_entries) :
+      ioptions(ioptions), env_options(env_options),
+      index_start_offset(index_start_offset), num_entries(num_entries) {
  }

-  Options options;
-  const EnvOptions& soptions;
+  const ImmutableCFOptions& ioptions;
+  const EnvOptions& env_options;
  Status status;
  unique_ptr<RandomAccessFile> file;
  uint64_t index_start_offset;
@ -187,8 +188,8 @@ SimpleTableReader::~SimpleTableReader() {
  delete rep_;
 }

-Status SimpleTableReader::Open(const Options& options,
-                               const EnvOptions& soptions,
+Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions,
+                               const EnvOptions& env_options,
                               unique_ptr<RandomAccessFile> && file,
                               uint64_t size,
                               unique_ptr<TableReader>* table_reader) {
@ -201,12 +202,10 @@ Status SimpleTableReader::Open(const Options& options,

    int num_entries = (size - Rep::offset_length - index_start_offset)
        / (Rep::GetInternalKeyLength() + Rep::offset_length);
-    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
-                                                             index_start_offset,
-                                                             num_entries);
+    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(
+        ioptions, env_options, index_start_offset, num_entries);

    rep->file = std::move(file);
-    rep->options = options;
    table_reader->reset(new SimpleTableReader(rep));
  }
  return s;
@ -248,7 +247,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
      return s;
    }

-    InternalKeyComparator ikc(rep_->options.comparator);
+    InternalKeyComparator ikc(rep_->ioptions.comparator);
    int compare_result = ikc.Compare(tmp_slice, target);

    if (compare_result < 0) {
@ -382,7 +381,7 @@ void SimpleTableIterator::Prev() {
 }

 Slice SimpleTableIterator::key() const {
-  Log(table_->rep_->options.info_log, "key!!!!");
+  Log(table_->rep_->ioptions.info_log, "key!!!!");
  return key_;
 }

@ -401,7 +400,7 @@ public:
  // caller to close the file after calling Finish(). The output file
  // will be part of level specified by 'level'.  A value of -1 means
  // that the caller does not know which level the output file will reside.
-  SimpleTableBuilder(const Options& options, WritableFile* file,
+  SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file,
                     CompressionType compression_type);

  // REQUIRES: Either Finish() or Abandon() has been called.
@ -444,7 +443,7 @@ private:
 };

 struct SimpleTableBuilder::Rep {
-  Options options;
+  const ImmutableCFOptions& ioptions;
  WritableFile* file;
  uint64_t offset = 0;
  Status status;
@ -463,17 +462,17 @@ struct SimpleTableBuilder::Rep {

  std::string index;

-  Rep(const Options& opt, WritableFile* f) :
-      options(opt), file(f) {
+  Rep(const ImmutableCFOptions& iopt, WritableFile* f) :
+      ioptions(iopt), file(f) {
  }
  ~Rep() {
  }
 };

-SimpleTableBuilder::SimpleTableBuilder(const Options& options,
+SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions,
                                       WritableFile* file,
                                       CompressionType compression_type) :
-    rep_(new SimpleTableBuilder::Rep(options, file)) {
+    rep_(new SimpleTableBuilder::Rep(ioptions, file)) {
 }

 SimpleTableBuilder::~SimpleTableBuilder() {
@ -546,31 +545,45 @@ public:
  const char* Name() const override {
    return "SimpleTable";
  }
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
+  Status NewTableReader(const ImmutableCFOptions& ioptions,
+                        const EnvOptions& env_options,
                        const InternalKeyComparator& internal_key,
                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
                        unique_ptr<TableReader>* table_reader) const;

-  TableBuilder* NewTableBuilder(const Options& options,
-                                const InternalKeyComparator& internal_key,
-                                WritableFile* file,
-                                CompressionType compression_type) const;
+  TableBuilder* NewTableBuilder(
+      const ImmutableCFOptions& ioptions,
+      const InternalKeyComparator& internal_key,
+      WritableFile* file,
+      const CompressionType compression_type,
+      const CompressionOptions& compression_opts) const;
+
+  virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override {
+    return Status::OK();
+  }
+
+  virtual std::string GetPrintableTableOptions() const override {
+    return std::string();
+  }
 };

 Status SimpleTableFactory::NewTableReader(
-    const Options& options, const EnvOptions& soptions,
+    const ImmutableCFOptions& ioptions,
+    const EnvOptions& env_options,
    const InternalKeyComparator& internal_key,
    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
    unique_ptr<TableReader>* table_reader) const {

-  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
-                                 table_reader);
+  return SimpleTableReader::Open(ioptions, env_options, std::move(file),
+                                 file_size, table_reader);
 }

 TableBuilder* SimpleTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_key,
-    WritableFile* file, CompressionType compression_type) const {
-  return new SimpleTableBuilder(options, file, compression_type);
+    const ImmutableCFOptions& ioptions,
+    const InternalKeyComparator& internal_key,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts) const {
+  return new SimpleTableBuilder(ioptions, file, compression_type);
 }

 class SimpleTableDBTest {
--- a/db/snapshot.h
+++ b/db/snapshot.h
@ -71,7 +71,7 @@ class SnapshotList {
  }

  // get the sequence number of the most recent snapshot
-  const SequenceNumber GetNewest() {
+  SequenceNumber GetNewest() {
    if (empty()) {
      return 0;
    }
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -36,12 +36,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) {
               sizeof(*file_number));
 }

-TableCache::TableCache(const Options* options,
-                       const EnvOptions& storage_options, Cache* const cache)
-    : env_(options->env),
-      db_paths_(options->db_paths),
-      options_(options),
-      storage_options_(storage_options),
+TableCache::TableCache(const ImmutableCFOptions& ioptions,
+                       const EnvOptions& env_options, Cache* const cache)
+    : ioptions_(ioptions),
+      env_options_(env_options),
      cache_(cache) {}

 TableCache::~TableCache() {
@ -55,7 +53,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
  cache_->Release(handle);
 }

-Status TableCache::FindTable(const EnvOptions& toptions,
+Status TableCache::FindTable(const EnvOptions& env_options,
                             const InternalKeyComparator& internal_comparator,
                             const FileDescriptor& fd, Cache::Handle** handle,
                             const bool no_io) {
@ -68,24 +66,24 @@ Status TableCache::FindTable(const EnvOptions& toptions,
      return Status::Incomplete("Table not found in table_cache, no_io is set");
    }
    std::string fname =
-        TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId());
+        TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
    unique_ptr<RandomAccessFile> file;
    unique_ptr<TableReader> table_reader;
-    s = env_->NewRandomAccessFile(fname, &file, toptions);
-    RecordTick(options_->statistics.get(), NO_FILE_OPENS);
+    s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
+    RecordTick(ioptions_.statistics, NO_FILE_OPENS);
    if (s.ok()) {
-      if (options_->advise_random_on_open) {
+      if (ioptions_.advise_random_on_open) {
        file->Hint(RandomAccessFile::RANDOM);
      }
-      StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS);
-      s = options_->table_factory->NewTableReader(
-          *options_, toptions, internal_comparator, std::move(file),
+      StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+      s = ioptions_.table_factory->NewTableReader(
+          ioptions_, env_options, internal_comparator, std::move(file),
          fd.GetFileSize(), &table_reader);
    }

    if (!s.ok()) {
      assert(table_reader == nullptr);
-      RecordTick(options_->statistics.get(), NO_FILE_ERRORS);
+      RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
    } else {
@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions,
 }

 Iterator* TableCache::NewIterator(const ReadOptions& options,
-                                  const EnvOptions& toptions,
+                                  const EnvOptions& env_options,
                                  const InternalKeyComparator& icomparator,
                                  const FileDescriptor& fd,
                                  TableReader** table_reader_ptr,
@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  Cache::Handle* handle = nullptr;
  Status s;
  if (table_reader == nullptr) {
-    s = FindTable(toptions, icomparator, fd, &handle,
+    s = FindTable(env_options, icomparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (!s.ok()) {
      return NewErrorIterator(s, arena);
@ -142,7 +140,7 @@ Status TableCache::Get(const ReadOptions& options,
  Status s;
  Cache::Handle* handle = nullptr;
  if (!t) {
-    s = FindTable(storage_options_, internal_comparator, fd, &handle,
+    s = FindTable(env_options_, internal_comparator, fd, &handle,
                  options.read_tier == kBlockCacheTier);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
@ -160,8 +158,9 @@ Status TableCache::Get(const ReadOptions& options,
  }
  return s;
 }
+
 Status TableCache::GetTableProperties(
-    const EnvOptions& toptions,
+    const EnvOptions& env_options,
    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
    std::shared_ptr<const TableProperties>* properties, bool no_io) {
  Status s;
@ -174,7 +173,7 @@ Status TableCache::GetTableProperties(
  }

  Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io);
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io);
  if (!s.ok()) {
    return s;
  }
@ -186,7 +185,7 @@ Status TableCache::GetTableProperties(
 }

 size_t TableCache::GetMemoryUsageByTableReader(
-    const EnvOptions& toptions,
+    const EnvOptions& env_options,
    const InternalKeyComparator& internal_comparator,
    const FileDescriptor& fd) {
  Status s;
@ -197,7 +196,7 @@ size_t TableCache::GetMemoryUsageByTableReader(
  }

  Cache::Handle* table_handle = nullptr;
-  s = FindTable(toptions, internal_comparator, fd, &table_handle, true);
+  s = FindTable(env_options, internal_comparator, fd, &table_handle, true);
  if (!s.ok()) {
    return 0;
  }
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -19,6 +19,7 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include "rocksdb/options.h"
 #include "table/table_reader.h"

 namespace rocksdb {
@ -29,8 +30,8 @@ struct FileDescriptor;

 class TableCache {
 public:
-  TableCache(const Options* options, const EnvOptions& storage_options,
-             Cache* cache);
+  TableCache(const ImmutableCFOptions& ioptions,
+             const EnvOptions& storage_options, Cache* cache);
  ~TableCache();

  // Return an iterator for the specified file number (the corresponding
@ -91,10 +92,8 @@ class TableCache {
  void ReleaseHandle(Cache::Handle* handle);

 private:
-  Env* const env_;
-  const std::vector<DbPath> db_paths_;
-  const Options* options_;
-  const EnvOptions& storage_options_;
+  const ImmutableCFOptions& ioptions_;
+  const EnvOptions& env_options_;
  Cache* const cache_;
 };

--- a/db/table_properties_collector_test.cc
+++ b/db/table_properties_collector_test.cc
@ -11,6 +11,7 @@
 #include "db/dbformat.h"
 #include "db/table_properties_collector.h"
 #include "rocksdb/table.h"
+#include "rocksdb/immutable_options.h"
 #include "table/block_based_table_factory.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
@ -85,12 +86,14 @@ class DumbLogger : public Logger {
 // Utilities test functions
 namespace {
 void MakeBuilder(const Options& options,
+                 const ImmutableCFOptions& ioptions,
                 const InternalKeyComparator& internal_comparator,
                 std::unique_ptr<FakeWritableFile>* writable,
                 std::unique_ptr<TableBuilder>* builder) {
  writable->reset(new FakeWritableFile);
-  builder->reset(options.table_factory->NewTableBuilder(
-      options, internal_comparator, writable->get(), options.compression));
+  builder->reset(ioptions.table_factory->NewTableBuilder(
+      ioptions, internal_comparator, writable->get(),
+      options.compression, options.compression_opts));
 }
 }  // namespace

@ -153,7 +156,8 @@ void TestCustomizedTablePropertiesCollector(
  // -- Step 1: build table
  std::unique_ptr<TableBuilder> builder;
  std::unique_ptr<FakeWritableFile> writable;
-  MakeBuilder(options, internal_comparator, &writable, &builder);
+  const ImmutableCFOptions ioptions(options);
+  MakeBuilder(options, ioptions, internal_comparator, &writable, &builder);

  for (const auto& kv : kvs) {
    if (encode_as_internal) {
@ -257,16 +261,17 @@ void TestInternalKeyPropertiesCollector(
    // SanitizeOptions().
    options.info_log = std::make_shared<DumbLogger>();
    options = SanitizeOptions("db",            // just a place holder
-                              &pikc, nullptr,  // don't care filter policy
+                              &pikc,
                              options);
    options.comparator = comparator;
  } else {
    options.table_properties_collector_factories = {
        std::make_shared<InternalKeyPropertiesCollectorFactory>()};
  }
+  const ImmutableCFOptions ioptions(options);

  for (int iter = 0; iter < 2; ++iter) {
-    MakeBuilder(options, pikc, &writable, &builder);
+    MakeBuilder(options, ioptions, pikc, &writable, &builder);
    for (const auto& k : keys) {
      builder->Add(k.Encode(), "val");
    }
--- a/db/version_edit.h
+++ b/db/version_edit.h
@ -163,13 +163,13 @@ class VersionEdit {
  // Add the specified file at the specified number.
  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
-  void AddFile(int level, uint64_t file, uint64_t file_size,
-               uint64_t file_path_id, const InternalKey& smallest,
+  void AddFile(int level, uint64_t file, uint64_t file_path_id,
+               uint64_t file_size, const InternalKey& smallest,
               const InternalKey& largest, const SequenceNumber& smallest_seqno,
               const SequenceNumber& largest_seqno) {
    assert(smallest_seqno <= largest_seqno);
    FileMetaData f;
-    f.fd = FileDescriptor(file, file_size, file_path_id);
+    f.fd = FileDescriptor(file, file_path_id, file_size);
    f.smallest = smallest;
    f.largest = largest;
    f.smallest_seqno = smallest_seqno;
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -9,7 +9,10 @@

 #include "db/version_set.h"

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <algorithm>
 #include <map>
@ -509,9 +512,9 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
                                   const FileMetaData* file_meta,
                                   const std::string* fname) {
  auto table_cache = cfd_->table_cache();
-  auto options = cfd_->options();
+  auto ioptions = cfd_->ioptions();
  Status s = table_cache->GetTableProperties(
-      vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd,
+      vset_->env_options_, cfd_->internal_comparator(), file_meta->fd,
      tp, true /* no io */);
  if (s.ok()) {
    return s;
@ -527,13 +530,13 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
  // directly from the properties block in the file.
  std::unique_ptr<RandomAccessFile> file;
  if (fname != nullptr) {
-    s = options->env->NewRandomAccessFile(
-        *fname, &file, vset_->storage_options_);
+    s = ioptions->env->NewRandomAccessFile(
+        *fname, &file, vset_->env_options_);
  } else {
-    s = options->env->NewRandomAccessFile(
-        TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(),
+    s = ioptions->env->NewRandomAccessFile(
+        TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
                      file_meta->fd.GetPathId()),
-        &file, vset_->storage_options_);
+        &file, vset_->env_options_);
  }
  if (!s.ok()) {
    return s;
@ -545,11 +548,11 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
  s = ReadTableProperties(
      file.get(), file_meta->fd.GetFileSize(),
      Footer::kInvalidTableMagicNumber /* table's magic number */,
-      vset_->env_, options->info_log.get(), &raw_table_properties);
+      vset_->env_, ioptions->info_log, &raw_table_properties);
  if (!s.ok()) {
    return s;
  }
-  RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+  RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);

  *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
  return s;
@ -559,7 +562,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
  for (int level = 0; level < num_levels_; level++) {
    for (const auto& file_meta : files_[level]) {
      auto fname =
-          TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(),
+          TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
                        file_meta->fd.GetPathId());
      // 1. If the table is already present in table cache, load table
      // properties from there.
@ -581,7 +584,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
  for (auto& file_level : file_levels_) {
    for (size_t i = 0; i < file_level.num_files; i++) {
      total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
-          vset_->storage_options_, cfd_->internal_comparator(),
+          vset_->env_options_, cfd_->internal_comparator(),
          file_level.files[i].fd);
    }
  }
@ -596,31 +599,6 @@ uint64_t Version::GetEstimatedActiveKeys() {
  return num_non_deletions_ - num_deletions_;
 }

-void Version::AddIterators(const ReadOptions& read_options,
-                           const EnvOptions& soptions,
-                           std::vector<Iterator*>* iters) {
-  // Merge all level zero files together since they may overlap
-  for (size_t i = 0; i < file_levels_[0].num_files; i++) {
-    const auto& file = file_levels_[0].files[i];
-    iters->push_back(cfd_->table_cache()->NewIterator(
-        read_options, soptions, cfd_->internal_comparator(), file.fd));
-  }
-
-  // For levels > 0, we can use a concatenating iterator that sequentially
-  // walks through the non-overlapping files in the level, opening them
-  // lazily.
-  for (int level = 1; level < num_levels_; level++) {
-    if (file_levels_[level].num_files != 0) {
-      iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState(
-          cfd_->table_cache(), read_options, soptions,
-          cfd_->internal_comparator(), false /* for_compaction */,
-          cfd_->options()->prefix_extractor != nullptr),
-        new LevelFileNumIterator(cfd_->internal_comparator(),
-            &file_levels_[level])));
-    }
-  }
-}
-
 void Version::AddIterators(const ReadOptions& read_options,
                           const EnvOptions& soptions,
                           MergeIteratorBuilder* merge_iter_builder) {
@ -641,7 +619,7 @@ void Version::AddIterators(const ReadOptions& read_options,
          new LevelFileIteratorState(
              cfd_->table_cache(), read_options, soptions,
              cfd_->internal_comparator(), false /* for_compaction */,
-              cfd_->options()->prefix_extractor != nullptr),
+              cfd_->ioptions()->prefix_extractor != nullptr),
          new LevelFileNumIterator(cfd_->internal_comparator(),
              &file_levels_[level]), merge_iter_builder->GetArena()));
    }
@ -757,10 +735,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
          (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()),
      table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()),
      merge_operator_((cfd == nullptr) ? nullptr
-                                       : cfd->options()->merge_operator.get()),
-      info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()),
+                                       : cfd->ioptions()->merge_operator),
+      info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log),
      db_statistics_((cfd == nullptr) ? nullptr
-                                      : cfd->options()->statistics.get()),
+                                      : cfd->ioptions()->statistics),
      // cfd is nullptr if Version is dummy
      num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()),
      num_non_empty_levels_(num_levels_),
@ -886,7 +864,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
  Status s = GetTableProperties(&tp, file_meta);
  file_meta->init_stats_from_file = true;
  if (!s.ok()) {
-    Log(vset_->options_->info_log,
+    Log(vset_->db_options_->info_log,
        "Unable to load table properties for file %" PRIu64 " --- %s\n",
        file_meta->fd.GetNumber(), s.ToString().c_str());
    return false;
@ -969,7 +947,7 @@ void Version::ComputeCompactionScore(
          numfiles++;
        }
      }
-      if (cfd_->options()->compaction_style == kCompactionStyleFIFO) {
+      if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) {
        score = static_cast<double>(total_size) /
                cfd_->options()->compaction_options_fifo.max_table_files_size;
      } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) {
@ -1038,8 +1016,8 @@ void Version::UpdateNumNonEmptyLevels() {
 }

 void Version::UpdateFilesBySize() {
-  if (cfd_->options()->compaction_style == kCompactionStyleFIFO ||
-      cfd_->options()->compaction_style == kCompactionStyleUniversal) {
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO ||
+      cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
    // don't need this
    return;
  }
@ -1699,7 +1677,7 @@ class VersionSet::Builder {
      for (auto& file_meta : *(levels_[level].added_files)) {
        assert (!file_meta->table_reader_handle);
        cfd_->table_cache()->FindTable(
-            base_->vset_->storage_options_, cfd_->internal_comparator(),
+            base_->vset_->env_options_, cfd_->internal_comparator(),
            file_meta->fd, &file_meta->table_reader_handle, false);
        if (file_meta->table_reader_handle != nullptr) {
          // Load table_reader
@ -1727,13 +1705,14 @@ class VersionSet::Builder {
  }
 };

-VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
-                       const EnvOptions& storage_options, Cache* table_cache)
-    : column_family_set_(new ColumnFamilySet(dbname, options, storage_options,
-                                             table_cache)),
-      env_(options->env),
+VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options,
+                       const EnvOptions& env_options, Cache* table_cache,
+                       WriteController* write_controller)
+    : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options,
+                                             table_cache, write_controller)),
+      env_(db_options->env),
      dbname_(dbname),
-      options_(options),
+      db_options_(db_options),
      next_file_number_(2),
      manifest_file_number_(0),  // Filled by Recover()
      pending_manifest_file_number_(0),
@ -1741,8 +1720,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
      prev_log_number_(0),
      current_version_number_(0),
      manifest_file_size_(0),
-      storage_options_(storage_options),
-      storage_options_compactions_(storage_options_) {}
+      env_options_(env_options),
+      env_options_compactions_(env_options_) {}

 VersionSet::~VersionSet() {
  // we need to delete column_family_set_ because its destructor depends on
@ -1844,7 +1823,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,

  assert(pending_manifest_file_number_ == 0);
  if (!descriptor_log_ ||
-      manifest_file_size_ > options_->max_manifest_file_size) {
+      manifest_file_size_ > db_options_->max_manifest_file_size) {
    pending_manifest_file_number_ = NewFileNumber();
    batch_edits.back()->SetNextFile(next_file_number_);
    new_descriptor_log = true;
@ -1872,7 +1851,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,

    mu->Unlock();

-    if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) {
+    if (!edit->IsColumnFamilyManipulation() &&
+        db_options_->max_open_files == -1) {
      // unlimited table cache. Pre-load table handle now.
      // Need to do it out of the mutex.
      builder->LoadTableHandlers();
@ -1882,15 +1862,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
    // only one thread can be here at the same time
    if (new_descriptor_log) {
      // create manifest file
-      Log(options_->info_log,
+      Log(db_options_->info_log,
          "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_);
      unique_ptr<WritableFile> descriptor_file;
      s = env_->NewWritableFile(
          DescriptorFileName(dbname_, pending_manifest_file_number_),
-          &descriptor_file, env_->OptimizeForManifestWrite(storage_options_));
+          &descriptor_file, env_->OptimizeForManifestWrite(env_options_));
      if (s.ok()) {
        descriptor_file->SetPreallocationBlockSize(
-            options_->manifest_preallocation_size);
+            db_options_->manifest_preallocation_size);
        descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
        s = WriteSnapshot(descriptor_log_.get());
      }
@ -1911,19 +1891,20 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
          break;
        }
      }
-      if (s.ok()) {
-        if (options_->use_fsync) {
-          StopWatch sw(env_, options_->statistics.get(),
+      if (s.ok() && db_options_->disableDataSync == false) {
+        if (db_options_->use_fsync) {
+          StopWatch sw(env_, db_options_->statistics.get(),
                       MANIFEST_FILE_SYNC_MICROS);
          s = descriptor_log_->file()->Fsync();
        } else {
-          StopWatch sw(env_, options_->statistics.get(),
+          StopWatch sw(env_, db_options_->statistics.get(),
                       MANIFEST_FILE_SYNC_MICROS);
          s = descriptor_log_->file()->Sync();
        }
      }
      if (!s.ok()) {
-        Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+        Log(db_options_->info_log, "MANIFEST write: %s\n",
+            s.ToString().c_str());
        bool all_records_in = true;
        for (auto& e : batch_edits) {
          std::string record;
@ -1934,7 +1915,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
          }
        }
        if (all_records_in) {
-          Log(options_->info_log,
+          Log(db_options_->info_log,
              "MANIFEST contains log record despite error; advancing to new "
              "version to prevent mismatch between in-memory and logged state"
              " If paranoid is set, then the db is now in readonly mode.");
@ -1947,10 +1928,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
    // new CURRENT file that points to it.
    if (s.ok() && new_descriptor_log) {
      s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
-                         db_directory);
+                         db_options_->disableDataSync ? nullptr : db_directory);
      if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
        // delete old manifest file
-        Log(options_->info_log,
+        Log(db_options_->info_log,
            "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
            manifest_file_number_, pending_manifest_file_number_);
        // we don't care about an error here, PurgeObsoleteFiles will take care
@ -1964,7 +1945,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
      new_manifest_file_size = descriptor_log_->file()->GetFileSize();
    }

-    LogFlush(options_->info_log);
+    LogFlush(db_options_->info_log);
    mu->Lock();
  }

@ -2000,12 +1981,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
    manifest_file_size_ = new_manifest_file_size;
    prev_log_number_ = edit->prev_log_number_;
  } else {
-    Log(options_->info_log, "Error in committing version %lu to [%s]",
+    Log(db_options_->info_log, "Error in committing version %lu to [%s]",
        (unsigned long)v->GetVersionNumber(),
        column_family_data->GetName().c_str());
    delete v;
    if (new_descriptor_log) {
-      Log(options_->info_log,
+      Log(db_options_->info_log,
        "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
        manifest_file_number_, pending_manifest_file_number_);
      descriptor_log_.reset();
@ -2097,13 +2078,13 @@ Status VersionSet::Recover(
    return Status::Corruption("CURRENT file corrupted");
  }

-  Log(options_->info_log, "Recovering from manifest file: %s\n",
+  Log(db_options_->info_log, "Recovering from manifest file: %s\n",
      manifest_filename.c_str());

  manifest_filename = dbname_ + "/" + manifest_filename;
  unique_ptr<SequentialFile> manifest_file;
  s = env_->NewSequentialFile(manifest_filename, &manifest_file,
-                              storage_options_);
+                              env_options_);
  if (!s.ok()) {
    return s;
  }
@ -2230,7 +2211,7 @@ Status VersionSet::Recover(
      if (cfd != nullptr) {
        if (edit.has_log_number_) {
          if (cfd->GetLogNumber() > edit.log_number_) {
-            Log(options_->info_log,
+            Log(db_options_->info_log,
                "MANIFEST corruption detected, but ignored - Log numbers in "
                "records NOT monotonically increasing");
          } else {
@ -2306,7 +2287,7 @@ Status VersionSet::Recover(
      assert(builders_iter != builders.end());
      auto builder = builders_iter->second;

-      if (options_->max_open_files == -1) {
+      if (db_options_->max_open_files == -1) {
      // unlimited table cache. Pre-load table handle now.
      // Need to do it out of the mutex.
        builder->LoadTableHandlers();
@ -2327,7 +2308,7 @@ Status VersionSet::Recover(
    last_sequence_ = last_sequence;
    prev_log_number_ = prev_log_number;

-    Log(options_->info_log,
+    Log(db_options_->info_log,
        "Recovered from manifest file:%s succeeded,"
        "manifest_file_number is %lu, next_file_number is %lu, "
        "last_sequence is %lu, log_number is %lu,"
@ -2339,7 +2320,7 @@ Status VersionSet::Recover(
        column_family_set_->GetMaxColumnFamily());

    for (auto cfd : *column_family_set_) {
-      Log(options_->info_log,
+      Log(db_options_->info_log,
          "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
          cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
    }
@ -2422,7 +2403,7 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
 #ifndef ROCKSDB_LITE
 Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
                                        const Options* options,
-                                        const EnvOptions& storage_options,
+                                        const EnvOptions& env_options,
                                        int new_levels) {
  if (new_levels <= 1) {
    return Status::InvalidArgument(
@ -2433,7 +2414,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
  std::shared_ptr<Cache> tc(NewLRUCache(
      options->max_open_files - 10, options->table_cache_numshardbits,
      options->table_cache_remove_scan_count_limit));
-  VersionSet versions(dbname, options, storage_options, tc.get());
+  WriteController wc;
+  VersionSet versions(dbname, options, env_options, tc.get(), &wc);
  Status status;

  std::vector<ColumnFamilyDescriptor> dummy;
@ -2504,7 +2486,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
                                bool verbose, bool hex) {
  // Open the specified manifest file.
  unique_ptr<SequentialFile> file;
-  Status s = options.env->NewSequentialFile(dscname, &file, storage_options_);
+  Status s = options.env->NewSequentialFile(dscname, &file, env_options_);
  if (!s.ok()) {
    return s;
  }
@ -2746,12 +2728,12 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number,
                                  const std::string& record) const {
  std::string fname =
      DescriptorFileName(dbname_, manifest_file_number);
-  Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
+  Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
  unique_ptr<SequentialFile> file;
-  Status s = env_->NewSequentialFile(fname, &file, storage_options_);
+  Status s = env_->NewSequentialFile(fname, &file, env_options_);
  if (!s.ok()) {
-    Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
-    Log(options_->info_log,
+    Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
+    Log(db_options_->info_log,
        "ManifestContains: is unable to reopen the manifest file  %s",
        fname.c_str());
    return false;
@ -2766,7 +2748,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number,
      break;
    }
  }
-  Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
+  Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
  return result;
 }

@ -2794,7 +2776,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
        // approximate offset of "ikey" within the table.
        TableReader* table_reader_ptr;
        Iterator* iter = v->cfd_->table_cache()->NewIterator(
-            ReadOptions(), storage_options_, v->cfd_->internal_comparator(),
+            ReadOptions(), env_options_, v->cfd_->internal_comparator(),
            files[i]->fd, &table_reader_ptr);
        if (table_reader_ptr != nullptr) {
          result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
@ -2856,14 +2838,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
        const FileLevel* flevel = c->input_levels(which);
        for (size_t i = 0; i < flevel->num_files; i++) {
          list[num++] = cfd->table_cache()->NewIterator(
-              read_options, storage_options_compactions_,
+              read_options, env_options_compactions_,
              cfd->internal_comparator(), flevel->files[i].fd, nullptr,
              true /* for compaction */);
        }
      } else {
        // Create concatenating iterator for the files from this level
        list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState(
-              cfd->table_cache(), read_options, storage_options_,
+              cfd->table_cache(), read_options, env_options_,
              cfd->internal_comparator(), true /* for_compaction */,
              false /* prefix enabled */),
            new Version::LevelFileNumIterator(cfd->internal_comparator(),
@ -2884,7 +2866,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
 #ifndef NDEBUG
  Version* version = c->column_family_data()->current();
  if (c->input_version() != version) {
-    Log(options_->info_log,
+    Log(db_options_->info_log,
        "[%s] VerifyCompactionFileConsistency version mismatch",
        c->column_family_data()->GetName().c_str());
  }
@ -2955,11 +2937,11 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
        LiveFileMetaData filemetadata;
        filemetadata.column_family_name = cfd->GetName();
        uint32_t path_id = file->fd.GetPathId();
-        if (path_id < options_->db_paths.size()) {
-          filemetadata.db_path = options_->db_paths[path_id].path;
+        if (path_id < db_options_->db_paths.size()) {
+          filemetadata.db_path = db_options_->db_paths[path_id].path;
        } else {
-          assert(!options_->db_paths.empty());
-          filemetadata.db_path = options_->db_paths.back().path;
+          assert(!db_options_->db_paths.empty());
+          filemetadata.db_path = db_options_->db_paths.back().path;
        }
        filemetadata.name = MakeTableFileName("", file->fd.GetNumber());
        filemetadata.level = level;
@ -2980,17 +2962,21 @@ void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
 }

 ColumnFamilyData* VersionSet::CreateColumnFamily(
-    const ColumnFamilyOptions& options, VersionEdit* edit) {
+    const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
+  // Creates the column family named by `edit` in column_family_set_ using
+  // `cf_options`, appends a first Version to it, gives it a new memtable and
+  // records the edit's log number. Returns the new ColumnFamilyData.
+  // `edit` must be a column-family-add edit.
   assert(edit->is_column_family_add_);

+  // A Version built with a null cfd is a dummy; it is handed to the column
+  // family set together with the name and id taken from the edit.
   Version* dummy_versions = new Version(nullptr, this);
   auto new_cfd = column_family_set_->CreateColumnFamily(
-      edit->column_family_name_, edit->column_family_, dummy_versions, options);
+      edit->column_family_name_, edit->column_family_, dummy_versions,
+      cf_options);

+  // The first real Version of the new column family takes the next version
+  // number from this VersionSet.
   Version* v = new Version(new_cfd, this, current_version_number_++);

   AppendVersion(new_cfd, v);
-  new_cfd->CreateNewMemtable();
+  // GetLatestMutableCFOptions() is safe here without mutex since the
+  // cfd is not available to client
+  new_cfd->CreateNewMemtable(MemTableOptions(
+        *new_cfd->GetLatestMutableCFOptions(), *new_cfd->options()));
   new_cfd->SetLogNumber(edit->log_number_);
   return new_cfd;
  }
--- a/db/version_set.h
+++ b/db/version_set.h
@ -34,6 +34,7 @@
 #include "db/column_family.h"
 #include "db/log_reader.h"
 #include "db/file_indexer.h"
+#include "db/write_controller.h"

 namespace rocksdb {

@ -86,8 +87,6 @@ class Version {
  // Append to *iters a sequence of iterators that will
  // yield the contents of this Version when merged together.
  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
-  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
-                    std::vector<Iterator*>* iters);

  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
                    MergeIteratorBuilder* merger_iter_builder);
@ -257,7 +256,7 @@ class Version {
  class LevelFileNumIterator;
  class LevelFileIteratorState;

-  bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter,
+  bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter,
                      const Slice& internal_prefix) const;

  // Update num_non_empty_levels_.
@ -323,8 +322,8 @@ class Version {
  // These are used to pick the best compaction level
  std::vector<double> compaction_score_;
  std::vector<int> compaction_level_;
-  double max_compaction_score_; // max score in l1 to ln-1
-  int max_compaction_score_level_; // level on which max score occurs
+  double max_compaction_score_ = 0.0;   // max score in l1 to ln-1
+  int max_compaction_score_level_ = 0;  // level on which max score occurs

  // A version number that uniquely represents this version. This is
  // used for debugging and logging purposes only.
@ -358,8 +357,9 @@ class Version {

 class VersionSet {
 public:
-  VersionSet(const std::string& dbname, const DBOptions* options,
-             const EnvOptions& storage_options, Cache* table_cache);
+  VersionSet(const std::string& dbname, const DBOptions* db_options,
+             const EnvOptions& env_options, Cache* table_cache,
+             WriteController* write_controller);
  ~VersionSet();

  // Apply *edit to the current version to form a new descriptor that
@ -397,7 +397,7 @@ class VersionSet {
  // among [4-6] contains files.
  static Status ReduceNumberOfLevels(const std::string& dbname,
                                     const Options* options,
-                                     const EnvOptions& storage_options,
+                                     const EnvOptions& env_options,
                                     int new_levels);

  // printf contents (for debugging)
@ -506,14 +506,14 @@ class VersionSet {
  bool ManifestContains(uint64_t manifest_file_number,
                        const std::string& record) const;

-  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options,
+  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                       VersionEdit* edit);

  std::unique_ptr<ColumnFamilySet> column_family_set_;

  Env* const env_;
  const std::string dbname_;
-  const DBOptions* const options_;
+  const DBOptions* const db_options_;
  uint64_t next_file_number_;
  uint64_t manifest_file_number_;
  uint64_t pending_manifest_file_number_;
@ -534,12 +534,12 @@ class VersionSet {

  std::vector<FileMetaData*> obsolete_files_;

-  // storage options for all reads and writes except compactions
-  const EnvOptions& storage_options_;
+  // env options for all reads and writes except compactions
+  const EnvOptions& env_options_;

-  // storage options used for compactions. This is a copy of
-  // storage_options_ but with readaheads set to readahead_compactions_.
-  const EnvOptions storage_options_compactions_;
+  // env options used for compactions. This is a copy of
+  // env_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions env_options_compactions_;

  // No copying allowed
  VersionSet(const VersionSet&);
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@ -23,10 +23,10 @@
 //    data: uint8[len]

 #include "rocksdb/write_batch.h"
-#include "rocksdb/options.h"
 #include "rocksdb/merge_operator.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
+#include "db/column_family.h"
 #include "db/memtable.h"
 #include "db/snapshot.h"
 #include "db/write_batch_internal.h"
@ -80,6 +80,58 @@ int WriteBatch::Count() const {
  return WriteBatchInternal::Count(this);
 }

+// Decodes the single record at the front of *input and advances *input past
+// the bytes it consumes.
+//
+// Outputs:
+//   *tag            the record's type byte.
+//   *column_family  0 (the default) unless the tag is one of the
+//                   kTypeColumnFamily* variants, which carry an explicit
+//                   varint32 id ahead of the key.
+//   *key, *value    set for Put and Merge records; Delete records set *key
+//                   only.
+//   *blob           set for kTypeLogData records only.
+// Outputs that do not apply to the decoded tag are left untouched.
+//
+// `key` and `value` must be non-null. `blob` is only required to be non-null
+// when a kTypeLogData record is decoded. *input must be non-empty on entry:
+// the tag byte is read before any length check is made.
+//
+// Returns Status::Corruption if a field cannot be decoded or the tag is not
+// recognized; the tag byte has already been removed from *input by then.
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+                                uint32_t* column_family, Slice* key,
+                                Slice* value, Slice* blob) {
+  assert(key != nullptr && value != nullptr);
+  // Consume the one-byte tag; everything after it depends on the tag's value.
+  *tag = (*input)[0];
+  input->remove_prefix(1);
+  *column_family = 0;  // default
+  switch (*tag) {
+    // Each kTypeColumnFamily* case reads the column family id and then falls
+    // through to the matching default-column-family case, which reads the
+    // fields both variants share.
+    case kTypeColumnFamilyValue:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+    // intentional fallthrough
+    case kTypeValue:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Put");
+      }
+      break;
+    case kTypeColumnFamilyDeletion:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+    // intentional fallthrough
+    case kTypeDeletion:
+      // Deletions carry a key only; *value is not written.
+      if (!GetLengthPrefixedSlice(input, key)) {
+        return Status::Corruption("bad WriteBatch Delete");
+      }
+      break;
+    case kTypeColumnFamilyMerge:
+      if (!GetVarint32(input, column_family)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+    // intentional fallthrough
+    case kTypeMerge:
+      if (!GetLengthPrefixedSlice(input, key) ||
+          !GetLengthPrefixedSlice(input, value)) {
+        return Status::Corruption("bad WriteBatch Merge");
+      }
+      break;
+    case kTypeLogData:
+      // The blob output is checked here rather than on entry, so callers
+      // that never see log-data records are not required to supply it.
+      assert(blob != nullptr);
+      if (!GetLengthPrefixedSlice(input, blob)) {
+        return Status::Corruption("bad WriteBatch Blob");
+      }
+      break;
+    default:
+      return Status::Corruption("unknown WriteBatch tag");
+  }
+  return Status::OK();
+}
+
 Status WriteBatch::Iterate(Handler* handler) const {
  Slice input(rep_);
  if (input.size() < kHeader) {
@ -91,57 +143,33 @@ Status WriteBatch::Iterate(Handler* handler) const {
  int found = 0;
  Status s;
  while (s.ok() && !input.empty() && handler->Continue()) {
-    char tag = input[0];
-    input.remove_prefix(1);
+    char tag = 0;
    uint32_t column_family = 0;  // default
+
+    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+                                 &blob);
+    if (!s.ok()) {
+      return s;
+    }
+
    switch (tag) {
      case kTypeColumnFamilyValue:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Put");
-        }
-      // intentional fallthrough
      case kTypeValue:
-        if (GetLengthPrefixedSlice(&input, &key) &&
-            GetLengthPrefixedSlice(&input, &value)) {
-          s = handler->PutCF(column_family, key, value);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Put");
-        }
+        s = handler->PutCF(column_family, key, value);
+        found++;
        break;
      case kTypeColumnFamilyDeletion:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Delete");
-        }
-      // intentional fallthrough
      case kTypeDeletion:
-        if (GetLengthPrefixedSlice(&input, &key)) {
-          s = handler->DeleteCF(column_family, key);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Delete");
-        }
+        s = handler->DeleteCF(column_family, key);
+        found++;
        break;
      case kTypeColumnFamilyMerge:
-        if (!GetVarint32(&input, &column_family)) {
-          return Status::Corruption("bad WriteBatch Merge");
-        }
-      // intentional fallthrough
      case kTypeMerge:
-        if (GetLengthPrefixedSlice(&input, &key) &&
-            GetLengthPrefixedSlice(&input, &value)) {
-          s = handler->MergeCF(column_family, key, value);
-          found++;
-        } else {
-          return Status::Corruption("bad WriteBatch Merge");
-        }
+        s = handler->MergeCF(column_family, key, value);
+        found++;
        break;
      case kTypeLogData:
-        if (GetLengthPrefixedSlice(&input, &blob)) {
-          handler->LogData(blob);
-        } else {
-          return Status::Corruption("bad WriteBatch Blob");
-        }
+        handler->LogData(blob);
        break;
      default:
        return Status::Corruption("unknown WriteBatch tag");
@ -186,17 +214,6 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
  PutLengthPrefixedSlice(&b->rep_, value);
 }

-namespace {
-inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
-  uint32_t column_family_id = 0;
-  if (column_family != nullptr) {
-    auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-    column_family_id = cfh->GetID();
-  }
-  return column_family_id;
-}
-}  // namespace
-
 void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
                     const Slice& value) {
  WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
@ -281,17 +298,17 @@ class MemTableInserter : public WriteBatch::Handler {
 public:
  SequenceNumber sequence_;
  ColumnFamilyMemTables* cf_mems_;
-  bool recovery_;
+  bool ignore_missing_column_families_;
  uint64_t log_number_;
  DBImpl* db_;
  const bool dont_filter_deletes_;

  MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
-                   bool recovery, uint64_t log_number, DB* db,
-                   const bool dont_filter_deletes)
+                   bool ignore_missing_column_families, uint64_t log_number,
+                   DB* db, const bool dont_filter_deletes)
      : sequence_(sequence),
        cf_mems_(cf_mems),
-        recovery_(recovery),
+        ignore_missing_column_families_(ignore_missing_column_families),
        log_number_(log_number),
        db_(reinterpret_cast<DBImpl*>(db)),
        dont_filter_deletes_(dont_filter_deletes) {
@ -303,12 +320,18 @@ class MemTableInserter : public WriteBatch::Handler {

  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
    bool found = cf_mems_->Seek(column_family_id);
-    if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) {
-      // if in recovery envoronment:
-      // * If column family was not found, it might mean that the WAL write
-      // batch references to the column family that was dropped after the
-      // insert. We don't want to fail the whole write batch in that case -- we
-      // just ignore the update.
+    if (!found) {
+      if (ignore_missing_column_families_) {
+        *s = Status::OK();
+      } else {
+        *s = Status::InvalidArgument(
+            "Invalid column family specified in write batch");
+      }
+      return false;
+    }
+    if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
+      // This is true only in recovery environment (log_number_ is always 0 in
+      // non-recovery, regular write code-path)
      // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
      // family already contains updates from this log. We can't apply updates
      // twice because of update-in-place or merge workloads -- ignore the
@ -316,18 +339,8 @@ class MemTableInserter : public WriteBatch::Handler {
      *s = Status::OK();
      return false;
    }
-    if (!found) {
-      assert(!recovery_);
-      // If the column family was not found in non-recovery enviornment
-      // (client's write code-path), we have to fail the write and return
-      // the failure status to the client.
-      *s = Status::InvalidArgument(
-          "Invalid column family specified in write batch");
-      return false;
-    }
    return true;
  }
-
  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                       const Slice& value) {
    Status seek_status;
@ -336,14 +349,15 @@ class MemTableInserter : public WriteBatch::Handler {
      return seek_status;
    }
    MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
-    if (!options->inplace_update_support) {
+    auto* ioptions = mem->GetImmutableOptions();
+    auto* moptions = mem->GetMemTableOptions();
+    if (!moptions->inplace_update_support) {
      mem->Add(sequence_, kTypeValue, key, value);
-    } else if (options->inplace_callback == nullptr) {
+    } else if (moptions->inplace_callback == nullptr) {
      mem->Update(sequence_, key, value);
-      RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED);
+      RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED);
    } else {
-      if (mem->UpdateCallback(sequence_, key, value, *options)) {
+      if (mem->UpdateCallback(sequence_, key, value)) {
      } else {
        // key not found in memtable. Do sst get, update, add
        SnapshotImpl read_from_snapshot;
@ -362,17 +376,17 @@ class MemTableInserter : public WriteBatch::Handler {

        char* prev_buffer = const_cast<char*>(prev_value.c_str());
        uint32_t prev_size = prev_value.size();
-        auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr,
-                                                s.ok() ? &prev_size : nullptr,
-                                                value, &merged_value);
+        auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
+                                                 s.ok() ? &prev_size : nullptr,
+                                                 value, &merged_value);
        if (status == UpdateStatus::UPDATED_INPLACE) {
          // prev_value is updated in-place with final value.
          mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
-          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+          RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN);
        } else if (status == UpdateStatus::UPDATED) {
          // merged_value contains the final value.
          mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
-          RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN);
+          RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN);
        }
      }
    }
@ -380,6 +394,7 @@ class MemTableInserter : public WriteBatch::Handler {
    // sequence number. Even if the update eventually fails and does not result
    // in memtable add/update.
    sequence_++;
+    cf_mems_->CheckMemtableFull();
    return Status::OK();
  }

@ -391,17 +406,18 @@ class MemTableInserter : public WriteBatch::Handler {
      return seek_status;
    }
    MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
+    auto* ioptions = mem->GetImmutableOptions();
+    auto* moptions = mem->GetMemTableOptions();
    bool perform_merge = false;

-    if (options->max_successive_merges > 0 && db_ != nullptr) {
+    if (moptions->max_successive_merges > 0 && db_ != nullptr) {
      LookupKey lkey(key, sequence_);

      // Count the number of successive merges at the head
      // of the key in the memtable
      size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);

-      if (num_merges >= options->max_successive_merges) {
+      if (num_merges >= moptions->max_successive_merges) {
        perform_merge = true;
      }
    }
@ -425,16 +441,16 @@ class MemTableInserter : public WriteBatch::Handler {
      Slice get_value_slice = Slice(get_value);

      // 2) Apply this merge
-      auto merge_operator = options->merge_operator.get();
+      auto merge_operator = ioptions->merge_operator;
      assert(merge_operator);

      std::deque<std::string> operands;
      operands.push_front(value.ToString());
      std::string new_value;
      if (!merge_operator->FullMerge(key, &get_value_slice, operands,
-                                     &new_value, options->info_log.get())) {
+                                     &new_value, ioptions->info_log)) {
          // Failed to merge!
-        RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES);
+        RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES);

        // Store the delta in memtable
        perform_merge = false;
@ -450,6 +466,7 @@ class MemTableInserter : public WriteBatch::Handler {
    }

    sequence_++;
+    cf_mems_->CheckMemtableFull();
    return Status::OK();
  }

@ -460,8 +477,9 @@ class MemTableInserter : public WriteBatch::Handler {
      return seek_status;
    }
    MemTable* mem = cf_mems_->GetMemTable();
-    const Options* options = cf_mems_->GetOptions();
-    if (!dont_filter_deletes_ && options->filter_deletes) {
+    auto* ioptions = mem->GetImmutableOptions();
+    auto* moptions = mem->GetMemTableOptions();
+    if (!dont_filter_deletes_ && moptions->filter_deletes) {
      SnapshotImpl read_from_snapshot;
      read_from_snapshot.number_ = sequence_;
      ReadOptions ropts;
@ -472,12 +490,13 @@ class MemTableInserter : public WriteBatch::Handler {
        cf_handle = db_->DefaultColumnFamily();
      }
      if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
-        RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES);
+        RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES);
        return Status::OK();
      }
    }
    mem->Add(sequence_, kTypeDeletion, key, Slice());
    sequence_++;
+    cf_mems_->CheckMemtableFull();
    return Status::OK();
  }
 };
@ -485,10 +504,12 @@ class MemTableInserter : public WriteBatch::Handler {

 Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      ColumnFamilyMemTables* memtables,
-                                      bool recovery, uint64_t log_number,
-                                      DB* db, const bool dont_filter_deletes) {
+                                      bool ignore_missing_column_families,
+                                      uint64_t log_number, DB* db,
+                                      const bool dont_filter_deletes) {
  MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
-                            recovery, log_number, db, dont_filter_deletes);
+                            ignore_missing_column_families, log_number, db,
+                            dont_filter_deletes);
  return b->Iterate(&inserter);
 }

--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@ -28,6 +28,7 @@ class ColumnFamilyMemTables {
  virtual MemTable* GetMemTable() const = 0;
  virtual const Options* GetOptions() const = 0;
  virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+  virtual void CheckMemtableFull() = 0;
 };

 class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
@ -54,6 +55,8 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {

  ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }

+  void CheckMemtableFull() override {}
+
 private:
  bool ok_;
  MemTable* mem_;
@ -106,18 +109,18 @@ class WriteBatchInternal {
  // Inserts batch entries into memtable
  // If dont_filter_deletes is false AND options.filter_deletes is true,
  // then --> Drops deletes in batch if db->KeyMayExist returns false
-  // If recovery == true, this means InsertInto is executed on a recovery
-  // code-path. WriteBatch referencing a dropped column family can be
-  // found on a recovery code-path and should be ignored (recovery should not
-  // fail). Additionally, the memtable will be updated only if
+  // If ignore_missing_column_families == true, updates in the WriteBatch that
+  // reference a non-existing column family are ignored.
+  // However, if ignore_missing_column_families == false, any WriteBatch
+  // referencing a non-existing column family will return an InvalidArgument()
+  // failure.
+  //
+  // If log_number is non-zero, the memtable will be updated only if
  // memtables->GetLogNumber() <= log_number
-  // However, if recovery == false, any WriteBatch referencing
-  // non-existing column family will return a failure. Also, log_number is
-  // ignored in that case
  static Status InsertInto(const WriteBatch* batch,
                           ColumnFamilyMemTables* memtables,
-                           bool recovery = false, uint64_t log_number = 0,
-                           DB* db = nullptr,
+                           bool ignore_missing_column_families = false,
+                           uint64_t log_number = 0, DB* db = nullptr,
                           const bool dont_filter_deletes = true);

  static void Append(WriteBatch* dst, const WriteBatch* src);
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@ -15,8 +15,10 @@
 #include "db/write_batch_internal.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
 #include "util/logging.h"
 #include "util/testharness.h"
+#include "util/scoped_arena_iterator.h"

 namespace rocksdb {

@ -25,13 +27,15 @@ static std::string PrintContents(WriteBatch* b) {
  auto factory = std::make_shared<SkipListFactory>();
  Options options;
  options.memtable_factory = factory;
-  MemTable* mem = new MemTable(cmp, options);
+  MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options),
+      MemTableOptions(MutableCFOptions(options), options));
  mem->Ref();
  std::string state;
  ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
  Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
  int count = 0;
-  Iterator* iter = mem->NewIterator(ReadOptions());
+  Arena arena;
+  ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    ParsedInternalKey ikey;
    memset((void *)&ikey, 0, sizeof(ikey));
@ -66,7 +70,6 @@ static std::string PrintContents(WriteBatch* b) {
    state.append("@");
    state.append(NumberToString(ikey.sequence));
  }
-  delete iter;
  if (!s.ok()) {
    state.append(s.ToString());
  } else if (count != WriteBatchInternal::Count(b)) {
@ -286,6 +289,9 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
  explicit ColumnFamilyHandleImplDummy(int id)
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
  uint32_t GetID() const override { return id_; }
+  // Comparator reported by the dummy handle: always the bytewise comparator.
+  const Comparator* user_comparator() const override {
+    const Comparator* const bytewise = BytewiseComparator();
+    return bytewise;
+  }

 private:
  uint32_t id_;
@ -316,6 +322,88 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
      handler.seen);
 }

+// Checks that WriteBatchWithIndex exposes, per column family, the records
+// written to it, and that the wrapped WriteBatch still replays every
+// operation in insertion order.
+TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+  WriteBatchWithIndex batch;
+  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Put(&two, Slice("twofoo"), Slice("bar2"));
+  batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
+  batch.Delete(&eight, Slice("eightfoo"));
+  batch.Merge(&three, Slice("threethree"), Slice("3three"));
+  batch.Put(&zero, Slice("foo"), Slice("bar"));
+  batch.Merge(Slice("omom"), Slice("nom"));
+
+  std::unique_ptr<WBWIIterator> iter;
+
+  // Column family 8: a Put followed by a Delete of the same key; both records
+  // are returned, in the order they were written.
+  iter.reset(batch.NewIterator(&eight));
+  iter->Seek("eightfoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  // Iterator created without a handle: seeking past "foo" lands on the Merge
+  // that was issued without a column family handle.
+  iter.reset(batch.NewIterator());
+  iter->Seek("gggg");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  // Handle with id 0: both Puts of "foo" are seen, followed by the
+  // handle-less Merge, i.e. it iterates the same records as the iterator
+  // created without a handle.
+  iter.reset(batch.NewIterator(&zero));
+  iter->Seek("foo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("foo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+  ASSERT_EQ("omom", iter->Entry().key.ToString());
+  ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
+  // Replay the underlying WriteBatch. Iterate() reports a malformed batch
+  // through its Status, so check it rather than dropping it.
+  TestHandler handler;
+  ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
+  ASSERT_EQ(
+      "Put(foo, bar)"
+      "PutCF(2, twofoo, bar2)"
+      "PutCF(8, eightfoo, bar8)"
+      "DeleteCF(8, eightfoo)"
+      "MergeCF(3, threethree, 3three)"
+      "Put(foo, bar)"
+      "Merge(omom, nom)",
+      handler.seen);
+}
+
 }  // namespace rocksdb

 int main(int argc, char** argv) {
--- a/db/write_controller.cc
+++ b/db/write_controller.cc
@ -0,0 +1,37 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/write_controller.h"
+
+#include <cassert>
+
+namespace rocksdb {
+
+// Registers one more outstanding stop request and returns the token that owns
+// it; ~StopWriteToken() undoes the increment when the token is destroyed.
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+  // Allocate before touching the counter: if the allocation throws, the
+  // controller must not be left counting a stop request that no token will
+  // ever release.
+  std::unique_ptr<WriteControllerToken> token(new StopWriteToken(this));
+  ++total_stopped_;
+  return token;
+}
+
+// Adds delay_us to the aggregate write delay and returns the token that owns
+// that contribution; ~DelayWriteToken() subtracts it again on destruction.
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+    uint64_t delay_us) {
+  // Allocate before touching the total: if the allocation throws, the
+  // controller must not keep a delay that no token will ever remove.
+  std::unique_ptr<WriteControllerToken> token(
+      new DelayWriteToken(this, delay_us));
+  total_delay_us_ += delay_us;
+  return token;
+}
+
+// True while the count of outstanding stop tokens is positive.
+bool WriteController::IsStopped() const {
+  const bool any_stop_outstanding = total_stopped_ > 0;
+  return any_stop_outstanding;
+}
+// Current aggregate delay, in microseconds, contributed by live delay tokens.
+uint64_t WriteController::GetDelay() const {
+  const uint64_t aggregate_delay_us = total_delay_us_;
+  return aggregate_delay_us;
+}
+
+// Gives back the stop request this token was created for.
+StopWriteToken::~StopWriteToken() {
+  assert(controller_->total_stopped_ >= 1);
+  int& outstanding_stops = controller_->total_stopped_;
+  outstanding_stops -= 1;
+}
+
+// Removes this token's share from the controller's aggregate delay.
+DelayWriteToken::~DelayWriteToken() {
+  assert(controller_->total_delay_us_ >= delay_us_);
+  uint64_t& aggregate_delay_us = controller_->total_delay_us_;
+  aggregate_delay_us = aggregate_delay_us - delay_us_;
+}
+
+}  // namespace rocksdb
--- a/db/write_controller.h
+++ b/db/write_controller.h
@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+
+#include <memory>
+
+namespace rocksdb {
+
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write
+// stalls happen when compaction can't keep up with write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding DB mutex
+class WriteController {
+ public:
+  // Starts with no stop requests and no delay.
+  WriteController() : total_stopped_(0), total_delay_us_(0) {}
+  ~WriteController() = default;
+
+  // When an actor (column family) requests a stop token, all writes will be
+  // stopped until the stop token is released (deleted)
+  std::unique_ptr<WriteControllerToken> GetStopToken();
+  // When an actor (column family) requests a delay token, total delay for all
+  // writes will be increased by delay_us. The delay will last until delay token
+  // is released
+  std::unique_ptr<WriteControllerToken> GetDelayToken(uint64_t delay_us);
+
+  // These two methods query the state of the WriteController:
+  // IsStopped() is true while at least one stop token is outstanding;
+  // GetDelay() is the sum of delay_us over the outstanding delay tokens.
+  bool IsStopped() const;
+  uint64_t GetDelay() const;
+
+ private:
+  // The token classes adjust the counters below from their destructors.
+  friend class WriteControllerToken;
+  friend class StopWriteToken;
+  friend class DelayWriteToken;
+
+  // Number of stop tokens currently outstanding.
+  int total_stopped_;
+  // Sum, in microseconds, of the delays of all outstanding delay tokens.
+  uint64_t total_delay_us_;
+};
+
+// Base class of the tokens returned by WriteController::GetStopToken() and
+// WriteController::GetDelayToken(). Callers hold tokens through a
+// std::unique_ptr to this base type, so the destructor is virtual to let the
+// derived destructor run when the token is released.
+class WriteControllerToken {
+ public:
+  explicit WriteControllerToken(WriteController* controller)
+      : controller_(controller) {}
+  virtual ~WriteControllerToken() {}
+
+ protected:
+  // Controller whose counters the derived token's destructor updates.
+  WriteController* controller_;
+
+ private:
+  // no copying allowed
+  WriteControllerToken(const WriteControllerToken&) = delete;
+  void operator=(const WriteControllerToken&) = delete;
+};
+
+// Token representing one stop request. Its destructor (defined in
+// write_controller.cc) decrements the controller's count of outstanding stop
+// requests.
+class StopWriteToken : public WriteControllerToken {
+ public:
+  explicit StopWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
+  virtual ~StopWriteToken();
+};
+
+// Token representing a delay of delay_us microseconds. Its destructor (defined
+// in write_controller.cc) subtracts delay_us from the controller's total.
+class DelayWriteToken : public WriteControllerToken {
+ public:
+  DelayWriteToken(WriteController* controller, uint64_t delay_us)
+      : WriteControllerToken(controller), delay_us_(delay_us) {}
+  virtual ~DelayWriteToken();
+
+ private:
+  // Amount this token contributed to the controller's total delay.
+  uint64_t delay_us_;
+};
+
+}  // namespace rocksdb
--- a/db/write_controller_test.cc
+++ b/db/write_controller_test.cc
@ -0,0 +1,40 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/write_controller.h"
+
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+// Empty type naming the test group used by TEST(WriteControllerTest, ...).
+class WriteControllerTest {};
+
+// Walks WriteController through acquiring and releasing stop and delay tokens
+// and checks the reported state after every step.
+TEST(WriteControllerTest, SanityTest) {
+  WriteController controller;
+
+  // Stop tokens: the controller stays stopped until the last one is released.
+  auto first_stop = controller.GetStopToken();
+  auto second_stop = controller.GetStopToken();
+  ASSERT_TRUE(controller.IsStopped());
+  first_stop.reset();
+  ASSERT_TRUE(controller.IsStopped());
+  second_stop.reset();
+  ASSERT_TRUE(!controller.IsStopped());
+
+  // Delay tokens: the reported delay is the sum over live tokens.
+  const uint64_t kFirstDelayUs = 5;
+  const uint64_t kSecondDelayUs = 8;
+  auto first_delay = controller.GetDelayToken(kFirstDelayUs);
+  ASSERT_EQ(kFirstDelayUs, controller.GetDelay());
+  auto second_delay = controller.GetDelayToken(kSecondDelayUs);
+  ASSERT_EQ(kFirstDelayUs + kSecondDelayUs, controller.GetDelay());
+
+  second_delay.reset();
+  ASSERT_EQ(kFirstDelayUs, controller.GetDelay());
+  first_delay.reset();
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay());
+  // Resetting an already-empty unique_ptr does nothing; the controller must
+  // still report that it is not stopped.
+  first_delay.reset();
+  ASSERT_TRUE(!controller.IsStopped());
+}
+
+}  // namespace rocksdb
+
+// Test binary entry point; argc/argv are accepted but not consulted.
+int main(int argc, char** argv) {
+  const int exit_code = rocksdb::test::RunAllTests();
+  return exit_code;
+}
--- a/db/write_thread.cc
+++ b/db/write_thread.cc
@ -0,0 +1,147 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/write_thread.h"
+
+namespace rocksdb {
+
+// Appends "w" to the writer queue and blocks on w->cv until "w" may proceed.
+//
+// w:               writer to enqueue.
+// expiration_time: passed to w->cv.TimedWait(); 0 means wait without a
+//                  time-out.
+// Returns Status::TimedOut() if "w" timed out and was removed from the queue,
+// Status::OK() otherwise. On OK, w->done tells whether another writer has
+// already performed w's work.
+Status WriteThread::EnterWriteThread(WriteThread::Writer* w,
+                                     uint64_t expiration_time) {
+  // The following code block pushes the current writer "w" into the writer
+  // queue "writers_" and waits until one of the following conditions is met:
+  // 1. the job of "w" has been done by some other writer.
+  // 2. "w" becomes the first writer in "writers_"
+  // 3. "w" timed-out.
+  writers_.push_back(w);
+
+  bool timed_out = false;
+  while (!w->done && w != writers_.front()) {
+    if (expiration_time == 0) {
+      w->cv.Wait();
+    } else if (w->cv.TimedWait(expiration_time)) {
+      // A true return from TimedWait() is treated as "the wait timed out".
+      if (w->in_batch_group) {
+        // then it means the front writer is currently doing the
+        // write on behalf of this "timed-out" writer.  Then it
+        // should wait until the write completes.
+        expiration_time = 0;
+      } else {
+        timed_out = true;
+        break;
+      }
+    }
+  }
+
+  if (timed_out) {
+    // Remove "w" from the queue; "found" exists only to back the assert in
+    // debug builds.
+#ifndef NDEBUG
+    bool found = false;
+#endif
+    for (auto iter = writers_.begin(); iter != writers_.end(); iter++) {
+      if (*iter == w) {
+        writers_.erase(iter);
+#ifndef NDEBUG
+        found = true;
+#endif
+        break;
+      }
+    }
+#ifndef NDEBUG
+    assert(found);
+#endif
+    // writers_.front() might still be in cond_wait without a time-out.
+    // As a result, we need to signal it to wake it up.  Otherwise no
+    // one else will wake it up, and RocksDB will hang.
+    if (!writers_.empty()) {
+      writers_.front()->cv.Signal();
+    }
+    return Status::TimedOut();
+  }
+  return Status::OK();
+}
+
+// Removes from the head of the queue every writer up to and including
+// last_writer. Each removed writer other than "w" receives "status", is marked
+// done and is signalled; afterwards the new head of the queue (if any) is
+// signalled so it can proceed.
+void WriteThread::ExitWriteThread(WriteThread::Writer* w,
+                                  WriteThread::Writer* last_writer,
+                                  Status status) {
+  bool reached_last = false;
+  while (!reached_last && !writers_.empty()) {
+    Writer* const finished = writers_.front();
+    writers_.pop_front();
+    reached_last = (finished == last_writer);
+    if (finished == w) {
+      // The caller is "w" itself; it needs neither a status nor a wake-up.
+      continue;
+    }
+    finished->status = status;
+    finished->done = true;
+    finished->cv.Signal();
+  }
+
+  // Wake up whoever is now first in line.
+  if (writers_.empty()) {
+    return;
+  }
+  writers_.front()->cv.Signal();
+}
+
+// Called only once the first writer has succeeded in entering the write
+// thread. Gathers the batch of the head writer plus the batches of the
+// writers queued directly behind it that may be committed together, and
+// reports the last writer whose batch was included.
+//
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-nullptr batch
+void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer,
+                                  autovector<WriteBatch*>* write_batch_group) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  assert(first->batch != nullptr);
+
+  const size_t kSmallWriteBytes = 128 << 10;
+  size_t group_bytes = WriteBatchInternal::ByteSize(first->batch);
+
+  // The group may grow up to 1MB, but a small leading write only tolerates
+  // kSmallWriteBytes of extra payload so that it is not slowed down too much.
+  const size_t byte_budget = (group_bytes <= kSmallWriteBytes)
+                                 ? group_bytes + kSmallWriteBytes
+                                 : (static_cast<size_t>(1) << 20);
+
+  write_batch_group->push_back(first->batch);
+  *last_writer = first;
+
+  // Walk the writers behind "first" and stop at the first one that cannot
+  // join the group.
+  for (auto iter = writers_.begin() + 1; iter != writers_.end(); ++iter) {
+    Writer* const candidate = *iter;
+
+    // A sync write must not ride along with a non-sync leader.
+    const bool wants_sync = candidate->sync && !first->sync;
+    // A write that needs the WAL must not join a group with WAL disabled.
+    const bool wants_wal = !candidate->disableWAL && first->disableWAL;
+    // A shorter timeout might require aborting a write we would execute.
+    const bool shorter_timeout =
+        candidate->timeout_hint_us < first->timeout_hint_us;
+    // Writers without a batch are not writes; they want to be alone.
+    const bool not_a_write = candidate->batch == nullptr;
+    if (wants_sync || wants_wal || shorter_timeout || not_a_write) {
+      break;
+    }
+
+    group_bytes += WriteBatchInternal::ByteSize(candidate->batch);
+    if (group_bytes > byte_budget) {
+      // Do not make batch too big
+      break;
+    }
+
+    write_batch_group->push_back(candidate->batch);
+    candidate->in_batch_group = true;
+    *last_writer = candidate;
+  }
+}
+
+}  // namespace rocksdb
--- a/db/write_thread.h
+++ b/db/write_thread.h
@ -0,0 +1,80 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <stdint.h>
+#include <deque>
+#include <limits>
+#include "rocksdb/status.h"
+#include "db/write_batch_internal.h"
+#include "util/autovector.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+// Queue of writers waiting to perform a write, together with the helpers used
+// to enter the queue, leave it, and group several queued batches into one
+// write.
+class WriteThread {
+ public:
+  // Default value of Writer::timeout_hint_us.
+  static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
+  // Information kept for every waiting writer
+  struct Writer {
+    // Result of the write; assigned by ExitWriteThread for writers whose work
+    // was done by another writer.
+    Status status;
+    // Batch to write; nullptr marks a writer that is not a write and is never
+    // grouped with others (see BuildBatchGroup).
+    WriteBatch* batch;
+    // A writer with sync == true is not grouped behind a non-sync leader.
+    bool sync;
+    // A writer that needs the WAL is not grouped behind a leader that has it
+    // disabled.
+    bool disableWAL;
+    // Set by BuildBatchGroup when this writer's batch joins the leader's group.
+    bool in_batch_group;
+    // Set by ExitWriteThread once another writer has completed this write.
+    bool done;
+    // A writer with a smaller value than the leader's is not grouped with it.
+    uint64_t timeout_hint_us;
+    // Condition variable this writer waits on while queued.
+    port::CondVar cv;
+
+    explicit Writer(port::Mutex* mu)
+        : batch(nullptr),
+          sync(false),
+          disableWAL(false),
+          in_batch_group(false),
+          done(false),
+          timeout_hint_us(kNoTimeOut),
+          cv(mu) {}
+  };
+
+  WriteThread() = default;
+  ~WriteThread() = default;
+
+  // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
+  // thread should grab the mutex_ and be the first on writers queue.
+  // EnterWriteThread is used for it.
+  // Be aware! Writer's job can be done by other thread (see DBImpl::Write
+  // for examples), so check it via w.done before applying changes.
+  //
+  // Writer* w:                writer to be placed in the queue
+  // uint64_t expiration_time: maximum time to be in the queue
+  //                           (0 means wait without a time-out)
+  // See also: ExitWriteThread
+  // REQUIRES: db mutex held
+  Status EnterWriteThread(Writer* w, uint64_t expiration_time);
+
+  // After doing write job, we need to remove already used writers from
+  // writers_ queue and notify head of the queue about it.
+  // ExitWriteThread is used for this.
+  //
+  // Writer* w:           Writer, that was added by EnterWriteThread function
+  // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
+  //                      does)
+  //                      we should pass last_writer as a parameter to
+  //                      ExitWriteThread
+  //                      (if you don't touch other writers, just pass w)
+  // Status status:       Status of write operation
+  // See also: EnterWriteThread
+  // REQUIRES: db mutex held
+  void ExitWriteThread(Writer* w, Writer* last_writer, Status status);
+
+  // Collects into write_batch_group the batch of the first queued writer and
+  // of the writers behind it that can share its write, and stores the last
+  // included writer in *last_writer.
+  // REQUIRES: writer queue is non-empty and its first writer has a batch
+  void BuildBatchGroup(Writer** last_writer,
+                       autovector<WriteBatch*>* write_batch_group);
+
+ private:
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+};
+
+}  // namespace rocksdb
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@ -75,6 +75,8 @@ typedef struct rocksdb_iterator_t        rocksdb_iterator_t;
 typedef struct rocksdb_logger_t          rocksdb_logger_t;
 typedef struct rocksdb_mergeoperator_t   rocksdb_mergeoperator_t;
 typedef struct rocksdb_options_t         rocksdb_options_t;
+typedef struct rocksdb_block_based_table_options_t
+    rocksdb_block_based_table_options_t;
 typedef struct rocksdb_randomfile_t      rocksdb_randomfile_t;
 typedef struct rocksdb_readoptions_t     rocksdb_readoptions_t;
 typedef struct rocksdb_seqfile_t         rocksdb_seqfile_t;
@ -346,6 +348,34 @@ extern void rocksdb_writebatch_iterate(
    void (*deleted)(void*, const char* k, size_t klen));
 extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);

+/* Block based table options */
+
+extern rocksdb_block_based_table_options_t*
+    rocksdb_block_based_options_create();
+extern void rocksdb_block_based_options_destroy(
+    rocksdb_block_based_table_options_t* options);
+extern void rocksdb_block_based_options_set_block_size(
+    rocksdb_block_based_table_options_t* options, size_t block_size);
+extern void rocksdb_block_based_options_set_block_size_deviation(
+    rocksdb_block_based_table_options_t* options, int block_size_deviation);
+extern void rocksdb_block_based_options_set_block_restart_interval(
+    rocksdb_block_based_table_options_t* options, int block_restart_interval);
+extern void rocksdb_block_based_options_set_filter_policy(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_filterpolicy_t* filter_policy);
+extern void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char no_block_cache);
+extern void rocksdb_block_based_options_set_block_cache(
+    rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
+extern void rocksdb_block_based_options_set_block_cache_compressed(
+    rocksdb_block_based_table_options_t* options,
+    rocksdb_cache_t* block_cache_compressed);
+extern void rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options);
+
 /* Options */

 extern rocksdb_options_t* rocksdb_options_create();
@ -353,7 +383,7 @@ extern void rocksdb_options_destroy(rocksdb_options_t*);
 extern void rocksdb_options_increase_parallelism(
    rocksdb_options_t* opt, int total_threads);
 extern void rocksdb_options_optimize_for_point_lookup(
-    rocksdb_options_t* opt);
+    rocksdb_options_t* opt, uint64_t block_cache_size_mb);
 extern void rocksdb_options_optimize_level_style_compaction(
    rocksdb_options_t* opt, uint64_t memtable_memory_budget);
 extern void rocksdb_options_optimize_universal_style_compaction(
@ -376,9 +406,6 @@ extern void rocksdb_options_set_compression_per_level(
  rocksdb_options_t* opt,
  int* level_values,
  size_t num_levels);
-extern void rocksdb_options_set_filter_policy(
-    rocksdb_options_t*,
-    rocksdb_filterpolicy_t*);
 extern void rocksdb_options_set_create_if_missing(
    rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_create_missing_column_families(
@ -392,13 +419,8 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
 extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
 extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
 extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
-extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
-extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
-extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
 extern void rocksdb_options_set_compression_options(
    rocksdb_options_t*, int, int, int);
-extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
 extern void rocksdb_options_set_prefix_extractor(
    rocksdb_options_t*, rocksdb_slicetransform_t*);
 extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
@ -449,8 +471,6 @@ extern void rocksdb_options_set_arena_block_size(
    rocksdb_options_t*, size_t);
 extern void rocksdb_options_set_use_fsync(
    rocksdb_options_t*, int);
-extern void rocksdb_options_set_db_stats_log_interval(
-    rocksdb_options_t*, int);
 extern void rocksdb_options_set_db_log_dir(
    rocksdb_options_t*, const char*);
 extern void rocksdb_options_set_wal_dir(
@ -493,7 +513,6 @@ extern void rocksdb_options_set_max_sequential_skip_in_iterations(
    rocksdb_options_t*, uint64_t);
 extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
 extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
-extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
 extern void rocksdb_options_set_delete_obsolete_files_period_micros(
    rocksdb_options_t*, uint64_t);
 extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
@ -679,6 +698,10 @@ extern void rocksdb_readoptions_set_fill_cache(
 extern void rocksdb_readoptions_set_snapshot(
    rocksdb_readoptions_t*,
    const rocksdb_snapshot_t*);
+extern void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t*,
+    const char* key,
+    size_t keylen);
 extern void rocksdb_readoptions_set_read_tier(
    rocksdb_readoptions_t*, int);
 extern void rocksdb_readoptions_set_tailing(
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@ -127,9 +127,6 @@ class Cache {
  void LRU_Append(Handle* e);
  void Unref(Handle* e);

-  struct Rep;
-  Rep* rep_;
-
  // No copying allowed
  Cache(const Cache&);
  void operator=(const Cache&);
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@ -9,6 +9,7 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
 #define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_

+#include <memory>
 #include <string>
 #include <vector>

--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@ -123,7 +123,7 @@ class DB {

  // Open DB with column families.
  // db_options specify database specific options
-  // column_families is the vector of all column families in the databse,
+  // column_families is the vector of all column families in the database,
  // containing column family name and options. You need to open ALL column
  // families in the database. To get the list of column families, you can use
  // ListColumnFamilies(). Also, you can open only a subset of column families
@ -359,6 +359,14 @@ class DB {
    return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
                        target_level, target_path_id);
  }
+  virtual bool SetOptions(ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return true;
+  }
+  virtual bool SetOptions(
+      const std::unordered_map<std::string, std::string>& new_options) {
+    return SetOptions(DefaultColumnFamily(), new_options);
+  }

  // Number of levels used for this DB.
  virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
--- a/include/rocksdb/filter_policy.h
+++ b/include/rocksdb/filter_policy.h
@ -21,11 +21,52 @@
 #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

 #include <string>
+#include <memory>

 namespace rocksdb {

 class Slice;

+// Collects a set of keys and then generates a filter from them.
+class FilterBitsBuilder {
+ public:
+  virtual ~FilterBitsBuilder() {}
+
+  // Adds a key to the filter. The implementation may store the key in any
+  // form, e.g. as a hash or as the original key bytes.
+  // Keys are added in sorted order, and duplicate keys are possible.
+  virtual void AddKey(const Slice& key) = 0;
+
+  // Generates the filter from the keys that have been added.
+  // Returns a Slice holding the filter bits; ownership of the underlying
+  // data is handed to *buf.
+  virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
+};
+
+// Checks whether a key may be present in a filter.
+// It should be initialized with the Slice generated by FilterBitsBuilder.
+class FilterBitsReader {
+ public:
+  virtual ~FilterBitsReader() {}
+
+  // Checks whether the entry matches the bits in the filter.
+  virtual bool MayMatch(const Slice& entry) = 0;
+};
+
+// We add a new format of filter block called the full filter block.
+// This new interface gives you more room for customization.
+//
+// For the full filter block, you can plug in your own version by
+// implementing FilterBitsBuilder and FilterBitsReader.
+//
+// There are two sets of interfaces in FilterPolicy:
+// Set 1: CreateFilter, KeyMayMatch: used for the block-based filter.
+// Set 2: GetFilterBitsBuilder, GetFilterBitsReader: used for the
+// full filter.
+// Set 1 MUST be implemented correctly; Set 2 is optional.
+// RocksDB first tries the functions in Set 2. If they return nullptr,
+// it uses Set 1 instead.
+// You can choose the filter type in NewBloomFilterPolicy.
 class FilterPolicy {
 public:
  virtual ~FilterPolicy();
@ -51,11 +92,28 @@ class FilterPolicy {
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+
+  // Get the FilterBitsBuilder, which is ONLY used for full filter block
+  // It contains interface to take individual key, then generate filter
+  virtual FilterBitsBuilder* GetFilterBitsBuilder() const {
+    return nullptr;
+  }
+
+  // Get the FilterBitsReader, which is ONLY used for full filter block
+  // It contains interface to tell if key can be in filter
+  // The input slice should NOT be deleted by FilterPolicy
+  virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const {
+    return nullptr;
+  }
 };

 // Return a new filter policy that uses a bloom filter with approximately
-// the specified number of bits per key.  A good value for bits_per_key
+// the specified number of bits per key.
+//
+// bits_per_key: bits per key in bloom filter. A good value for bits_per_key
 // is 10, which yields a filter with ~ 1% false positive rate.
+// use_block_based_builder: use block based filter rather than full filter.
+// If you want to build a full filter, it needs to be set to false.
 //
 // Callers must delete the result after any database that is using the
 // result has been closed.
@ -67,8 +125,8 @@ class FilterPolicy {
 // ignores trailing spaces, it would be incorrect to use a
 // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
 // trailing spaces in keys.
-extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
-
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
+    bool use_block_based_builder = true);
 }

 #endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
--- a/include/rocksdb/flush_block_policy.h
+++ b/include/rocksdb/flush_block_policy.h
@ -6,6 +6,7 @@
 #pragma once

 #include <string>
+#include "rocksdb/table.h"

 namespace rocksdb {

@ -37,7 +38,8 @@ class FlushBlockPolicyFactory {
  // Callers must delete the result after any database that is using the
  // result has been closed.
  virtual FlushBlockPolicy* NewFlushBlockPolicy(
-      const Options& options, const BlockBuilder& data_block_builder) const = 0;
+      const BlockBasedTableOptions& table_options,
+      const BlockBuilder& data_block_builder) const = 0;

  virtual ~FlushBlockPolicyFactory() { }
 };
@ -51,7 +53,7 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
  }

  virtual FlushBlockPolicy* NewFlushBlockPolicy(
-      const Options& options,
+      const BlockBasedTableOptions& table_options,
      const BlockBuilder& data_block_builder) const override;
 };

--- a/include/rocksdb/immutable_options.h
+++ b/include/rocksdb/immutable_options.h
@ -0,0 +1,84 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <vector>
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+// ImmutableCFOptions is a data struct used internally by RocksDB. It contains
+// a subset of Options that should not be changed during the entire lifetime
+// of DB. You shouldn't need to access this data structure unless you are
+// implementing a new TableFactory. Raw pointers defined in this struct do
+// not own the data they point to. Options contains shared_ptr to these
+// data.
+struct ImmutableCFOptions {
+  explicit ImmutableCFOptions(const Options& options);
+
+  CompactionStyle compaction_style;
+
+  CompactionOptionsUniversal compaction_options_universal;
+
+  const SliceTransform* prefix_extractor;
+
+  const Comparator* comparator;
+
+  MergeOperator* merge_operator;
+
+  const CompactionFilter* compaction_filter;
+
+  CompactionFilterFactory* compaction_filter_factory;
+
+  CompactionFilterFactoryV2* compaction_filter_factory_v2;
+
+  Logger* info_log;
+
+  Statistics* statistics;
+
+  InfoLogLevel info_log_level;
+
+  Env* env;
+
+  // Allow the OS to mmap file for reading sst tables. Default: false
+  bool allow_mmap_reads;
+
+  // Allow the OS to mmap file for writing. Default: false
+  bool allow_mmap_writes;
+
+  std::vector<DbPath> db_paths;
+
+  MemTableRepFactory* memtable_factory;
+
+  TableFactory* table_factory;
+
+  Options::TablePropertiesCollectorFactories
+    table_properties_collector_factories;
+
+  bool advise_random_on_open;
+
+  // This option is required by PlainTableReader. May need to move it
+  // to PlainTableOptions just like bloom_bits_per_key
+  uint32_t bloom_locality;
+
+  bool purge_redundant_kvs_while_flush;
+
+  uint32_t min_partial_merge_operands;
+
+  bool disable_data_sync;
+
+  bool use_fsync;
+
+  CompressionType compression;
+
+  std::vector<CompressionType> compression_per_level;
+
+  CompressionOptions compression_opts;
+
+  Options::AccessHint access_hint_on_compaction_start;
+};
+
+}  // namespace rocksdb
--- a/include/rocksdb/iostats_context.h
+++ b/include/rocksdb/iostats_context.h
@ -27,7 +27,9 @@ struct IOStatsContext {
  uint64_t bytes_read;
 };

+#ifndef IOS_CROSS_COMPILE
 extern __thread IOStatsContext iostats_context;
+#endif  // IOS_CROSS_COMPILE

 }  // namespace rocksdb

--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -14,6 +14,7 @@
 #include <memory>
 #include <vector>
 #include <stdint.h>
+#include <unordered_map>

 #include "rocksdb/version.h"
 #include "rocksdb/universal_compaction.h"
@ -57,6 +58,7 @@ enum CompactionStyle : char {
  kCompactionStyleFIFO = 0x2,       // FIFO compaction style
 };

+
 struct CompactionOptionsFIFO {
  // once the total sum of table files reaches this, we will delete the oldest
  // table file
@ -97,7 +99,8 @@ struct ColumnFamilyOptions {

  // Use this if you don't need to keep the data sorted, i.e. you'll never use
  // an iterator, only Put() and Get() API calls
-  ColumnFamilyOptions* OptimizeForPointLookup();
+  ColumnFamilyOptions* OptimizeForPointLookup(
+      uint64_t block_cache_size_mb);

  // Default values for some parameters in ColumnFamilyOptions are not
  // optimized for heavy workloads and big datasets, which means you might
@ -206,34 +209,6 @@ struct ColumnFamilyOptions {
  // individual write buffers.  Default: 1
  int min_write_buffer_number_to_merge;

-  // Control over blocks (user data is stored in a set of blocks, and
-  // a block is the unit of reading from disk).
-
-  // If non-NULL use the specified cache for blocks.
-  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
-  // Default: nullptr
-  std::shared_ptr<Cache> block_cache;
-
-  // If non-NULL use the specified cache for compressed blocks.
-  // If NULL, rocksdb will not use a compressed block cache.
-  // Default: nullptr
-  std::shared_ptr<Cache> block_cache_compressed;
-
-  // Approximate size of user data packed per block.  Note that the
-  // block size specified here corresponds to uncompressed data.  The
-  // actual size of the unit read from disk may be smaller if
-  // compression is enabled.  This parameter can be changed dynamically.
-  //
-  // Default: 4K
-  size_t block_size;
-
-  // Number of keys between restart points for delta encoding of keys.
-  // This parameter can be changed dynamically.  Most clients should
-  // leave this parameter alone.
-  //
-  // Default: 16
-  int block_restart_interval;
-
  // Compress blocks using the specified compression algorithm.  This
  // parameter can be changed dynamically.
  //
@ -251,29 +226,17 @@ struct ColumnFamilyOptions {
  CompressionType compression;

  // Different levels can have different compression policies. There
-  // are cases where most lower levels would like to quick compression
-  // algorithm while the higher levels (which have more data) use
+  // are cases where most lower levels would like to use quick compression
+  // algorithms while the higher levels (which have more data) use
  // compression algorithms that have better compression but could
-  // be slower. This array, if non nullptr, should have an entry for
-  // each level of the database. This array, if non nullptr, overides the
-  // value specified in the previous field 'compression'. The caller is
-  // reponsible for allocating memory and initializing the values in it
-  // before invoking Open(). The caller is responsible for freeing this
-  // array and it could be freed anytime after the return from Open().
-  // This could have been a std::vector but that makes the equivalent
-  // java/C api hard to construct.
+  // be slower. This array, if non-empty, should have an entry for
+  // each level of the database; these override the value specified in
+  // the previous field 'compression'.
  std::vector<CompressionType> compression_per_level;

  // different options for compression algorithms
  CompressionOptions compression_opts;

-  // If non-nullptr, use the specified filter policy to reduce disk reads.
-  // Many applications will benefit from passing the result of
-  // NewBloomFilterPolicy() here.
-  //
-  // Default: nullptr
-  const FilterPolicy* filter_policy;
-
  // If non-nullptr, use the specified function to determine the
  // prefixes for keys.  These prefixes will be placed in the filter.
  // Depending on the workload, this can reduce the number of read-IOP
@ -290,12 +253,6 @@ struct ColumnFamilyOptions {
  // Default: nullptr
  std::shared_ptr<const SliceTransform> prefix_extractor;

-  // If true, place whole keys in the filter (not just prefixes).
-  // This must generally be true for gets to be efficient.
-  //
-  // Default: true
-  bool whole_key_filtering;
-
  // Number of levels for this database
  int num_levels;

@ -331,7 +288,7 @@ struct ColumnFamilyOptions {
  // and each file on level-3 will be 200MB.

  // by default target_file_size_base is 2MB.
-  int target_file_size_base;
+  uint64_t target_file_size_base;
  // by default target_file_size_multiplier is 1, which means
  // by default files in different levels will have similar size.
  int target_file_size_multiplier;
@ -375,18 +332,6 @@ struct ColumnFamilyOptions {
  // stop building a single file in a level->level+1 compaction.
  int max_grandparent_overlap_factor;

-  // We decided to remove seek compaction from RocksDB because:
-  // 1) It makes more sense for spinning disk workloads, while RocksDB is
-  // primarily designed for flash and memory,
-  // 2) It added some complexity to the important code-paths,
-  // 3) None of our internal customers were really using it.
-  //
-  // Since we removed seek compaction, this option is now obsolete.
-  // We left it here for backwards compatiblity (otherwise it would break the
-  // build), but we'll remove it at some point.
-  // Default: true
-  bool disable_seek_compaction;
-
  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
  // soft_rate_limit. This is ignored when == 0.0.
  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
@ -399,17 +344,9 @@ struct ColumnFamilyOptions {
  // Default: 0 (disabled)
  double hard_rate_limit;

-  // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then
-  // there is no limit.
-  // Default: 1000
+  // DEPRECATED -- this option is no longer used
  unsigned int rate_limit_delay_max_milliseconds;

-  // Disable block cache. If this is set to true,
-  // then no block cache should be used, and the block_cache should
-  // point to a nullptr object.
-  // Default: false
-  bool no_block_cache;
-
  // size of one block in arena memory allocation.
  // If <= 0, a proper value is automatically calculated (usually 1/10 of
  // writer_buffer_size).
@ -433,14 +370,6 @@ struct ColumnFamilyOptions {
  // Default: true
  bool purge_redundant_kvs_while_flush;

-  // This is used to close a block before it reaches the configured
-  // 'block_size'. If the percentage of free space in the current block is less
-  // than this specified number and adding a new record to the block will
-  // exceed the configured block size, then this block will be closed and the
-  // new record will be written to the next block.
-  // Default is 10.
-  int block_size_deviation;
-
  // The compaction style. Default: kCompactionStyleLevel
  CompactionStyle compaction_style;

@ -475,10 +404,24 @@ struct ColumnFamilyOptions {
  std::shared_ptr<MemTableRepFactory> memtable_factory;

  // This is a factory that provides TableFactory objects.
-  // Default: a factory that provides a default implementation of
-  // Table and TableBuilder.
+  // Default: a block-based table factory that provides a default
+  // implementation of TableBuilder and TableReader with default
+  // BlockBasedTableOptions.
  std::shared_ptr<TableFactory> table_factory;

+  // Block-based table related options are moved to BlockBasedTableOptions.
+  // Related options that were originally here but now moved include:
+  //   no_block_cache
+  //   block_cache
+  //   block_cache_compressed
+  //   block_size
+  //   block_size_deviation
+  //   block_restart_interval
+  //   filter_policy
+  //   whole_key_filtering
+  // If you'd like to customize some of these options, you will need to
+  // use NewBlockBasedTableFactory() to construct a new table factory.
+
  // This option allows user to to collect their own interested statistics of
  // the tables.
  // Default: empty vector -- no user-defined statistics collection will be
@ -669,7 +612,7 @@ struct DBOptions {
  // it does not use any locks to prevent concurrent updates.
  std::shared_ptr<Statistics> statistics;

-  // If true, then the contents of data files are not synced
+  // If true, then the contents of manifest and data files are not synced
  // to stable storage. Their contents remain in the OS buffers till the
  // OS decides to flush them. This option is good for bulk-loading
  // of data. Once the bulk-loading is complete, please issue a
@ -684,9 +627,6 @@ struct DBOptions {
  // Default: false
  bool use_fsync;

-  // This options is not used!!
-  int db_stats_log_interval;
-
  // A list of paths where SST files can be put into, with its target size.
  // Newer data is placed into paths specified earlier in the vector while
  // older data gradually moves to paths specified later in the vector.
@ -844,12 +784,13 @@ struct DBOptions {
  // Specify the file access pattern once a compaction is started.
  // It will be applied to all input files of a compaction.
  // Default: NORMAL
-  enum {
-    NONE,
-    NORMAL,
-    SEQUENTIAL,
-    WILLNEED
-  } access_hint_on_compaction_start;
+  enum AccessHint {
+      NONE,
+      NORMAL,
+      SEQUENTIAL,
+      WILLNEED
+  };
+  AccessHint access_hint_on_compaction_start;

  // Use adaptive mutex, which spins in the user space before resorting
  // to kernel. This could reduce context switch when the mutex is not
@ -958,6 +899,18 @@ struct ReadOptions {
  // ! DEPRECATED
  // const Slice* prefix;

+  // "iterate_upper_bound" defines the extent up to which the forward
+  // iterator can return entries. Once the bound is reached, Valid() will be
+  // false. "iterate_upper_bound" is exclusive, i.e. the bound value is
+  // not a valid entry.  If prefix_extractor is not null, the Seek target
+  // and iterate_upper_bound need to have the same prefix.
+  // This is because ordering is not guaranteed outside of prefix domain.
+  // There is no lower bound on the iterator. If needed, that can be easily
+  // implemented.
+  //
+  // Default: nullptr
+  const Slice* iterate_upper_bound;
+
  // Specify if this read request should process data that ALREADY
  // resides on a particular cache. If the required data is not
  // found at the specified cache, then Status::Incomplete is returned.
@ -972,18 +925,27 @@ struct ReadOptions {
  // Not supported in ROCKSDB_LITE mode!
  bool tailing;

+  // Enable a total order seek regardless of index format (e.g. hash index)
+  // used in the table. Some table formats (e.g. plain table) may not support
+  // this option.
+  bool total_order_seek;
+
  ReadOptions()
      : verify_checksums(true),
        fill_cache(true),
        snapshot(nullptr),
+        iterate_upper_bound(nullptr),
        read_tier(kReadAllTier),
-        tailing(false) {}
+        tailing(false),
+        total_order_seek(false) {}
  ReadOptions(bool cksum, bool cache)
      : verify_checksums(cksum),
        fill_cache(cache),
        snapshot(nullptr),
+        iterate_upper_bound(nullptr),
        read_tier(kReadAllTier),
-        tailing(false) {}
+        tailing(false),
+        total_order_seek(false) {}
 };

 // Options that control write operations
@ -1021,7 +983,17 @@ struct WriteOptions {
  // Default: 0
  uint64_t timeout_hint_us;

-  WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {}
+  // If true and if user is trying to write to column families that don't exist
+  // (they were dropped),  ignore the write (don't return an error). If there
+  // are multiple writes in a WriteBatch, other writes will succeed.
+  // Default: false
+  bool ignore_missing_column_families;
+
+  WriteOptions()
+      : sync(false),
+        disableWAL(false),
+        timeout_hint_us(0),
+        ignore_missing_column_families(false) {}
 };

 // Options that control flush operations
@ -1043,6 +1015,12 @@ extern Options GetOptions(size_t total_write_buffer_limit,
                          int read_amplification_threshold = 8,
                          int write_amplification_threshold = 32,
                          uint64_t target_db_size = 68719476736 /* 64GB */);
+
+bool GetOptionsFromStrings(
+    const Options& base_options,
+    const std::unordered_map<std::string, std::string>& options_map,
+    Options* new_options);
+
 }  // namespace rocksdb

 #endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@ -115,7 +115,7 @@ enum Tickers : uint32_t {
  // head of the writers queue.
  WRITE_DONE_BY_SELF,
  WRITE_DONE_BY_OTHER,
-  WRITE_TIMEDOUT,        // Number of writes ending up with timed-out.
+  WRITE_TIMEDOUT,       // Number of writes ending up with timed-out.
  WRITE_WITH_WAL,       // Number of Write calls that request WAL
  COMPACT_READ_BYTES,   // Bytes read during compaction
  COMPACT_WRITE_BYTES,  // Bytes written during compaction
@ -212,7 +212,6 @@ enum Histograms : uint32_t {
  READ_BLOCK_COMPACTION_MICROS,
  READ_BLOCK_GET_MICROS,
  WRITE_RAW_BLOCK_MICROS,
-
  STALL_L0_SLOWDOWN_COUNT,
  STALL_MEMTABLE_COMPACTION_COUNT,
  STALL_L0_NUM_FILES_COUNT,
@ -220,6 +219,7 @@ enum Histograms : uint32_t {
  SOFT_RATE_LIMIT_DELAY_COUNT,
  NUM_FILES_IN_SINGLE_COMPACTION,
  DB_SEEK,
+  WRITE_STALL,
  HISTOGRAM_ENUM_MAX,
 };

--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@ -96,7 +96,7 @@ class Status {
  // Returns true iff the status indicates Incomplete
  bool IsIncomplete() const { return code() == kIncomplete; }

-  // Returns true iff the status indicates Incomplete
+  // Returns true iff the status indicates Shutdown In progress
  bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }

  bool IsTimedOut() const { return code() == kTimedOut; }
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@ -23,6 +23,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/immutable_options.h"
 #include "rocksdb/status.h"

 namespace rocksdb {
@ -84,6 +85,46 @@ struct BlockBasedTableOptions {
  // protected with this checksum type. Old table files will still be readable,
  // even though they have different checksum type.
  ChecksumType checksum = kCRC32c;
+
+  // Disable block cache. If this is set to true,
+  // then no block cache should be used, and the block_cache should
+  // point to a nullptr object.
+  bool no_block_cache = false;
+
+  // If non-NULL use the specified cache for blocks.
+  // If NULL, rocksdb will automatically create and use an 8MB internal cache.
+  std::shared_ptr<Cache> block_cache = nullptr;
+
+  // If non-NULL use the specified cache for compressed blocks.
+  // If NULL, rocksdb will not use a compressed block cache.
+  std::shared_ptr<Cache> block_cache_compressed = nullptr;
+
+  // Approximate size of user data packed per block.  Note that the
+  // block size specified here corresponds to uncompressed data.  The
+  // actual size of the unit read from disk may be smaller if
+  // compression is enabled.  This parameter can be changed dynamically.
+  size_t block_size = 4 * 1024;
+
+  // This is used to close a block before it reaches the configured
+  // 'block_size'. If the percentage of free space in the current block is less
+  // than this specified number and adding a new record to the block will
+  // exceed the configured block size, then this block will be closed and the
+  // new record will be written to the next block.
+  int block_size_deviation = 10;
+
+  // Number of keys between restart points for delta encoding of keys.
+  // This parameter can be changed dynamically.  Most clients should
+  // leave this parameter alone.
+  int block_restart_interval = 16;
+
+  // If non-nullptr, use the specified filter policy to reduce disk reads.
+  // Many applications will benefit from passing the result of
+  // NewBloomFilterPolicy() here.
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // If true, place whole keys in the filter (not just prefixes).
+  // This must generally be true for gets to be efficient.
+  bool whole_key_filtering = true;
 };

 // Table Properties that are specific to block-based table properties.
@ -126,47 +167,49 @@ struct PlainTablePropertyNames {
 const uint32_t kPlainTableVariableLength = 0;

 struct PlainTableOptions {
-// @user_key_len: plain table has optimization for fix-sized keys, which can be
-//                specified via user_key_len.  Alternatively, you can pass
-//                `kPlainTableVariableLength` if your keys have variable
-//                lengths.
-uint32_t user_key_len = kPlainTableVariableLength;
+  // @user_key_len: plain table has optimization for fix-sized keys, which can
+  //                be specified via user_key_len.  Alternatively, you can pass
+  //                `kPlainTableVariableLength` if your keys have variable
+  //                lengths.
+  uint32_t user_key_len = kPlainTableVariableLength;

-// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You
-//                      may disable it by passing a zero.
-int bloom_bits_per_key = 10;
+  // @bloom_bits_per_key: the number of bits used for bloom filter per prefix.
+  //                      You may disable it by passing a zero.
+  int bloom_bits_per_key = 10;

-// @hash_table_ratio: the desired utilization of the hash table used for prefix
-//                    hashing. hash_table_ratio = number of prefixes / #buckets
-//                    in the hash table
-double hash_table_ratio = 0.75;
+  // @hash_table_ratio: the desired utilization of the hash table used for
+  //                    prefix hashing.
+  //                    hash_table_ratio = number of prefixes / #buckets in the
+  //                    hash table
+  double hash_table_ratio = 0.75;

-// @index_sparseness: inside each prefix, need to build one index record for how
-//                    many keys for binary search inside each hash bucket.
-//                    For encoding type kPrefix, the value will be used when
-//                    writing to determine an interval to rewrite the full key.
-//                    It will also be used as a suggestion and satisfied when
-//                    possible.
-size_t index_sparseness = 16;
+  // @index_sparseness: inside each prefix, need to build one index record for
+  //                    how many keys for binary search inside each hash bucket.
+  //                    For encoding type kPrefix, the value will be used when
+  //                    writing to determine an interval to rewrite the full
+  //                    key. It will also be used as a suggestion and satisfied
+  //                    when possible.
+  size_t index_sparseness = 16;

-// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
-//                      Otherwise from huge page TLB. The user needs to reserve
-//                      huge pages for it to be allocated, like:
-//                          sysctl -w vm.nr_hugepages=20
-//                      See linux doc Documentation/vm/hugetlbpage.txt
-size_t huge_page_tlb_size = 0;
+  // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
+  //                      Otherwise from huge page TLB. The user needs to
+  //                      reserve huge pages for it to be allocated, like:
+  //                          sysctl -w vm.nr_hugepages=20
+  //                      See linux doc Documentation/vm/hugetlbpage.txt
+  size_t huge_page_tlb_size = 0;

-// @encoding_type: how to encode the keys. See enum EncodingType above for
-//                 the choices. The value will determine how to encode keys
-//                 when writing to a new SST file. This value will be stored
-//                 inside the SST file which will be used when reading from the
-//                 file, which makes it possible for users to choose different
-//                 encoding type when reopening a DB. Files with different
-//                 encoding types can co-exist in the same DB and can be read.
-EncodingType encoding_type = kPlain;
+  // @encoding_type: how to encode the keys. See enum EncodingType above for
+  //                 the choices. The value will determine how to encode keys
+  //                 when writing to a new SST file. This value will be stored
+  //                 inside the SST file which will be used when reading from
+  //                 the file, which makes it possible for users to choose
+  //                 different encoding type when reopening a DB. Files with
+  //                 different encoding types can co-exist in the same DB and
+  //                 can be read.
+  EncodingType encoding_type = kPlain;

-// @full_scan_mode: mode for reading the whole file one record by one without
-//                  using the index.
+  // @full_scan_mode: mode for reading the whole file one record by one without
+  //                  using the index.
  bool full_scan_mode = false;

  // @store_index_in_file: compute plain table index and bloom filter during
@ -185,15 +228,59 @@ extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
                                              PlainTableOptions());

 struct CuckooTablePropertyNames {
+  // The key that is used to fill empty buckets.
  static const std::string kEmptyKey;
+  // Fixed length of value.
  static const std::string kValueLength;
-  static const std::string kNumHashTable;
-  static const std::string kMaxNumBuckets;
+  // Number of hash functions used in Cuckoo Hash.
+  static const std::string kNumHashFunc;
+  // It denotes the number of buckets in a Cuckoo Block. Given a key and a
+  // particular hash function, a Cuckoo Block is a set of consecutive buckets,
+  // where starting bucket id is given by the hash function on the key. In case
+  // of a collision during inserting the key, the builder tries to insert the
+  // key in other locations of the cuckoo block before using the next hash
+  // function. This reduces cache miss during read operation in case of
+  // collision.
+  static const std::string kCuckooBlockSize;
+  // Size of the hash table. Use this number to compute the modulo of the hash
+  // function. The actual number of buckets will be kHashTableSize +
+  // kCuckooBlockSize - 1. The last kCuckooBlockSize - 1 buckets are used to
+  // accommodate the Cuckoo Block at the end of the hash table, due to the
+  // cache-friendly implementation.
+  static const std::string kHashTableSize;
+  // Denotes if the key stored in the file is Internal Key (if false)
+  // or User Key only (if true).
  static const std::string kIsLastLevel;
+  // Indicate if using identity function for the first hash function.
+  static const std::string kIdentityAsFirstHash;
 };

-extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9,
-    uint32_t max_search_depth = 100);
+struct CuckooTableOptions {
+  // Determines the utilization of hash tables. Smaller values
+  // result in larger hash tables with fewer collisions.
+  double hash_table_ratio = 0.9;
+  // A property used by builder to determine the depth to go to
+  // to search for a path to displace elements in case of
+  // collision. See Builder.MakeSpaceForKey method. Higher
+  // values result in more efficient hash tables with fewer
+  // lookups but take more time to build.
+  uint32_t max_search_depth = 100;
+  // In case of collision while inserting, the builder
+  // attempts to insert in the next cuckoo_block_size
+  // locations before skipping over to the next Cuckoo hash
+  // function. This makes lookups more cache friendly in case
+  // of collisions.
+  uint32_t cuckoo_block_size = 5;
+  // If this option is enabled, user key is treated as uint64_t and its value
+  // is used as hash value directly. This option changes builder's behavior.
+  // Readers ignore this option and behave according to what is specified in
+  // the table property.
+  bool identity_as_first_hash = false;
+};
+
+// Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
+extern TableFactory* NewCuckooTableFactory(
+    const CuckooTableOptions& table_options = CuckooTableOptions());

 #endif  // ROCKSDB_LITE

@ -220,14 +307,15 @@ class TableFactory {
  //     and cache the table object returned.
  // (1) SstFileReader (for SST Dump) opens the table and dump the table
  //     contents using the iterator of the table.
-  // options and soptions are options. options is the general options.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // EnvOptions is a subset of Options that will be used by Env.
  // Multiple configurations can be accessed from there, including but not
  // limited to block cache and key comparators.
  // file is a file handler to handle the file for the table
  // file_size is the physical file size of the file
  // table_reader is the output table reader
  virtual Status NewTableReader(
-      const Options& options, const EnvOptions& soptions,
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
      const InternalKeyComparator& internal_comparator,
      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
      unique_ptr<TableReader>* table_reader) const = 0;
@ -245,14 +333,27 @@ class TableFactory {
  // (4) When running Repairer, it creates a table builder to convert logs to
  //     SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
  //
-  // options is the general options. Multiple configured can be acceseed from
-  // there, including and not limited to compression options.
-  // file is a handle of a writable file. It is the caller's responsibility to
-  // keep the file open and close the file after closing the table builder.
-  // compression_type is the compression type to use in this table.
+  // ImmutableCFOptions is a subset of Options that can not be altered.
+  // Multiple configurations can be accessed from there, including but not
+  // limited to compression options. file is a handle of a writable file.
+  // It is the caller's responsibility to keep the file open and close the file
+  // after closing the table builder. compression_type is the compression type
+  // to use in this table.
  virtual TableBuilder* NewTableBuilder(
-      const Options& options, const InternalKeyComparator& internal_comparator,
-      WritableFile* file, CompressionType compression_type) const = 0;
+      const ImmutableCFOptions& ioptions,
+      const InternalKeyComparator& internal_comparator,
+      WritableFile* file, const CompressionType compression_type,
+      const CompressionOptions& compression_opts) const = 0;
+
+  // Sanitizes the specified DB Options.
+  //
+  // If the function cannot find a way to sanitize the input DB Options,
+  // a non-ok Status will be returned.
+  virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0;
+
+  // Return a string that contains printable format of table configurations.
+  // RocksDB prints configurations at DB Open().
+  virtual std::string GetPrintableTableOptions() const = 0;
 };

 #ifndef ROCKSDB_LITE
--- a/include/rocksdb/utilities/backupable_db.h
+++ b/include/rocksdb/utilities/backupable_db.h
@ -10,7 +10,10 @@
 #pragma once
 #ifndef ROCKSDB_LITE

+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <string>
 #include <map>
@ -127,9 +130,41 @@ struct BackupInfo {
  int64_t timestamp;
  uint64_t size;

+  uint32_t number_files;
+
  BackupInfo() {}
-  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
-      : backup_id(_backup_id), timestamp(_timestamp), size(_size) {}
+
+  BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
+             uint32_t _number_files)
+      : backup_id(_backup_id), timestamp(_timestamp), size(_size),
+        number_files(_number_files) {}
+};
+
+class BackupStatistics {
+ public:
+  BackupStatistics() {
+    number_success_backup = 0;
+    number_fail_backup = 0;
+  }
+
+  BackupStatistics(uint32_t _number_success_backup,
+                   uint32_t _number_fail_backup)
+      : number_success_backup(_number_success_backup),
+        number_fail_backup(_number_fail_backup) {}
+
+  ~BackupStatistics() {}
+
+  void IncrementNumberSuccessBackup();
+  void IncrementNumberFailBackup();
+
+  uint32_t GetNumberSuccessBackup() const;
+  uint32_t GetNumberFailBackup() const;
+
+  std::string ToString() const;
+
+ private:
+  uint32_t number_success_backup;
+  uint32_t number_fail_backup;
 };

 class BackupEngineReadOnly {
--- a/include/rocksdb/utilities/write_batch_with_index.h
+++ b/include/rocksdb/utilities/write_batch_with_index.h
@ -0,0 +1,105 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+
+#pragma once
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+
+namespace rocksdb {
+
+class ColumnFamilyHandle;
+struct SliceParts;
+class Comparator;
+
+enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord };
+
+// An entry for a Put, Merge or Delete operation in a write batch. Used in
+// WBWIIterator.
+struct WriteEntry {
+  WriteType type;
+  Slice key;
+  Slice value;
+};
+
+// Iterator of one column family out of a WriteBatchWithIndex.
+class WBWIIterator {
+ public:
+  virtual ~WBWIIterator() {}
+
+  virtual bool Valid() const = 0;
+
+  virtual void Seek(const Slice& key) = 0;
+
+  virtual void Next() = 0;
+
+  virtual const WriteEntry& Entry() const = 0;
+
+  virtual Status status() const = 0;
+};
+
+// A WriteBatchWithIndex with a binary searchable index built for all the keys
+// inserted.
+// In Put(), Merge() or Delete(), the same function of the wrapped WriteBatch
+// will be called. At the same time, indexes will be built.
+// By calling GetWriteBatch(), a user will get the WriteBatch for the data
+// they inserted, which can be used for DB::Write().
+// A user can call NewIterator() to create an iterator.
+class WriteBatchWithIndex {
+ public:
+  // backup_index_comparator: the backup comparator used to compare keys
+  // within the same column family. If column family is not given in the
+  // interface, or we can't find a column family from the column family handle
+  // passed in, backup_index_comparator will be used for the column family.
+  // reserved_bytes: reserved bytes in underlying WriteBatch
+  explicit WriteBatchWithIndex(
+      const Comparator* backup_index_comparator = BytewiseComparator(),
+      size_t reserved_bytes = 0);
+  virtual ~WriteBatchWithIndex();
+
+  WriteBatch* GetWriteBatch();
+
+  virtual void Put(ColumnFamilyHandle* column_family, const Slice& key,
+                   const Slice& value);
+
+  virtual void Put(const Slice& key, const Slice& value);
+
+  virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value);
+
+  virtual void Merge(const Slice& key, const Slice& value);
+
+  virtual void PutLogData(const Slice& blob);
+
+  virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key);
+  virtual void Delete(const Slice& key);
+
+  virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
+
+  virtual void Delete(const SliceParts& key);
+
+  // Create an iterator of a column family. User can call iterator.Seek() to
+  // search to the next entry of or after a key. Keys will be iterated in the
+  // order given by index_comparator. For multiple updates on the same key,
+  // each update will be returned as a separate entry, in the order of update
+  // time.
+  virtual WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
+  // Create an iterator of the default column family.
+  virtual WBWIIterator* NewIterator();
+
+ private:
+  struct Rep;
+  Rep* rep;
+};
+
+}  // namespace rocksdb
--- a/include/rocksdb/write_batch.h
+++ b/include/rocksdb/write_batch.h
@ -152,6 +152,7 @@ class WriteBatch {
 private:
  friend class WriteBatchInternal;

+ protected:
  std::string rep_;  // See comment in write_batch.cc for the format of rep_

  // Intentionally copyable
--- a/java/Makefile
+++ b/java/Makefile
@ -1,4 +1,4 @@
-NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv
+NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig

 NATIVE_INCLUDE = ./include
 ROCKSDB_JAR = rocksdbjni.jar
--- a/java/RocksDBSample.java
+++ b/java/RocksDBSample.java
@ -35,16 +35,11 @@ public class RocksDBSample {
      assert(db == null);
    }

-    Filter filter = new BloomFilter(10);
    options.setCreateIfMissing(true)
        .createStatistics()
        .setWriteBufferSize(8 * SizeUnit.KB)
        .setMaxWriteBufferNumber(3)
-        .setDisableSeekCompaction(true)
-        .setBlockSize(64 * SizeUnit.KB)
        .setMaxBackgroundCompactions(10)
-        .setFilter(filter)
-        .setCacheNumShardBits(6)
        .setCompressionType(CompressionType.SNAPPY_COMPRESSION)
        .setCompactionStyle(CompactionStyle.UNIVERSAL);
    Statistics stats = options.statisticsPtr();
@ -52,10 +47,7 @@ public class RocksDBSample {
    assert(options.createIfMissing() == true);
    assert(options.writeBufferSize() == 8 * SizeUnit.KB);
    assert(options.maxWriteBufferNumber() == 3);
-    assert(options.disableSeekCompaction() == true);
-    assert(options.blockSize() == 64 * SizeUnit.KB);
    assert(options.maxBackgroundCompactions() == 10);
-    assert(options.cacheNumShardBits() == 6);
    assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION);
    assert(options.compactionStyle() == CompactionStyle.UNIVERSAL);

@ -80,7 +72,22 @@ public class RocksDBSample {
    assert(options.memTableFactoryName().equals("SkipListFactory"));

    options.setTableFormatConfig(new PlainTableConfig());
+    // Plain-Table requires mmap read
+    options.setAllowMmapReads(true);
    assert(options.tableFactoryName().equals("PlainTable"));
+    
+    options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000,
+            10000, 10));
+    options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000));
+
+    BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+    table_options.setBlockCacheSize(64 * SizeUnit.KB)
+                 .setFilterBitsPerKey(10)
+                 .setCacheNumShardBits(6);
+    assert(table_options.blockCacheSize() == 64 * SizeUnit.KB);
+    assert(table_options.cacheNumShardBits() == 6);
+    options.setTableFormatConfig(table_options);
+    assert(options.tableFactoryName().equals("BlockBasedTable"));

    try {
      db = RocksDB.open(options, db_path_not_found);
@ -120,6 +127,29 @@ public class RocksDBSample {
        System.out.println("");
      }

+      // write batch test
+      WriteOptions writeOpt = new WriteOptions();
+      for (int i = 10; i <= 19; ++i) {
+        WriteBatch batch = new WriteBatch();
+        for (int j = 10; j <= 19; ++j) {
+          batch.put(String.format("%dx%d", i, j).getBytes(),
+                    String.format("%d", i * j).getBytes());
+        }
+        db.write(writeOpt, batch);
+        batch.dispose();
+      }
+      for (int i = 10; i <= 19; ++i) {
+        for (int j = 10; j <= 19; ++j) {
+          assert(new String(
+              db.get(String.format("%dx%d", i, j).getBytes())).equals(
+                  String.format("%d", i * j)));
+          System.out.format("%s ", new String(db.get(
+              String.format("%dx%d", i, j).getBytes())));
+        }
+        System.out.println("");
+      }
+      writeOpt.dispose();
+
      value = db.get("1x1".getBytes());
      assert(value != null);
      value = db.get("world".getBytes());
@ -254,6 +284,5 @@ public class RocksDBSample {
    // be sure to dispose c++ pointers
    options.dispose();
    readOptions.dispose();
-    filter.dispose();
  }
 }
--- a/java/org/rocksdb/BlockBasedTableConfig.java
+++ b/java/org/rocksdb/BlockBasedTableConfig.java
@ -0,0 +1,210 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * The config for block based table sst format.
+ *
+ * BlockBasedTable is RocksDB's default SST file format.
+ */
+public class BlockBasedTableConfig extends TableFormatConfig {
+
+  public BlockBasedTableConfig() {
+    noBlockCache_ = false;
+    blockCacheSize_ = 8 * 1024 * 1024;
+    blockSize_ =  4 * 1024;
+    blockSizeDeviation_ =10;
+    blockRestartInterval_ =16;
+    wholeKeyFiltering_ = true;
+    bitsPerKey_ = 0;
+  }
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a nullptr object.
+   * Default: false
+   *
+   * @param noBlockCache whether to disable the block cache
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setNoBlockCache(boolean noBlockCache) {
+    noBlockCache_ = noBlockCache;
+    return this;
+  }
+
+  /**
+   * @return if block cache is disabled
+   */
+  public boolean noBlockCache() {
+    return noBlockCache_;
+  }
+
+  /**
+   * Set the amount of cache in bytes that will be used by RocksDB.
+   * If cacheSize is non-positive, then cache will not be used.
+   * DEFAULT: 8M
+   *
+   * @param blockCacheSize block cache size in bytes
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockCacheSize(long blockCacheSize) {
+    blockCacheSize_ = blockCacheSize;
+    return this;
+  }
+
+  /**
+   * @return block cache size in bytes
+   */
+  public long blockCacheSize() {
+    return blockCacheSize_;
+  }
+
+  /**
+   * Controls the number of shards for the block cache.
+   * This is applied only if cacheSize is set to non-negative.
+   *
+   * @param numShardBits the number of shard bits.  The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) {
+    numShardBits_ = numShardBits;
+    return this;
+  }
+
+  /**
+   * Returns the number of shard bits used in the block cache.
+   * The resulting number of shards would be 2 ^ (returned value).
+   * Any negative number means use default settings.
+   *
+   * @return the number of shard bits used in the block cache.
+   */
+  public int cacheNumShardBits() {
+    return numShardBits_;
+  }
+
+  /**
+   * Approximate size of user data packed per block.  Note that the
+   * block size specified here corresponds to uncompressed data.  The
+   * actual size of the unit read from disk may be smaller if
+   * compression is enabled.  This parameter can be changed dynamically.
+   * Default: 4K
+   *
+   * @param blockSize block size in bytes
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSize(long blockSize) {
+    blockSize_ = blockSize;
+    return this;
+  }
+
+  /**
+   * @return block size in bytes
+   */
+  public long blockSize() {
+    return blockSize_;
+  }
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is less
+   * than this specified number and adding a new record to the block will
+   * exceed the configured block size, then this block will be closed and the
+   * new record will be written to the next block.
+   * Default is 10.
+   *
+   * @param blockSizeDeviation the deviation to block size allowed
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSizeDeviation(int blockSizeDeviation) {
+    blockSizeDeviation_ = blockSizeDeviation;
+    return this;
+  }
+
+  /**
+   * @return the block size deviation.
+   */
+  public int blockSizeDeviation() {
+    return blockSizeDeviation_;
+  }
+
+  /**
+   * Set block restart interval
+   *
+   * @param restartInterval block restart interval.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockRestartInterval(int restartInterval) {
+    blockRestartInterval_ = restartInterval;
+    return this;
+  }
+
+  /**
+   * @return block restart interval
+   */
+  public int blockRestartInterval() {
+    return blockRestartInterval_;
+  }
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @param wholeKeyFiltering if enable whole key filtering
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setWholeKeyFiltering(boolean wholeKeyFiltering) {
+    wholeKeyFiltering_ = wholeKeyFiltering;
+    return this;
+  }
+
+  /**
+   * @return if whole key filtering is enabled
+   */
+  public boolean wholeKeyFiltering() {
+    return wholeKeyFiltering_;
+  }
+
+  /**
+   * Set the number of bits per key used by the filter policy that reduces
+   * disk reads.
+   *
+   * The value is stored in this config and handed to the native table
+   * factory when the table factory handle is created. The constructor
+   * initializes it to 0.
+   *
+   * @param bitsPerKey number of bits per key used by the filter
+   *     policy.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setFilterBitsPerKey(int bitsPerKey) {
+    bitsPerKey_ = bitsPerKey;
+    return this;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_,
+        blockSize_, blockSizeDeviation_, blockRestartInterval_,
+        wholeKeyFiltering_, bitsPerKey_);
+  }
+
+  private native long newTableFactoryHandle(
+      boolean noBlockCache, long blockCacheSize, int numShardbits,
+      long blockSize, int blockSizeDeviation, int blockRestartInterval,
+      boolean wholeKeyFiltering, int bitsPerKey);
+
+  private boolean noBlockCache_;
+  private long blockCacheSize_;
+  private int numShardBits_;
+  private long shard;
+  private long blockSize_;
+  private int blockSizeDeviation_;
+  private int blockRestartInterval_;
+  private boolean wholeKeyFiltering_;
+  private int bitsPerKey_;
+}
--- a/java/org/rocksdb/GenericRateLimiterConfig.java
+++ b/java/org/rocksdb/GenericRateLimiterConfig.java
@ -0,0 +1,36 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * Config for rate limiter, which is used to control write rate of flush and
+ * compaction.
+ */
+public class GenericRateLimiterConfig extends RateLimiterConfig {
+  private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000);
+  private static final int DEFAULT_FAIRNESS = 10;
+    
+  public GenericRateLimiterConfig(long rateBytesPerSecond,
+      long refillPeriodMicros, int fairness) {
+    rateBytesPerSecond_ = rateBytesPerSecond;
+    refillPeriodMicros_ = refillPeriodMicros;
+    fairness_ = fairness;
+  }
+  
+  public GenericRateLimiterConfig(long rateBytesPerSecond) {
+    this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS);
+  }
+  
+  @Override protected long newRateLimiterHandle() {
+    return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_,
+        fairness_);
+  }
+    
+  private native long newRateLimiterHandle(long rateBytesPerSecond,
+      long refillPeriodMicros, int fairness);
+  private final long rateBytesPerSecond_;
+  private final long refillPeriodMicros_;
+  private final int fairness_;
+}
--- a/java/org/rocksdb/Options.java
+++ b/java/org/rocksdb/Options.java
@ -139,135 +139,6 @@ public class Options extends RocksObject {
    return maxWriteBufferNumber(nativeHandle_);
  }

-  /*
-   * Approximate size of user data packed per block.  Note that the
-   * block size specified here corresponds to uncompressed data.  The
-   * actual size of the unit read from disk may be smaller if
-   * compression is enabled.  This parameter can be changed dynamically.
-   *
-   * Default: 4K
-   *
-   * @param blockSize the size of each block in bytes.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setBlockSize(long blockSize) {
-    assert(isInitialized());
-    setBlockSize(nativeHandle_, blockSize);
-    return this;
-  }
-
-  /*
-   * Returns the size of a block in bytes.
-   *
-   * @return block size.
-   * @see setBlockSize()
-   */
-  public long blockSize() {
-    assert(isInitialized());
-    return blockSize(nativeHandle_);
-  }
-
-  /**
-   * Use the specified filter policy to reduce disk reads.
-   *
-   * Filter should not be disposed before options instances using this filter is
-   * disposed. If dispose() function is not called, then filter object will be
-   * GC'd automatically.
-   * 
-   * Filter instance can be re-used in multiple options instances. 
-   *
-   * @param Filter policy java instance.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setFilter(Filter filter) {
-    assert(isInitialized());
-    setFilterHandle(nativeHandle_, filter.nativeHandle_);
-    filter_ = filter;
-    return this;
-  }
-  private native void setFilterHandle(long optHandle, long filterHandle);
-
-  /*
-   * Disable compaction triggered by seek.
-   * With bloomfilter and fast storage, a miss on one level
-   * is very cheap if the file handle is cached in table cache
-   * (which is true if max_open_files is large).
-   * Default: true
-   *
-   * @param disableSeekCompaction a boolean value to specify whether
-   *     to disable seek compaction.
-   * @return the instance of the current Options.
-   * @see RocksDB.open()
-   */
-  public Options setDisableSeekCompaction(boolean disableSeekCompaction) {
-    assert(isInitialized());
-    setDisableSeekCompaction(nativeHandle_, disableSeekCompaction);
-    return this;
-  }
-
-  /*
-   * Returns true if disable seek compaction is set to true.
-   *
-   * @return true if disable seek compaction is set to true.
-   * @see setDisableSeekCompaction()
-   */
-  public boolean disableSeekCompaction() {
-    assert(isInitialized());
-    return disableSeekCompaction(nativeHandle_);
-  }
-
-  /**
-   * Set the amount of cache in bytes that will be used by RocksDB.
-   * If cacheSize is non-positive, then cache will not be used.
-   *
-   * DEFAULT: 8M
-   * @see setCacheNumShardBits()
-   */
-  public Options setCacheSize(long cacheSize) {
-    cacheSize_ = cacheSize;
-    return this;
-  }
-
-  /**
-   * @return the amount of cache in bytes that will be used by RocksDB.
-   *
-   * @see cacheNumShardBits()
-   */
-  public long cacheSize() {
-    return cacheSize_;
-  }
-
-  /**
-   * Controls the number of shards for the block cache.
-   * This is applied only if cacheSize is set to non-negative.
-   *
-   * @param numShardBits the number of shard bits.  The resulting
-   *     number of shards would be 2 ^ numShardBits.  Any negative
-   *     number means use default settings."
-   * @return the reference to the current option.
-   *
-   * @see setCacheSize()
-   */
-  public Options setCacheNumShardBits(int numShardBits) {
-    numShardBits_ = numShardBits;
-    return this;
-  }
-
-  /**
-   * Returns the number of shard bits used in the block cache.
-   * The resulting number of shards would be 2 ^ (returned value).
-   * Any negative number means use default settings.
-   *
-   * @return the number of shard bits used in the block cache.
-   *
-   * @see cacheSize()
-   */
-  public int cacheNumShardBits() {
-    return numShardBits_;
-  }
-
  /**
   * If true, an error will be thrown during RocksDB.open() if the
   * database already exists.
@ -437,40 +308,6 @@ public class Options extends RocksObject {
  }
  private native void setUseFsync(long handle, boolean useFsync);

-  /**
-   * The time interval in seconds between each two consecutive stats logs.
-   * This number controls how often a new scribe log about
-   * db deploy stats is written out.
-   * -1 indicates no logging at all.
-   *
-   * @return the time interval in seconds between each two consecutive
-   *     stats logs.
-   */
-  public int dbStatsLogInterval() {
-    assert(isInitialized());
-    return dbStatsLogInterval(nativeHandle_);
-  }
-  private native int dbStatsLogInterval(long handle);
-
-  /**
-   * The time interval in seconds between each two consecutive stats logs.
-   * This number controls how often a new scribe log about
-   * db deploy stats is written out.
-   * -1 indicates no logging at all.
-   * Default value is 1800 (half an hour).
-   *
-   * @param dbStatsLogInterval the time interval in seconds between each
-   *     two consecutive stats logs.
-   * @return the reference to the current option.
-   */
-  public Options setDbStatsLogInterval(int dbStatsLogInterval) {
-    assert(isInitialized());
-    setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval);
-    return this;
-  }
-  private native void setDbStatsLogInterval(
-      long handle, int dbStatsLogInterval);
-
  /**
   * Returns the directory of info log.
   *
@ -1270,6 +1107,19 @@ public class Options extends RocksObject {
    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
    return this;
  }
+  
+  /**
+   * Use to control write rate of flush and compaction. Flush has higher
+   * priority than compaction. Rate limiting is disabled if nullptr.
+   * Default: nullptr
+   *
+   * @param config rate limiter config.
+   * @return the instance of the current Options.
+   */
+  public Options setRateLimiterConfig(RateLimiterConfig config) {
+    setRateLimiter(nativeHandle_, config.newRateLimiterHandle());
+    return this;
+  }

  /**
   * Returns the name of the current mem table representation.
@ -1347,26 +1197,26 @@ public class Options extends RocksObject {
  }
  private native void setBlockRestartInterval(
      long handle, int blockRestartInterval);
-      
+
  /**
   * Compress blocks using the specified compression algorithm.  This
     parameter can be changed dynamically.
-   * 
+   *
   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
-   * 
+   *
   * @return Compression type.
-   */ 
+   */
  public CompressionType compressionType() {
    return CompressionType.values()[compressionType(nativeHandle_)];
  }
  private native byte compressionType(long handle);
-      
+
  /**
   * Compress blocks using the specified compression algorithm.  This
     parameter can be changed dynamically.
-   * 
+   *
   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
-   * 
+   *
   * @param compressionType Compression Type.
   * @return the reference to the current option.
   */
@ -1375,22 +1225,22 @@ public class Options extends RocksObject {
    return this;
  }
  private native void setCompressionType(long handle, byte compressionType);
-      
+
   /**
   * Compaction style for DB.
-   * 
+   *
   * @return Compaction style.
-   */ 
+   */
  public CompactionStyle compactionStyle() {
    return CompactionStyle.values()[compactionStyle(nativeHandle_)];
  }
  private native byte compactionStyle(long handle);
-      
+
  /**
   * Set compaction style for DB.
-   * 
+   *
   * Default: LEVEL.
-   * 
+   *
   * @param compactionStyle Compaction style.
   * @return the reference to the current option.
   */
@ -1400,33 +1250,6 @@ public class Options extends RocksObject {
  }
  private native void setCompactionStyle(long handle, byte compactionStyle);

-  /**
-   * If true, place whole keys in the filter (not just prefixes).
-   * This must generally be true for gets to be efficient.
-   * Default: true
-   *
-   * @return if true, then whole-key-filtering is on.
-   */
-  public boolean wholeKeyFiltering() {
-    return wholeKeyFiltering(nativeHandle_);
-  }
-  private native boolean wholeKeyFiltering(long handle);
-
-  /**
-   * If true, place whole keys in the filter (not just prefixes).
-   * This must generally be true for gets to be efficient.
-   * Default: true
-   *
-   * @param wholeKeyFiltering if true, then whole-key-filtering is on.
-   * @return the reference to the current option.
-   */
-  public Options setWholeKeyFiltering(boolean wholeKeyFiltering) {
-    setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering);
-    return this;
-  }
-  private native void setWholeKeyFiltering(
-      long handle, boolean wholeKeyFiltering);
-
  /**
   * If level-styled compaction is used, then this number determines
   * the total number of levels.
@ -1900,35 +1723,6 @@ public class Options extends RocksObject {
  private native void setRateLimitDelayMaxMilliseconds(
      long handle, int rateLimitDelayMaxMilliseconds);

-  /**
-   * Disable block cache. If this is set to true,
-   * then no block cache should be used, and the block_cache should
-   * point to a nullptr object.
-   * Default: false
-   *
-   * @return true if block cache is disabled.
-   */
-  public boolean noBlockCache() {
-    return noBlockCache(nativeHandle_);
-  }
-  private native boolean noBlockCache(long handle);
-
-  /**
-   * Disable block cache. If this is set to true,
-   * then no block cache should be used, and the block_cache should
-   * point to a nullptr object.
-   * Default: false
-   *
-   * @param noBlockCache true if block-cache is disabled.
-   * @return the reference to the current option.
-   */
-  public Options setNoBlockCache(boolean noBlockCache) {
-    setNoBlockCache(nativeHandle_, noBlockCache);
-    return this;
-  }
-  private native void setNoBlockCache(
-      long handle, boolean noBlockCache);
-
  /**
   * The size of one block in arena memory allocation.
   * If <= 0, a proper value is automatically calculated (usually 1/10 of
@ -2026,39 +1820,6 @@ public class Options extends RocksObject {
  private native void setPurgeRedundantKvsWhileFlush(
      long handle, boolean purgeRedundantKvsWhileFlush);

-  /**
-   * This is used to close a block before it reaches the configured
-   * 'block_size'. If the percentage of free space in the current block is less
-   * than this specified number and adding a new record to the block will
-   * exceed the configured block size, then this block will be closed and the
-   * new record will be written to the next block.
-   * Default is 10.
-   *
-   * @return the target block size
-   */
-  public int blockSizeDeviation() {
-    return blockSizeDeviation(nativeHandle_);
-  }
-  private native int blockSizeDeviation(long handle);
-
-  /**
-   * This is used to close a block before it reaches the configured
-   * 'block_size'. If the percentage of free space in the current block is less
-   * than this specified number and adding a new record to the block will
-   * exceed the configured block size, then this block will be closed and the
-   * new record will be written to the next block.
-   * Default is 10.
-   *
-   * @param blockSizeDeviation the target block size
-   * @return the reference to the current option.
-   */
-  public Options setBlockSizeDeviation(int blockSizeDeviation) {
-    setBlockSizeDeviation(nativeHandle_, blockSizeDeviation);
-    return this;
-  }
-  private native void setBlockSizeDeviation(
-      long handle, int blockSizeDeviation);
-
  /**
   * If true, compaction will verify checksum on every read that happens
   * as part of compaction
@ -2440,11 +2201,6 @@ public class Options extends RocksObject {
  private native void setMaxWriteBufferNumber(
      long handle, int maxWriteBufferNumber);
  private native int maxWriteBufferNumber(long handle);
-  private native void setBlockSize(long handle, long blockSize);
-  private native long blockSize(long handle);
-  private native void setDisableSeekCompaction(
-      long handle, boolean disableSeekCompaction);
-  private native boolean disableSeekCompaction(long handle);
  private native void setMaxBackgroundCompactions(
      long handle, int maxBackgroundCompactions);
  private native int maxBackgroundCompactions(long handle);
@ -2452,6 +2208,8 @@ public class Options extends RocksObject {
  private native long statisticsPtr(long optHandle);

  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native void setRateLimiter(long handle,
+      long rateLimiterHandle);
  private native String memTableFactoryName(long handle);

  private native void setTableFactory(long handle, long factoryHandle);
@ -2462,6 +2220,5 @@ public class Options extends RocksObject {

  long cacheSize_;
  int numShardBits_;
-  Filter filter_;
  RocksEnv env_;
 }
--- a/java/org/rocksdb/RateLimiterConfig.java
+++ b/java/org/rocksdb/RateLimiterConfig.java
@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * Config for rate limiter, which is used to control write rate of flush and
+ * compaction.
+ */
+public abstract class RateLimiterConfig {
+  /**
+   * This function should only be called by Options.setRateLimiterConfig(),
+   * which will create a c++ shared-pointer to the c++ RateLimiter
+   * that is associated with the Java RateLimiterConfig.
+   *
+   * @see Options#setRateLimiterConfig(RateLimiterConfig)
+   */
+  abstract protected long newRateLimiterHandle();
+}
--- a/java/org/rocksdb/RocksDB.java
+++ b/java/org/rocksdb/RocksDB.java
@ -114,11 +114,11 @@ public class RocksDB extends RocksObject {
  /**
   * The factory constructor of RocksDB that opens a RocksDB instance given
   * the path to the database using the specified options and db path.
-   * 
+   *
   * Options instance *should* not be disposed before all DBs using this options
   * instance have been closed. If user doesn't call options dispose explicitly,
   * then this options instance will be GC'd automatically.
-   * 
+   *
   * Options instance can be re-used to open multiple DBs if DB statistics is
   * not used. If DB statistics are required, then its recommended to open DB
   * with new Options instance as underlying native statistics instance does not
@ -130,13 +130,12 @@ public class RocksDB extends RocksObject {
    // in RocksDB can prevent Java to GC during the life-time of
    // the currently-created RocksDB.
    RocksDB db = new RocksDB();
-    db.open(options.nativeHandle_, options.cacheSize_,
-            options.numShardBits_, path);
-    
+    db.open(options.nativeHandle_, path);
+
    db.storeOptionsInstance(options);
    return db;
  }
-  
+
  private void storeOptionsInstance(Options options) {
    options_ = options;
  }
@ -349,8 +348,7 @@ public class RocksDB extends RocksObject {

  // native methods
  protected native void open(
-      long optionsHandle, long cacheSize, int numShardBits,
-      String path) throws RocksDBException;
+      long optionsHandle, String path) throws RocksDBException;
  protected native void put(
      long handle, byte[] key, int keyLen,
      byte[] value, int valueLen) throws RocksDBException;
--- a/java/org/rocksdb/benchmark/DbBenchmark.java
+++ b/java/org/rocksdb/benchmark/DbBenchmark.java
@ -255,7 +255,7 @@ public class DbBenchmark {
            for (long j = 0; j < entriesPerBatch_; j++) {
              getKey(key, i + j, keyRange_);
              DbBenchmark.this.gen_.generate(value);
-              db_.put(writeOpt_, key, value);
+              batch.put(key, value);
              stats_.finishedSingleOp(keySize_ + valueSize_);
            }
            db_.write(writeOpt_, batch);
@ -446,7 +446,6 @@ public class DbBenchmark {
    randSeed_ = (Long) flags.get(Flag.seed);
    databaseDir_ = (String) flags.get(Flag.db);
    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
-    cacheSize_ = (Long) flags.get(Flag.cache_size);
    memtable_ = (String) flags.get(Flag.memtablerep);
    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
@ -491,7 +490,6 @@ public class DbBenchmark {
  }

  private void prepareOptions(Options options) {
-    options.setCacheSize(cacheSize_);
    if (!useExisting_) {
      options.setCreateIfMissing(true);
    } else {
@ -521,6 +519,13 @@ public class DbBenchmark {
    if (usePlainTable_) {
      options.setTableFormatConfig(
          new PlainTableConfig().setKeySize(keySize_));
+    } else {
+      BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+      table_options.setBlockSize((Long)flags_.get(Flag.block_size))
+                   .setBlockCacheSize((Long)flags_.get(Flag.cache_size))
+                   .setFilterBitsPerKey((Integer)flags_.get(Flag.bloom_bits))
+                   .setCacheNumShardBits((Integer)flags_.get(Flag.cache_numshardbits));
+      options.setTableFormatConfig(table_options);
    }
    options.setWriteBufferSize(
        (Long)flags_.get(Flag.write_buffer_size));
@ -532,12 +537,6 @@ public class DbBenchmark {
        (Integer)flags_.get(Flag.max_background_compactions));
    options.setMaxBackgroundFlushes(
        (Integer)flags_.get(Flag.max_background_flushes));
-    options.setCacheSize(
-        (Long)flags_.get(Flag.cache_size));
-    options.setCacheNumShardBits(
-        (Integer)flags_.get(Flag.cache_numshardbits));
-    options.setBlockSize(
-        (Long)flags_.get(Flag.block_size));
    options.setMaxOpenFiles(
        (Integer)flags_.get(Flag.open_files));
    options.setTableCacheRemoveScanCountLimit(
@ -548,8 +547,6 @@ public class DbBenchmark {
        (Boolean)flags_.get(Flag.use_fsync));
    options.setWalDir(
        (String)flags_.get(Flag.wal_dir));
-    options.setDisableSeekCompaction(
-        (Boolean)flags_.get(Flag.disable_seek_compaction));
    options.setDeleteObsoleteFilesPeriodMicros(
        (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
    options.setTableCacheNumshardbits(
@ -604,15 +601,6 @@ public class DbBenchmark {
        (Integer)flags_.get(Flag.max_successive_merges));
    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
-    int bloomBits = (Integer)flags_.get(Flag.bloom_bits);
-    if (bloomBits > 0) {
-      // Internally, options will keep a reference to this BloomFilter.
-      // This will disallow Java to GC this BloomFilter.  In addition,
-      // options.dispose() will release the c++ object of this BloomFilter.
-      // As a result, the caller should not directly call
-      // BloomFilter.dispose().
-      options.setFilter(new BloomFilter(bloomBits));
-    }
    /* TODO(yhchiang): enable the following parameters
    options.setCompressionType((String)flags_.get(Flag.compression_type));
    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
@ -1160,7 +1148,7 @@ public class DbBenchmark {
        return Integer.parseInt(value);
      }
    },
-    block_size(defaultOptions_.blockSize(),
+    block_size(defaultBlockBasedTableOptions_.blockSize(),
        "Number of bytes in a block.") {
      @Override public Object parseValue(String value) {
        return Long.parseLong(value);
@ -1312,12 +1300,6 @@ public class DbBenchmark {
        return Integer.parseInt(value);
      }
    },
-    disable_seek_compaction(false,"Option to disable compaction\n" +
-        "\ttriggered by read.") {
-      @Override public Object parseValue(String value) {
-        return parseBoolean(value);
-      }
-    },
    delete_obsolete_files_period_micros(0,"Option to delete\n" +
        "\tobsolete files periodically. 0 means that obsolete files are\n" +
        "\tdeleted after every compaction run.") {
@ -1597,7 +1579,6 @@ public class DbBenchmark {
  final int threadNum_;
  final int writesPerSeconds_;
  final long randSeed_;
-  final long cacheSize_;
  final boolean useExisting_;
  final String databaseDir_;
  double compressionRatio_;
@ -1620,6 +1601,8 @@ public class DbBenchmark {
  // as the scope of a static member equals to the scope of the problem,
  // we let its c++ pointer to be disposed in its finalizer.
  static Options defaultOptions_ = new Options();
+  static BlockBasedTableConfig defaultBlockBasedTableOptions_ =
+    new BlockBasedTableConfig();
  String compressionType_;
  CompressionType compression_;
 }
--- a/java/org/rocksdb/test/OptionsTest.java
+++ b/java/org/rocksdb/test/OptionsTest.java
@ -52,12 +52,6 @@ public class OptionsTest {
      assert(opt.useFsync() == boolValue);
    }

-    { // DbStatsLogInterval test
-      int intValue = rand.nextInt();
-      opt.setDbStatsLogInterval(intValue);
-      assert(opt.dbStatsLogInterval() == intValue);
-    }
-
    { // DbLogDir test
      String str = "path/to/DbLogDir";
      opt.setDbLogDir(str);
@ -214,24 +208,6 @@ public class OptionsTest {
      assert(opt.minWriteBufferNumberToMerge() == intValue);
    }

-    { // BlockSize test
-      long longValue = rand.nextLong();
-      opt.setBlockSize(longValue);
-      assert(opt.blockSize() == longValue);
-    }
-
-    { // BlockRestartInterval test
-      int intValue = rand.nextInt();
-      opt.setBlockRestartInterval(intValue);
-      assert(opt.blockRestartInterval() == intValue);
-    }
-
-    { // WholeKeyFiltering test
-      boolean boolValue = rand.nextBoolean();
-      opt.setWholeKeyFiltering(boolValue);
-      assert(opt.wholeKeyFiltering() == boolValue);
-    }
-
    { // NumLevels test
      int intValue = rand.nextInt();
      opt.setNumLevels(intValue);
@ -304,12 +280,6 @@ public class OptionsTest {
      assert(opt.maxGrandparentOverlapFactor() == intValue);
    }

-    { // DisableSeekCompaction test
-      boolean boolValue = rand.nextBoolean();
-      opt.setDisableSeekCompaction(boolValue);
-      assert(opt.disableSeekCompaction() == boolValue);
-    }
-
    { // SoftRateLimit test
      double doubleValue = rand.nextDouble();
      opt.setSoftRateLimit(doubleValue);
@ -328,12 +298,6 @@ public class OptionsTest {
      assert(opt.rateLimitDelayMaxMilliseconds() == intValue);
    }

-    { // NoBlockCache test
-      boolean boolValue = rand.nextBoolean();
-      opt.setNoBlockCache(boolValue);
-      assert(opt.noBlockCache() == boolValue);
-    }
-
    { // ArenaBlockSize test
      long longValue = rand.nextLong();
      opt.setArenaBlockSize(longValue);
@ -352,12 +316,6 @@ public class OptionsTest {
      assert(opt.purgeRedundantKvsWhileFlush() == boolValue);
    }

-    { // BlockSizeDeviation test
-      int intValue = rand.nextInt();
-      opt.setBlockSizeDeviation(intValue);
-      assert(opt.blockSizeDeviation() == intValue);
-    }
-
    { // VerifyChecksumsInCompaction test
      boolean boolValue = rand.nextBoolean();
      opt.setVerifyChecksumsInCompaction(boolValue);
--- a/java/rocksjni/memtablejni.cc
+++ b/java/rocksjni/memtablejni.cc
@ -5,6 +5,7 @@
 //
 // This file implements the "bridge" between Java and C++ for MemTables.

+#include "rocksjni/portal.h"
 #include "include/org_rocksdb_HashSkipListMemTableConfig.h"
 #include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
 #include "include/org_rocksdb_VectorMemTableConfig.h"
@ -20,7 +21,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
    JNIEnv* env, jobject jobj, jlong jbucket_count,
    jint jheight, jint jbranching_factor) {
  return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory(
-      static_cast<size_t>(jbucket_count),
+      rocksdb::jlong_to_size_t(jbucket_count),
      static_cast<int32_t>(jheight),
      static_cast<int32_t>(jbranching_factor)));
 }
@ -33,7 +34,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
 jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
    JNIEnv* env, jobject jobj, jlong jbucket_count) {
  return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory(
-       static_cast<size_t>(jbucket_count)));
+       rocksdb::jlong_to_size_t(jbucket_count)));
 }

 /*
@ -44,7 +45,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
 jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
    JNIEnv* env, jobject jobj, jlong jreserved_size) {
  return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
-      static_cast<size_t>(jreserved_size)));
+      rocksdb::jlong_to_size_t(jreserved_size)));
 }

 /*
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@ -21,7 +21,7 @@
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/table.h"
 #include "rocksdb/slice_transform.h"
-#include "rocksdb/filter_policy.h"
+#include "rocksdb/rate_limiter.h"

 /*
 * Class:     org_rocksdb_Options
@ -71,7 +71,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing(
 void Java_org_rocksdb_Options_setWriteBufferSize(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
-          static_cast<size_t>(jwrite_buffer_size);
+          rocksdb::jlong_to_size_t(jwrite_buffer_size);
 }


@ -118,17 +118,6 @@ jlong Java_org_rocksdb_Options_statisticsPtr(
  return reinterpret_cast<jlong>(st);
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    setFilterHandle
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setFilterHandle(
-    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) {
-  reinterpret_cast<rocksdb::Options*>(jopt_handle)->filter_policy =
-      reinterpret_cast<rocksdb::FilterPolicy*>(jfilter_handle);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    maxWriteBufferNumber
@ -139,49 +128,6 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber(
  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockSize
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setBlockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jblock_size) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size =
-          static_cast<size_t>(jblock_size);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    blockSize
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_Options_blockSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDisableSeekCompaction
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setDisableSeekCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jboolean jdisable_seek_compaction) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction =
-         jdisable_seek_compaction;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    disableSeekCompaction
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_disableSeekCompaction(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->disable_seek_compaction;
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    errorIfExists
@ -287,27 +233,6 @@ void Java_org_rocksdb_Options_setUseFsync(
      static_cast<bool>(use_fsync);
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    dbStatsLogInterval
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_dbStatsLogInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setDbStatsLogInterval
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setDbStatsLogInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->db_stats_log_interval =
-      static_cast<int>(db_stats_log_interval);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    dbLogDir
@ -438,7 +363,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize(
 void Java_org_rocksdb_Options_setMaxLogFileSize(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size =
-      static_cast<size_t>(max_log_file_size);
+      rocksdb::jlong_to_size_t(max_log_file_size);
 }

 /*
@ -459,7 +384,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll(
 void Java_org_rocksdb_Options_setLogFileTimeToRoll(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll =
-      static_cast<size_t>(log_file_time_to_roll);
+      rocksdb::jlong_to_size_t(log_file_time_to_roll);
 }

 /*
@ -480,7 +405,7 @@ jlong Java_org_rocksdb_Options_keepLogFileNum(
 void Java_org_rocksdb_Options_setKeepLogFileNum(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num =
-      static_cast<size_t>(keep_log_file_num);
+      rocksdb::jlong_to_size_t(keep_log_file_num);
 }

 /*
@ -535,6 +460,17 @@ void Java_org_rocksdb_Options_setMemTableFactory(
      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
 }

+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ *
+ * Installs the RateLimiter referenced by jrate_limiter_handle into the
+ * Options referenced by jhandle by calling rate_limiter.reset(). Both
+ * handles are raw pointers carried as jlong and are cast without checks.
+ * NOTE(review): reset() presumably makes the options' smart pointer the
+ * owner of the limiter, so a handle should be installed only once - confirm.
+ */
+void Java_org_rocksdb_Options_setRateLimiter(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->rate_limiter.reset(
+      reinterpret_cast<rocksdb::RateLimiter*>(jrate_limiter_handle));
+}
+
 /*
 * Class:     org_rocksdb_Options
 * Method:    tableCacheNumshardbits
@ -585,7 +521,8 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit(
 void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
-      rocksdb::NewFixedPrefixTransform(static_cast<size_t>(jprefix_length)));
+      rocksdb::NewFixedPrefixTransform(
+          rocksdb::jlong_to_size_t(jprefix_length)));
 }

 /*
@ -649,7 +586,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize(
 void Java_org_rocksdb_Options_setManifestPreallocationSize(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size =
-      static_cast<size_t>(preallocation_size);
+      rocksdb::jlong_to_size_t(preallocation_size);
 }

 /*
@ -914,27 +851,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
          static_cast<int>(jmin_write_buffer_number_to_merge);
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    blockRestartInterval
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_blockRestartInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockRestartInterval
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setBlockRestartInterval(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_restart_interval =
-      static_cast<int>(jblock_restart_interval);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    setCompressionType
@ -977,27 +893,6 @@ jbyte Java_org_rocksdb_Options_compactionStyle(
  return reinterpret_cast<rocksdb::Options*>(jhandle)->compaction_style;
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    wholeKeyFiltering
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_wholeKeyFiltering(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setWholeKeyFiltering
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setWholeKeyFiltering(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->whole_key_filtering =
-      static_cast<bool>(jwhole_key_filtering);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    numLevels
@ -1345,27 +1240,6 @@ void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds(
          static_cast<int>(jrate_limit_delay_max_milliseconds);
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    noBlockCache
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_noBlockCache(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setNoBlockCache
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setNoBlockCache(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->no_block_cache =
-      static_cast<bool>(jno_block_cache);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    arenaBlockSize
@ -1384,7 +1258,7 @@ jlong Java_org_rocksdb_Options_arenaBlockSize(
 void Java_org_rocksdb_Options_setArenaBlockSize(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size =
-      static_cast<size_t>(jarena_block_size);
+      rocksdb::jlong_to_size_t(jarena_block_size);
 }

 /*
@ -1435,28 +1309,6 @@ void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush(
          static_cast<bool>(jpurge_redundant_kvs_while_flush);
 }

-/*
- * Class:     org_rocksdb_Options
- * Method:    blockSizeDeviation
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_blockSizeDeviation(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setBlockSizeDeviation
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setBlockSizeDeviation(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jblock_size_deviation) {
-  reinterpret_cast<rocksdb::Options*>(jhandle)->block_size_deviation =
-      static_cast<int>(jblock_size_deviation);
-}
-
 /*
 * Class:     org_rocksdb_Options
 * Method:    verifyChecksumsInCompaction
@ -1571,7 +1423,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
    jlong jinplace_update_num_locks) {
  reinterpret_cast<rocksdb::Options*>(
      jhandle)->inplace_update_num_locks =
-          static_cast<size_t>(jinplace_update_num_locks);
+          rocksdb::jlong_to_size_t(jinplace_update_num_locks);
 }

 /*
@ -1662,7 +1514,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
    JNIEnv* env, jobject jobj, jlong jhandle,
    jlong jmax_successive_merges) {
  reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges =
-      static_cast<size_t>(jmax_successive_merges);
+      rocksdb::jlong_to_size_t(jmax_successive_merges);
 }

 /*
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@ -11,12 +11,19 @@
 #define JAVA_ROCKSJNI_PORTAL_H_

 #include <jni.h>
+#include <limits>
 #include "rocksdb/db.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/utilities/backupable_db.h"

 namespace rocksdb {

+inline size_t jlong_to_size_t(const jlong& jvalue) {  // clamp to size_t max
+  return static_cast<uint64_t>(jvalue) <=  // negatives become huge here
+      static_cast<uint64_t>(std::numeric_limits<size_t>::max()) ?
+      static_cast<size_t>(jvalue) : std::numeric_limits<size_t>::max();
+}
+
 // The portal class for org.rocksdb.RocksDB
 class RocksDBJni {
 public:
--- a/java/rocksjni/ratelimiterjni.cc
+++ b/java/rocksjni/ratelimiterjni.cc
@ -0,0 +1,24 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for RateLimiter.
+
+#include "rocksjni/portal.h"
+#include "include/org_rocksdb_GenericRateLimiterConfig.h"
+#include "rocksdb/rate_limiter.h"
+
+/*
+ * Class:     org_rocksdb_GenericRateLimiterConfig
+ * Method:    newRateLimiterHandle
+ * Signature: (JJI)J
+ *
+ * Creates a rate limiter with rocksdb::NewGenericRateLimiter() and returns
+ * the resulting pointer as a jlong handle.
+ * NOTE(review): both jlong arguments pass through jlong_to_size_t(), which
+ * clamps to size_t max where size_t is narrower than 64 bits. Confirm that
+ * this matches the parameter types NewGenericRateLimiter() declares.
+ */
+jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle(
+    JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second,
+    jlong jrefill_period_micros, jint jfairness) {
+  return reinterpret_cast<jlong>(rocksdb::NewGenericRateLimiter(
+      rocksdb::jlong_to_size_t(jrate_bytes_per_second),
+      rocksdb::jlong_to_size_t(jrefill_period_micros),
+      static_cast<int32_t>(jfairness)));
+}
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@ -26,21 +26,8 @@
 * Signature: (JLjava/lang/String;)V
 */
 void Java_org_rocksdb_RocksDB_open(
-    JNIEnv* env, jobject jdb, jlong jopt_handle,
-    jlong jcache_size, jint jnum_shardbits, jstring jdb_path) {
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) {
  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
-  if (jcache_size > 0) {
-    opt->no_block_cache = false;
-    if (jnum_shardbits >= 1) {
-      opt->block_cache = rocksdb::NewLRUCache(jcache_size, jnum_shardbits);
-    } else {
-      opt->block_cache = rocksdb::NewLRUCache(jcache_size);
-    }
-  } else {
-    opt->no_block_cache = true;
-    opt->block_cache = nullptr;
-  }
-
  rocksdb::DB* db = nullptr;
  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
  rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db);
--- a/java/rocksjni/table.cc
+++ b/java/rocksjni/table.cc
@ -7,7 +7,10 @@

 #include <jni.h>
 #include "include/org_rocksdb_PlainTableConfig.h"
+#include "include/org_rocksdb_BlockBasedTableConfig.h"
 #include "rocksdb/table.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/filter_policy.h"

 /*
 * Class:     org_rocksdb_PlainTableConfig
@ -24,3 +27,34 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
  options.index_sparseness = jindex_sparseness;
  return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(options));
 }
+
+/*
+ * Class:     org_rocksdb_BlockBasedTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (ZJIJIIZI)J
+ *
+ * Builds a rocksdb::BlockBasedTableOptions from the arguments and returns
+ * the pointer from NewBlockBasedTableFactory() as a jlong handle.
+ *  - An LRU block cache is created only when no_block_cache is false and
+ *    block_cache_size > 0; num_shardbits is passed on only when it is > 0.
+ *  - A bloom filter policy is set only when bits_per_key > 0.
+ * NOTE(review): block_size is assigned straight from the jlong, without the
+ * jlong_to_size_t() helper used for other size options in the JNI layer.
+ */
+jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size,
+    jint num_shardbits, jlong block_size, jint block_size_deviation,
+    jint block_restart_interval, jboolean whole_key_filtering,
+    jint bits_per_key) {
+  rocksdb::BlockBasedTableOptions options;
+  options.no_block_cache = no_block_cache;
+
+  if (!no_block_cache && block_cache_size > 0) {
+    if (num_shardbits > 0) {
+      options.block_cache =
+          rocksdb::NewLRUCache(block_cache_size, num_shardbits);
+    } else {
+      options.block_cache = rocksdb::NewLRUCache(block_cache_size);
+    }
+  }
+  options.block_size = block_size;
+  options.block_size_deviation = block_size_deviation;
+  options.block_restart_interval = block_restart_interval;
+  options.whole_key_filtering = whole_key_filtering;
+  if (bits_per_key > 0) {
+    options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key));
+  }
+  return reinterpret_cast<jlong>(rocksdb::NewBlockBasedTableFactory(options));
+}
--- a/java/rocksjni/write_batch.cc
+++ b/java/rocksjni/write_batch.cc
@ -12,12 +12,14 @@
 #include "include/org_rocksdb_WriteBatchTest.h"
 #include "rocksjni/portal.h"
 #include "rocksdb/db.h"
+#include "rocksdb/immutable_options.h"
 #include "db/memtable.h"
 #include "rocksdb/write_batch.h"
 #include "db/write_batch_internal.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
 #include "util/logging.h"
+#include "util/scoped_arena_iterator.h"
 #include "util/testharness.h"

 /*
@ -28,7 +30,7 @@
 void Java_org_rocksdb_WriteBatch_newWriteBatch(
    JNIEnv* env, jobject jobj, jint jreserved_bytes) {
  rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
-      static_cast<size_t>(jreserved_bytes));
+      rocksdb::jlong_to_size_t(jreserved_bytes));

  rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
 }
@ -202,14 +204,18 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
  auto factory = std::make_shared<rocksdb::SkipListFactory>();
  rocksdb::Options options;
  options.memtable_factory = factory;
-  rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options);
+  rocksdb::MemTable* mem = new rocksdb::MemTable(
+      cmp, rocksdb::ImmutableCFOptions(options),
+      rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options), options));
  mem->Ref();
  std::string state;
  rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
  rocksdb::Status s =
      rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
  int count = 0;
-  rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions());
+  rocksdb::Arena arena;
+  rocksdb::ScopedArenaIterator iter(mem->NewIterator(
+      rocksdb::ReadOptions(), &arena));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    rocksdb::ParsedInternalKey ikey;
    memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
@ -244,7 +250,6 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
    state.append("@");
    state.append(rocksdb::NumberToString(ikey.sequence));
  }
-  delete iter;
  if (!s.ok()) {
    state.append(s.ToString());
  } else if (count != rocksdb::WriteBatchInternal::Count(b)) {
--- a/port/stack_trace.cc
+++ b/port/stack_trace.cc
@ -33,7 +33,7 @@ const char* GetExecutableName() {

  char link[1024];
  snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
-  auto read = readlink(link, name, sizeof(name));
+  auto read = readlink(link, name, sizeof(name) - 1);
  if (-1 == read) {
    return nullptr;
  } else {
--- a/table/adaptive_table_factory.cc
+++ b/table/adaptive_table_factory.cc
@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 extern const uint64_t kCuckooTableMagicNumber;

 Status AdaptiveTableFactory::NewTableReader(
-    const Options& options, const EnvOptions& soptions,
+    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
    const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file,
    uint64_t file_size, unique_ptr<TableReader>* table) const {
  Footer footer;
@ -50,24 +50,59 @@ Status AdaptiveTableFactory::NewTableReader(
  if (footer.table_magic_number() == kPlainTableMagicNumber ||
      footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
    return plain_table_factory_->NewTableReader(
-        options, soptions, icomp, std::move(file), file_size, table);
+        ioptions, env_options, icomp, std::move(file), file_size, table);
  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
      footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
    return block_based_table_factory_->NewTableReader(
-        options, soptions, icomp, std::move(file), file_size, table);
+        ioptions, env_options, icomp, std::move(file), file_size, table);
  } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
    return cuckoo_table_factory_->NewTableReader(
-        options, soptions, icomp, std::move(file), file_size, table);
+        ioptions, env_options, icomp, std::move(file), file_size, table);
  } else {
    return Status::NotSupported("Unidentified table format");
  }
 }

 TableBuilder* AdaptiveTableFactory::NewTableBuilder(
-    const Options& options, const InternalKeyComparator& internal_comparator,
-    WritableFile* file, CompressionType compression_type) const {
-  return table_factory_to_write_->NewTableBuilder(options, internal_comparator,
-                                                  file, compression_type);
+    const ImmutableCFOptions& ioptions,
+    const InternalKeyComparator& internal_comparator,
+    WritableFile* file, const CompressionType compression_type,
+    const CompressionOptions& compression_opts) const {
+  return table_factory_to_write_->NewTableBuilder(
+      ioptions, internal_comparator, file, compression_type, compression_opts);
+}
+
+// Builds a human-readable dump of the options of every sub-factory that is
+// actually configured: first the factory used for writing, then the plain,
+// block-based and cuckoo factories. Factories that are not set are skipped.
+//
+// Each section is appended directly to the result string rather than being
+// formatted through a fixed-size buffer, so a long nested options dump is
+// never truncated.
+std::string AdaptiveTableFactory::GetPrintableTableOptions() const {
+  std::string ret;
+  ret.reserve(20000);
+
+  // Only dereference a factory when it is present.
+  if (table_factory_to_write_) {
+    ret.append("  write factory (");
+    ret.append(table_factory_to_write_->Name());
+    ret.append(") options:\n");
+    ret.append(table_factory_to_write_->GetPrintableTableOptions());
+    ret.append("\n");
+  }
+  if (plain_table_factory_) {
+    ret.append("  ");
+    ret.append(plain_table_factory_->Name());
+    ret.append(" options:\n");
+    ret.append(plain_table_factory_->GetPrintableTableOptions());
+    ret.append("\n");
+  }
+  if (block_based_table_factory_) {
+    ret.append("  ");
+    ret.append(block_based_table_factory_->Name());
+    ret.append(" options:\n");
+    ret.append(block_based_table_factory_->GetPrintableTableOptions());
+    ret.append("\n");
+  }
+  if (cuckoo_table_factory_) {
+    ret.append("  ");
+    ret.append(cuckoo_table_factory_->Name());
+    ret.append(" options:\n");
+    ret.append(cuckoo_table_factory_->GetPrintableTableOptions());
+    ret.append("\n");
+  }
+  return ret;
 }

 extern TableFactory* NewAdaptiveTableFactory(
--- a/table/adaptive_table_factory.h
+++ b/table/adaptive_table_factory.h
@ -6,12 +6,12 @@

 #ifndef ROCKSDB_LITE

+#include <string>
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"

 namespace rocksdb {

-struct Options;
 struct EnvOptions;

 using std::unique_ptr;
@ -30,16 +30,32 @@ class AdaptiveTableFactory : public TableFactory {
      std::shared_ptr<TableFactory> block_based_table_factory,
      std::shared_ptr<TableFactory> plain_table_factory,
      std::shared_ptr<TableFactory> cuckoo_table_factory);
+
  const char* Name() const override { return "AdaptiveTableFactory"; }
-  Status NewTableReader(const Options& options, const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table) const override;
-  TableBuilder* NewTableBuilder(const Options& options,
-                                const InternalKeyComparator& icomparator,
-                                WritableFile* file,
-                                CompressionType compression_type) const
-      override;
+
+  Status NewTableReader(
+      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
+      const InternalKeyComparator& internal_comparator,
+      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      unique_ptr<TableReader>* table) const override;
+
+  TableBuilder* NewTableBuilder(
+      const ImmutableCFOptions& ioptions,
+      const InternalKeyComparator& icomparator,
+      WritableFile* file,
+      const CompressionType compression_type,
+      const CompressionOptions& compression_opts) const override;
+
+  // Validates the DB-wide options for use with this factory: the options are
+  // accepted only when allow_mmap_reads is enabled, otherwise NotSupported
+  // is returned.
+  Status SanitizeDBOptions(const DBOptions* db_opts) const override {
+    if (db_opts->allow_mmap_reads) {
+      return Status::OK();
+    }
+    return Status::NotSupported(
+        "AdaptiveTable with allow_mmap_reads == false is not supported.");
+  }
+
+  std::string GetPrintableTableOptions() const override;

 private:
  std::shared_ptr<TableFactory> table_factory_to_write_;
--- a/table/block.cc
+++ b/table/block.cc
@ -297,12 +297,10 @@ uint32_t Block::NumRestarts() const {
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }

-Block::Block(const BlockContents& contents)
-    : data_(contents.data.data()),
-      size_(contents.data.size()),
-      owned_(contents.heap_allocated),
-      cachable_(contents.cachable),
-      compression_type_(contents.compression_type) {
+Block::Block(BlockContents&& contents)
+    : contents_(std::move(contents)),
+      data_(contents_.data.data()),
+      size_(contents_.data.size()) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
@ -315,13 +313,8 @@ Block::Block(const BlockContents& contents)
  }
 }

-Block::~Block() {
-  if (owned_) {
-    delete[] data_;
-  }
-}
-
-Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
+Iterator* Block::NewIterator(
+    const Comparator* cmp, BlockIter* iter, bool total_order_seek) {
  if (size_ < 2*sizeof(uint32_t)) {
    if (iter != nullptr) {
      iter->SetStatus(Status::Corruption("bad block contents"));
@ -339,12 +332,17 @@ Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
      return NewEmptyIterator();
    }
  } else {
+    BlockHashIndex* hash_index_ptr =
+        total_order_seek ? nullptr : hash_index_.get();
+    BlockPrefixIndex* prefix_index_ptr =
+        total_order_seek ? nullptr : prefix_index_.get();
+
    if (iter != nullptr) {
      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
-                    hash_index_.get(), prefix_index_.get());
+                    hash_index_ptr, prefix_index_ptr);
    } else {
      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
-                    hash_index_.get(), prefix_index_.get());
+                           hash_index_ptr, prefix_index_ptr);
    }
  }

--- a/table/block.h
+++ b/table/block.h
@ -14,6 +14,10 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "db/dbformat.h"
+#include "table/block_prefix_index.h"
+#include "table/block_hash_index.h"
+
+#include "format.h"

 namespace rocksdb {

@ -26,15 +30,17 @@ class BlockPrefixIndex;
 class Block {
 public:
  // Initialize the block with the specified contents.
-  explicit Block(const BlockContents& contents);
+  explicit Block(BlockContents&& contents);

-  ~Block();
+  ~Block() = default;

  size_t size() const { return size_; }
  const char* data() const { return data_; }
-  bool cachable() const { return cachable_; }
+  bool cachable() const { return contents_.cachable; }
  uint32_t NumRestarts() const;
-  CompressionType compression_type() const { return compression_type_; }
+  CompressionType compression_type() const {
+    return contents_.compression_type;
+  }

  // If hash index lookup is enabled and `use_hash_index` is true. This block
  // will do hash lookup for the key prefix.
@ -45,8 +51,12 @@ class Block {
  //
  // If iter is null, return new Iterator
  // If iter is not null, update this one and return it as Iterator*
+  //
+  // If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
+  // This option only applies for index block. For data block, hash_index_
+  // and prefix_index_ are null, so this option does not matter.
  Iterator* NewIterator(const Comparator* comparator,
-      BlockIter* iter = nullptr);
+      BlockIter* iter = nullptr, bool total_order_seek = true);
  void SetBlockHashIndex(BlockHashIndex* hash_index);
  void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);

@ -54,12 +64,10 @@ class Block {
  size_t ApproximateMemoryUsage() const;

 private:
-  const char* data_;
-  size_t size_;
+  BlockContents contents_;
+  const char* data_;            // contents_.data.data()
+  size_t size_;                 // contents_.data.size()
  uint32_t restart_offset_;     // Offset in data_ of restart array
-  bool owned_;                  // Block owns data_[]
-  bool cachable_;
-  CompressionType compression_type_;
  std::unique_ptr<BlockHashIndex> hash_index_;
  std::unique_ptr<BlockPrefixIndex> prefix_index_;

--- a/table/block_based_filter_block.cc
+++ b/table/block_based_filter_block.cc
@ -7,7 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#include "table/filter_block.h"
+#include "table/block_based_filter_block.h"

 #include "db/dbformat.h"
 #include "rocksdb/filter_policy.h"
@ -15,20 +15,39 @@

 namespace rocksdb {

+namespace {
+// Reports whether key1 and key2 fall under the same prefix according to
+// prefix_extractor:
+//   - both keys outside the extractor's domain  -> true
+//   - exactly one key outside the domain        -> false
+//   - both inside                               -> compare transformed keys
+// prefix_extractor must be non-null.
+bool SamePrefix(const SliceTransform* prefix_extractor,
+                const Slice& key1, const Slice& key2) {
+  // Look up each key's domain membership once instead of re-querying the
+  // extractor in every branch.
+  const bool key1_in_domain = prefix_extractor->InDomain(key1);
+  const bool key2_in_domain = prefix_extractor->InDomain(key2);
+
+  if (key1_in_domain != key2_in_domain) {
+    return false;
+  }
+  if (!key1_in_domain) {
+    // Neither key is in the domain.
+    return true;
+  }
+  return (prefix_extractor->Transform(key1) ==
+          prefix_extractor->Transform(key2));
+}
+}  // namespace
+
+
 // See doc/table_format.txt for an explanation of the filter block format.

 // Generate new filter every 2KB of data
 static const size_t kFilterBaseLg = 11;
 static const size_t kFilterBase = 1 << kFilterBaseLg;

-FilterBlockBuilder::FilterBlockBuilder(const Options& opt,
-                                       const Comparator* internal_comparator)
-    : policy_(opt.filter_policy),
-      prefix_extractor_(opt.prefix_extractor.get()),
-      whole_key_filtering_(opt.whole_key_filtering),
-      comparator_(internal_comparator) {}
+// Captures the filter policy and whole-key setting from table_opt together
+// with the (possibly null) prefix extractor. Both pointers are borrowed, not
+// owned. A filter policy is mandatory and is asserted.
+BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
+    const SliceTransform* prefix_extractor,
+    const BlockBasedTableOptions& table_opt)
+    : policy_(table_opt.filter_policy.get()),
+      prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(table_opt.whole_key_filtering),
+      // Give the per-Add bookkeeping a defined value from construction on
+      // instead of leaving it indeterminate until the first Add().
+      added_to_start_(0) {
+  assert(policy_);
+}

-void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
+void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
  uint64_t filter_index = (block_offset / kFilterBase);
  assert(filter_index >= filter_offsets_.size());
  while (filter_index > filter_offsets_.size()) {
@ -36,59 +55,45 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
  }
 }

-bool FilterBlockBuilder::SamePrefix(const Slice &key1,
-                                    const Slice &key2) const {
-  if (!prefix_extractor_->InDomain(key1) &&
-      !prefix_extractor_->InDomain(key2)) {
-    return true;
-  } else if (!prefix_extractor_->InDomain(key1) ||
-             !prefix_extractor_->InDomain(key2)) {
-    return false;
-  } else {
-    return (prefix_extractor_->Transform(key1) ==
-            prefix_extractor_->Transform(key2));
+// Feeds one key into the filter being built. The whole key is recorded when
+// whole-key filtering is on, and its prefix is recorded when a prefix
+// extractor is configured and the key lies in its domain.
+void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
+  if (whole_key_filtering_) {
+    AddKey(key);
+  }
+  // AddPrefix uses this to skip the entry just pushed for the current key
+  // when it looks up the previously added entry.
+  added_to_start_ = whole_key_filtering_ ? 1 : 0;
+
+  const bool key_has_prefix =
+      prefix_extractor_ != nullptr && prefix_extractor_->InDomain(key);
+  if (key_has_prefix) {
+    AddPrefix(key);
   }
 }

-void FilterBlockBuilder::AddKey(const Slice& key) {
+// Records `key` as a new filter entry: its bytes are appended to the
+// flattened entries_ buffer and the offset where they begin is pushed onto
+// start_.
+inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
+  const size_t entry_offset = entries_.size();
+  entries_.append(key.data(), key.size());
+  start_.push_back(entry_offset);
+}
+
+// Add prefix to filter if needed
+inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
  // get slice for most recently added entry
  Slice prev;
-  size_t added_to_start = 0;
-
-  // add key to filter if needed
-  if (whole_key_filtering_) {
-    start_.push_back(entries_.size());
-    ++added_to_start;
-    entries_.append(key.data(), key.size());
-  }
-
-  if (start_.size() > added_to_start) {
-    size_t prev_start = start_[start_.size() - 1 - added_to_start];
+  if (start_.size() > added_to_start_) {
+    size_t prev_start = start_[start_.size() - 1 - added_to_start_];
    const char* base = entries_.data() + prev_start;
    size_t length = entries_.size() - prev_start;
    prev = Slice(base, length);
  }

-  // add prefix to filter if needed
-  if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) {
-    // If prefix_extractor_, this filter_block layer assumes we only
-    // operate on internal keys.
-    Slice user_key = ExtractUserKey(key);
-    // this assumes prefix(prefix(key)) == prefix(key), as the last
-    // entry in entries_ may be either a key or prefix, and we use
-    // prefix(last entry) to get the prefix of the last key.
-    if (prev.size() == 0 ||
-        !SamePrefix(user_key, ExtractUserKey(prev))) {
-      Slice prefix = prefix_extractor_->Transform(user_key);
-      InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
-      Slice internal_prefix = internal_prefix_tmp.Encode();
-      start_.push_back(entries_.size());
-      entries_.append(internal_prefix.data(), internal_prefix.size());
-    }
+  // this assumes prefix(prefix(key)) == prefix(key), as the last
+  // entry in entries_ may be either a key or prefix, and we use
+  // prefix(last entry) to get the prefix of the last key.
+  if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) {
+    Slice prefix = prefix_extractor_->Transform(key);
+    start_.push_back(entries_.size());
+    entries_.append(prefix.data(), prefix.size());
  }
 }

-Slice FilterBlockBuilder::Finish() {
+Slice BlockBasedFilterBlockBuilder::Finish() {
  if (!start_.empty()) {
    GenerateFilter();
  }
@ -104,7 +109,7 @@ Slice FilterBlockBuilder::Finish() {
  return Slice(result_);
 }

-void FilterBlockBuilder::GenerateFilter() {
+void BlockBasedFilterBlockBuilder::GenerateFilter() {
  const size_t num_entries = start_.size();
  if (num_entries == 0) {
    // Fast path if there are no keys for this filter
@ -117,7 +122,7 @@ void FilterBlockBuilder::GenerateFilter() {
  tmp_entries_.resize(num_entries);
  for (size_t i = 0; i < num_entries; i++) {
    const char* base = entries_.data() + start_[i];
-    size_t length = start_[i+1] - start_[i];
+    size_t length = start_[i + 1] - start_[i];
    tmp_entries_[i] = Slice(base, length);
  }

@ -130,49 +135,52 @@ void FilterBlockBuilder::GenerateFilter() {
  start_.clear();
 }

-FilterBlockReader::FilterBlockReader(
-    const Options& opt, const Slice& contents, bool delete_contents_after_use)
-    : policy_(opt.filter_policy),
-      prefix_extractor_(opt.prefix_extractor.get()),
-      whole_key_filtering_(opt.whole_key_filtering),
+BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
+    const SliceTransform* prefix_extractor,
+    const BlockBasedTableOptions& table_opt, BlockContents&& contents)
+    : policy_(table_opt.filter_policy.get()),
+      prefix_extractor_(prefix_extractor),
+      whole_key_filtering_(table_opt.whole_key_filtering),
      data_(nullptr),
      offset_(nullptr),
      num_(0),
-      base_lg_(0) {
-  size_t n = contents.size();
+      base_lg_(0),
+      contents_(std::move(contents)) {
+  assert(policy_);
+  size_t n = contents_.data.size();
  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
-  base_lg_ = contents[n-1];
-  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
+  base_lg_ = contents_.data[n - 1];
+  uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
  if (last_word > n - 5) return;
-  data_ = contents.data();
+  data_ = contents_.data.data();
  offset_ = data_ + last_word;
  num_ = (n - 5 - last_word) / 4;
-  if (delete_contents_after_use) {
-    filter_data.reset(contents.data());
-  }
 }

-bool FilterBlockReader::KeyMayMatch(uint64_t block_offset,
-                                    const Slice& key) {
+bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key,
+                                              uint64_t block_offset) {
+  assert(block_offset != kNotValid);
  if (!whole_key_filtering_) {
    return true;
  }
-  return MayMatch(block_offset, key);
+  return MayMatch(key, block_offset);
 }

-bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset,
-                                       const Slice& prefix) {
+bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix,
+                                                 uint64_t block_offset) {
+  assert(block_offset != kNotValid);
  if (!prefix_extractor_) {
    return true;
  }
-  return MayMatch(block_offset, prefix);
+  return MayMatch(prefix, block_offset);
 }

-bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
+bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
+                                           uint64_t block_offset) {
  uint64_t index = block_offset >> base_lg_;
  if (index < num_) {
-    uint32_t start = DecodeFixed32(offset_ + index*4);
-    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
+    uint32_t start = DecodeFixed32(offset_ + index * 4);
+    uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
    if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
      Slice filter = Slice(data_ + start, limit - start);
      return policy_->KeyMayMatch(entry, filter);
@ -184,7 +192,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
  return true;  // Errors are treated as potential matches
 }

-size_t FilterBlockReader::ApproximateMemoryUsage() const {
+size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
  return num_ * 4 + 5 + (offset_ - data_);
 }
 }
--- a/table/block_based_filter_block.h
+++ b/table/block_based_filter_block.h
@ -0,0 +1,101 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A filter block is stored near the end of a Table file.  It contains
+// filters (e.g., bloom filters) for all data blocks in the table combined
+// into a single filter block.
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <memory>
+#include <vector>
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/filter_block.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+
+// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a
+// particular Table.  It generates a single string which is stored as
+// a special block in the Table.
+//
+// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp:
+//      (StartBlock Add*)* Finish
+class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {
+ public:
+  // prefix_extractor may be null (no prefixes are added then); the filter
+  // policy is taken from table_opt.filter_policy and must be set. Both are
+  // held as raw, non-owning pointers.
+  BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor,
+      const BlockBasedTableOptions& table_opt);
+
+  virtual bool IsBlockBased() override { return true; }
+  // Announces that a data block begins at block_offset in the table file.
+  virtual void StartBlock(uint64_t block_offset) override;
+  // Adds key (when whole-key filtering is on) and/or its prefix (when a
+  // prefix extractor is set and the key is in its domain) to the filter.
+  virtual void Add(const Slice& key) override;
+  // Flushes any pending entries and returns the finished filter block. The
+  // returned Slice is built from the internal result_ string.
+  // NOTE(review): presumably only valid while this builder is alive and
+  // unmodified -- confirm Slice is a non-owning view.
+  virtual Slice Finish() override;
+
+ private:
+  // Appends the key bytes to entries_ and records their start offset.
+  void AddKey(const Slice& key);
+  // Appends the key's prefix unless it matches the previous entry's prefix.
+  void AddPrefix(const Slice& key);
+  // Turns the entries collected so far into one filter and clears them.
+  void GenerateFilter();
+
+  // important: all of these might point to invalid addresses
+  // at the time of destruction of this filter block. destructor
+  // should NOT dereference them.
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+
+  std::string entries_;             // Flattened entry contents
+  std::vector<size_t> start_;       // Starting index in entries_ of each entry
+  uint32_t added_to_start_;         // To indicate if key is added
+  std::string result_;              // Filter data computed so far
+  std::vector<Slice> tmp_entries_;  // policy_->CreateFilter() argument
+  std::vector<uint32_t> filter_offsets_;
+
+  // No copying allowed
+  BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&);
+  void operator=(const BlockBasedFilterBlockBuilder&);
+};
+
+// A BlockBasedFilterBlockReader parses a block-based filter block read from
+// an SST table. KeyMayMatch and PrefixMayMatch trigger the filter check for
+// the data block that starts at the given offset.
+class BlockBasedFilterBlockReader : public FilterBlockReader {
+ public:
+  // "contents" is moved into the reader, which keeps it in contents_ for its
+  // whole lifetime.
+  // REQUIRES: prefix_extractor (if non-null) and table_opt.filter_policy are
+  // held as raw pointers and must stay live while *this is live.
+  BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor,
+                              const BlockBasedTableOptions& table_opt,
+                              BlockContents&& contents);
+  virtual bool IsBlockBased() override { return true; }
+  // Returns true without consulting the filter when whole-key filtering is
+  // disabled. block_offset must be supplied: the kNotValid default is
+  // rejected by an assert.
+  virtual bool KeyMayMatch(const Slice& key,
+                           uint64_t block_offset = kNotValid) override;
+  // Returns true without consulting the filter when no prefix extractor is
+  // configured. block_offset must be supplied (asserted != kNotValid).
+  virtual bool PrefixMayMatch(const Slice& prefix,
+                              uint64_t block_offset = kNotValid) override;
+  virtual size_t ApproximateMemoryUsage() const override;
+
+ private:
+  const FilterPolicy* policy_;
+  const SliceTransform* prefix_extractor_;
+  bool whole_key_filtering_;
+  const char* data_;    // Pointer to filter data (at block-start)
+  const char* offset_;  // Pointer to beginning of offset array (at block-end)
+  size_t num_;          // Number of entries in offset array
+  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
+  BlockContents contents_;  // Owns/holds the block that data_ points into
+
+  // Shared lookup for keys and prefixes; errors are treated as potential
+  // matches.
+  bool MayMatch(const Slice& entry, uint64_t block_offset);
+
+  // No copying allowed
+  BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&);
+  void operator=(const BlockBasedFilterBlockReader&);
+};
+}  // namespace rocksdb
--- a/table/block_based_filter_block_test.cc
+++ b/table/block_based_filter_block_test.cc
@ -0,0 +1,242 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/block_based_filter_block.h"
+
+#include "rocksdb/filter_policy.h"
+#include "util/coding.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+// Test-only filter policy: the "filter" is simply the list of 32-bit hashes
+// of the keys, and a lookup scans that list for the probe key's hash.
+class TestHashFilter : public FilterPolicy {
+ public:
+  virtual const char* Name() const { return "TestHashFilter"; }
+
+  // Appends one fixed32 hash per key to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    for (int idx = 0; idx < n; ++idx) {
+      const Slice& current = keys[idx];
+      const uint32_t hashed = Hash(current.data(), current.size(), 1);
+      PutFixed32(dst, hashed);
+    }
+  }
+
+  // True iff some complete 4-byte word of the filter equals the key's hash.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    const uint32_t wanted = Hash(key.data(), key.size(), 1);
+    const size_t whole_words = filter.size() / 4;
+    for (size_t word = 0; word < whole_words; ++word) {
+      if (DecodeFixed32(filter.data() + word * 4) == wanted) {
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
+// Fixture for the tests that exercise the builder/reader with the
+// hash-list test policy. The policy lives in table_options_.filter_policy,
+// which is what the builder and reader read it from; no separate policy
+// member is kept because nothing in this file uses one.
+class FilterBlockTest {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  FilterBlockTest() {
+    table_options_.filter_policy.reset(new TestHashFilter());
+  }
+};
+
+// A builder that never saw a key still emits a well-formed 5-byte block:
+// a fixed32 zero (start of the empty offset array) followed by the
+// base-lg byte 0x0b (kFilterBaseLg == 11).
+TEST(FilterBlockTest, EmptyBuilder) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block));
+  // With no filters present every lookup is reported as a potential match.
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100000));
+}
+
+// Block offsets 100, 200 and 300 all lie below the 2KB filter base, so every
+// key added here lands in the same (first) filter and is looked up through
+// offset 100.
+TEST(FilterBlockTest, SingleChunk) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+  builder.StartBlock(100);
+  builder.Add("foo");
+  builder.Add("bar");
+  builder.Add("box");
+  builder.StartBlock(200);
+  builder.Add("box");
+  builder.StartBlock(300);
+  builder.Add("hello");
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block));
+  // Every added key must be found ...
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("bar", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("box", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("hello", 100));
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 100));
+  // ... and keys that were never added must be rejected by the test policy.
+  ASSERT_TRUE(!reader.KeyMayMatch("missing", 100));
+  ASSERT_TRUE(!reader.KeyMayMatch("other", 100));
+}
+
+// Spreads keys over several filters. A filter covers 2KB (1 << 11) of block
+// offsets, so offsets 0 and 2000 map to filter 0, 3100 to filter 1, 4100 to
+// filter 2 (nothing added there) and 9000 to filter 4.
+TEST(FilterBlockTest, MultiChunk) {
+  BlockBasedFilterBlockBuilder builder(nullptr, table_options_);
+
+  // First filter
+  builder.StartBlock(0);
+  builder.Add("foo");
+  builder.StartBlock(2000);
+  builder.Add("bar");
+
+  // Second filter
+  builder.StartBlock(3100);
+  builder.Add("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder.StartBlock(9000);
+  builder.Add("box");
+  builder.Add("hello");
+
+  BlockContents block(builder.Finish(), false, kNoCompression);
+  BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block));
+
+  // Check first filter
+  ASSERT_TRUE(reader.KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader.KeyMayMatch("bar", 2000));
+  ASSERT_TRUE(!reader.KeyMayMatch("box", 0));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 0));
+
+  // Check second filter
+  ASSERT_TRUE(reader.KeyMayMatch("box", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("box", 4100));
+  ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100));
+
+  // Check last filter
+  ASSERT_TRUE(reader.KeyMayMatch("box", 9000));
+  ASSERT_TRUE(reader.KeyMayMatch("hello", 9000));
+  ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000));
+  ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000));
+}
+
+// Test for block based filter block
+// use new interface in FilterPolicy to create filter builder/reader
+//
+// Fixture: installs a bloom filter policy built with NewBloomFilterPolicy(10)
+// into table_options_, which the tests hand to the builder and reader.
+class BlockBasedFilterBlockTest {
+ public:
+  BlockBasedTableOptions table_options_;
+
+  BlockBasedFilterBlockTest() {
+    table_options_.filter_policy.reset(NewBloomFilterPolicy(10));
+  }
+
+  ~BlockBasedFilterBlockTest() {}
+};
+
+// Same scenario as the empty-builder case, but driven through the
+// FilterBlockBuilder / FilterBlockReader base-class interfaces.
+// The builder and reader are held by std::unique_ptr so they are released on
+// every exit path; the reader (declared last) is destroyed before the
+// builder whose Finish() result it was constructed from.
+TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
+  std::unique_ptr<FilterBlockBuilder> builder(
+      new BlockBasedFilterBlockBuilder(nullptr, table_options_));
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  // fixed32 zero (empty offset array) followed by the base-lg byte 0x0b.
+  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));
+  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
+      nullptr, table_options_, std::move(block)));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100000));
+}
+
+// All block offsets used here (100, 200, 300) are below the 2KB filter base,
+// so every key ends up in the first filter.
+// The builder and reader are held by std::unique_ptr so they are released on
+// every exit path; the reader (declared last) is destroyed before the
+// builder whose Finish() result it was constructed from.
+TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
+  std::unique_ptr<FilterBlockBuilder> builder(
+      new BlockBasedFilterBlockBuilder(nullptr, table_options_));
+  builder->StartBlock(100);
+  builder->Add("foo");
+  builder->Add("bar");
+  builder->Add("box");
+  builder->StartBlock(200);
+  builder->Add("box");
+  builder->StartBlock(300);
+  builder->Add("hello");
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
+      nullptr, table_options_, std::move(block)));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("bar", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("box", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("hello", 100));
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
+  ASSERT_TRUE(!reader->KeyMayMatch("missing", 100));
+  ASSERT_TRUE(!reader->KeyMayMatch("other", 100));
+}
+
+// Spreads keys over several filters. A filter covers 2KB (1 << 11) of block
+// offsets, so offsets 0 and 2000 map to filter 0, 3100 to filter 1, 4100 to
+// filter 2 (nothing added there) and 9000 to filter 4.
+// The builder and reader are held by std::unique_ptr so they are released on
+// every exit path; the reader (declared last) is destroyed before the
+// builder whose Finish() result it was constructed from.
+TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) {
+  std::unique_ptr<FilterBlockBuilder> builder(
+      new BlockBasedFilterBlockBuilder(nullptr, table_options_));
+
+  // First filter
+  builder->StartBlock(0);
+  builder->Add("foo");
+  builder->StartBlock(2000);
+  builder->Add("bar");
+
+  // Second filter
+  builder->StartBlock(3100);
+  builder->Add("box");
+
+  // Third filter is empty
+
+  // Last filter
+  builder->StartBlock(9000);
+  builder->Add("box");
+  builder->Add("hello");
+
+  BlockContents block(builder->Finish(), false, kNoCompression);
+  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
+      nullptr, table_options_, std::move(block)));
+
+  // Check first filter
+  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
+  ASSERT_TRUE(reader->KeyMayMatch("bar", 2000));
+  ASSERT_TRUE(!reader->KeyMayMatch("box", 0));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 0));
+
+  // Check second filter
+  ASSERT_TRUE(reader->KeyMayMatch("box", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100));
+
+  // Check third filter (empty)
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("box", 4100));
+  ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100));
+
+  // Check last filter
+  ASSERT_TRUE(reader->KeyMayMatch("box", 9000));
+  ASSERT_TRUE(reader->KeyMayMatch("hello", 9000));
+  ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000));
+  ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000));
+}
+
+}  // namespace rocksdb
+
+// Test binary entry point: runs every registered test and uses the result as
+// the process exit status. The command-line arguments are not consulted.
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/Show More
+++ b/Show More