Merge branch 'master' of github.com:facebook/rocksdb into gh-pages

2013-11-15 11:34:59 -08:00 · 2013-11-15 11:34:59 -08:00 · 3a0a8a02be
commit 3a0a8a02be
parent 4f873b9c2b 29c931f70b
353 changed files with 157724 additions and 0 deletions
--- a/.arcconfig
+++ b/.arcconfig
@ -0,0 +1,10 @@
+{
+  "project_id" : "leveldb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "",
+  "load" : [
+    "linters/src/"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter"
+}
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,23 @@
+build_config.mk
+
+*.a
+*.arc
+*.d
+*.dylib*
+*.gcda
+*.gcno
+*.o
+*.so
+*.so.*
+*_test
+*_bench
+*_stress
+
+ldb
+manifest_dump
+sst_dump
+util/build_version.cc
+build_tools/VALGRIND_LOGS/
+coverage/COVERAGE_REPORT
+util/build_version.cc.tmp
+.gdbhistory
--- a/35
+++ b/35
@ -0,0 +1,35 @@
+BSD License
+
+For rocksdb software
+
+Copyright (c) 2013, Facebook, Inc.
+All rights reserved.
+---------------------------------------------------------------------
+
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/380
+++ b/380
@ -0,0 +1,380 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+INSTALL_PATH ?= $(CURDIR)
+
+#-----------------------------------------------
+# Uncomment exactly one of the lines labelled (A), (B), and (C) below
+# to switch between compilation modes.
+
+# OPT ?= -DNDEBUG     # (A) Production use (optimized mode)
+OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer
+#-----------------------------------------------
+
+# detect what platform we're building on
+$(shell (export ROCKSDB_ROOT=$(CURDIR); $(CURDIR)/build_tools/build_detect_platform $(CURDIR)/build_config.mk))
+# this file is generated by the previous line to set build flags and sources
+include build_config.mk
+
+WARNING_FLAGS = -Wall -Werror
+CFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += -g $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -std=gnu++0x -Woverloaded-virtual
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+LIBOBJECTS = $(SOURCES:.cc=.o)
+LIBOBJECTS += $(SOURCESCPP:.cpp=.o)
+MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o)
+
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+VALGRIND_ERROR = 2
+VALGRIND_DIR = build_tools/VALGRIND_LOGS
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+
+TESTS = \
+	table_stats_collector_test \
+	arena_test \
+	auto_roll_logger_test \
+	block_test \
+	bloom_test \
+	c_test \
+	cache_test \
+	coding_test \
+	corruption_test \
+	crc32c_test \
+	db_test \
+	dbformat_test \
+	env_test \
+	blob_store_test \
+	filelock_test \
+	filename_test \
+	filter_block_test \
+	histogram_test \
+	log_test \
+	manual_compaction_test \
+	memenv_test \
+	merge_test \
+	redis_test \
+	reduce_levels_test \
+	simple_table_db_test \
+	skiplist_test \
+	stringappend_test \
+	table_test \
+	ttl_test \
+	version_edit_test \
+	version_set_test \
+	write_batch_test\
+	deletefile_test
+
+TOOLS = \
+        sst_dump \
+        db_stress \
+        ldb \
+				db_repl_stress \
+ 				blob_store_bench
+
+PROGRAMS = db_bench signal_test $(TESTS) $(TOOLS)
+BENCHMARKS = db_bench_sqlite3 db_bench_tree_db table_reader_bench
+
+# The library name is configurable since we are maintaining libraries of both
+# debug/release mode.
+LIBNAME = librocksdb
+LIBRARY = ${LIBNAME}.a
+MEMENVLIBRARY = libmemenv.a
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+# Update db.h if you change these.
+SHARED_MAJOR = 2
+SHARED_MINOR = 0
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3)
+$(SHARED1): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED1)
+$(SHARED2): $(SHARED3)
+	ln -fs $(SHARED3) $(SHARED2)
+endif
+
+$(SHARED3):
+	$(CXX) $(LDFLAGS) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $(SHARED3)
+
+endif  # PLATFORM_SHARED_EXT
+
+all: $(LIBRARY) $(PROGRAMS)
+
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
+	release tags valgrind_check whitebox_crash_test
+
+release:
+	$(MAKE) clean
+	OPT=-DNDEBUG $(MAKE) -j32
+
+coverage:
+	$(MAKE) clean
+	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) all check
+	(cd coverage; ./coverage_test.sh)
+	# Delete intermediate files
+	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+
+check: all $(PROGRAMS) $(TESTS) $(TOOLS) ldb_tests
+	for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
+
+ldb_tests: all $(PROGRAMS) $(TOOLS)
+	python tools/ldb_test.py
+
+crash_test: blackbox_crash_test whitebox_crash_test
+
+blackbox_crash_test: db_stress
+	python -u tools/db_crashtest.py
+
+whitebox_crash_test: db_stress
+	python -u tools/db_crashtest2.py
+
+valgrind_check: all $(PROGRAMS) $(TESTS)
+	mkdir -p $(VALGRIND_DIR)
+	echo TESTS THAT HAVE VALGRIND ERRORS > $(VALGRIND_DIR)/valgrind_failed_tests; \
+	echo TIMES in seconds TAKEN BY TESTS ON VALGRIND > $(VALGRIND_DIR)/valgrind_tests_times; \
+	for t in $(filter-out skiplist_test,$(TESTS)); do \
+		stime=`date '+%s'`; \
+		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+		if [ $$? -eq $(VALGRIND_ERROR) ] ; then \
+			echo $$t >> $(VALGRIND_DIR)/valgrind_failed_tests; \
+		fi; \
+		etime=`date '+%s'`; \
+		echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \
+	done
+
+clean:
+	-rm -f $(PROGRAMS) $(BENCHMARKS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk
+	-rm -rf ios-x86/* ios-arm/*
+	-find . -name "*.[od]" -exec rm {} \;
+	-find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+tags:
+	ctags * -R
+	cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+# ---------------------------------------------------------------------------
+# 	Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_bench_sqlite3: doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) doc/bench/db_bench_sqlite3.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lsqlite3 $(COVERAGEFLAGS)
+
+db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
+	$(CXX) doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) -lkyotocabinet $(COVERAGEFLAGS)
+
+signal_test: util/signal_test.o $(LIBOBJECTS)
+	$(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_stats_collector_test: db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/table_stats_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL)
+	$(CXX) util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+$(MEMENVLIBRARY) : $(MEMENVOBJECTS)
+	rm -f $@
+	$(AR) -rs $@ $(MEMENVOBJECTS)
+
+memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS)
+	$(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+rocksdb_shell: tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o tools/shell/ShellContext.h tools/shell/ShellState.h tools/shell/DBClientProxy.h $(LIBOBJECTS)
+	$(CXX) tools/shell/ShellContext.o tools/shell/ShellState.o tools/shell/LeveldbShell.o tools/shell/DBClientProxy.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+DBClientProxy_test: tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY)
+	$(CXX) tools/shell/test/DBClientProxyTest.o tools/shell/DBClientProxy.o $(LIBRARY) $(EXEC_LDFLAGS) $(EXEC_LDFLAGS) -o $@  $(LDFLAGS) $(COVERAGEFLAGS)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+	$(CXX) tools/sst_dump.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+	$(CXX) tools/ldb.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# ---------------------------------------------------------------------------
+#  	Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/versionCFBundleShortVersionString)
+
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(SIMULATORROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@ $(COVERAGEFLAGS)
+	mkdir -p ios-arm/$(dir $@)
+	$(DEVICEROOT)/usr/bin/$(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@ $(COVERAGEFLAGS)
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(SIMULATORROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	$(DEVICEROOT)/usr/bin/$(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+	$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+	$(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#  	Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# The .d file indicates .cc file's dependencies on .h files. We generate such
+# dependency by g++'s -MM option, whose output is a make dependency rule.
+# The sed command makes sure the "target" file in the generated .d file has
+# the correct path prefix.
+%.d: %.cc
+	$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -MM $< -o $@
+	@sed -i -e 's|.*:|$*.o:|' $@
+
+DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
+
+depend: $(DEPFILES)
+
+ifneq ($(MAKECMDGOALS),clean)
+-include $(DEPFILES)
+endif
--- a/23
+++ b/23
@ -0,0 +1,23 @@
+Additional Grant of Patent Rights
+
+“Software” means the rocksdb software distributed by Facebook, Inc.
+
+Facebook hereby grants you a perpetual, worldwide, royalty-free,
+non-exclusive, irrevocable (subject to the termination provision below)
+license under any rights in any patent claims owned by Facebook, to make,
+have made, use, sell, offer to sell, import, and otherwise transfer the
+Software. For avoidance of doubt, no license is granted under Facebook’s
+rights in any patent claims that are infringed by (i) modifications to the
+Software made by you or a third party, or (ii) the Software in combination
+with any software or other technology provided by you or a third party.
+
+The license granted hereunder will terminate, automatically and without
+notice, for anyone that makes any claim (including by filing any lawsuit,
+assertion or other action) alleging (a) direct, indirect, or contributory
+infringement or inducement to infringe any patent: (i) by Facebook or any
+of its subsidiaries or affiliates, whether or not such claim is related
+to the Software, (ii) by any party if such claim arises in whole or in
+part from any software, product or service of Facebook or any of its
+subsidiaries or affiliates, whether or not such claim is related to the
+Software, or (iii) by any party relating to the Software; or (b) that
+any right in any patent claim of Facebook is invalid or unenforceable.
--- a/82
+++ b/82
@ -0,0 +1,82 @@
+rocksdb: A persistent key-value store for flash storage
+Authors: * The Facebook Database Engineering Team
+         * Build on earlier work on leveldb by Sanjay Ghemawat
+           (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key value server, especially suited for storing data on flash drives.
+It has an Log-Stuctured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor(WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor(SAF). It has multi-threaded compactions,
+making it specially suitable for storing multiple terabytes of data in a
+single database.
+
+The core of this code has been derived from open-source leveldb.
+
+The code under this directory implements a system for maintaining a
+persistent key/value store.
+
+See doc/index.html for more explanation.
+See doc/impl.html for a brief overview of the implementation.
+
+The public interface is in include/*.  Callers should not include or
+rely on the details of any other header files in this package.  Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+include/rocksdb/db.h
+    Main interface to the DB: Start here
+
+include/rocksdb/options.h
+    Control over the behavior of an entire database, and also
+    control over the behavior of individual reads and writes.
+
+include/rocksdb/comparator.h
+    Abstraction for user-specified comparison function.  If you want
+    just bytewise comparison of keys, you can use the default comparator,
+    but clients can write their own comparator implementations if they
+    want custom ordering (e.g. to handle different character
+    encodings, etc.)
+
+include/rocksdb/iterator.h
+    Interface for iterating over data. You can get an iterator
+    from a DB object.
+
+include/rocksdb/write_batch.h
+    Interface for atomically applying multiple updates to a database.
+
+include/rocksdb/slice.h
+    A simple module for maintaining a pointer and a length into some
+    other byte array.
+
+include/rocksdb/status.h
+    Status is returned from many of the public interfaces and is used
+    to report success and various kinds of errors.
+
+include/rocksdb/env.h
+    Abstraction of the OS environment.  A posix implementation of
+    this interface is in util/env_posix.cc
+
+include/rocksdb/table_builder.h
+    Lower-level modules that most clients probably won't use directly
+
+include/rocksdb/cache.h
+    An API for the block cache.
+
+include/rocksdb/compaction_filter.h
+    An API for a application filter invoked on every compaction.
+
+include/rocksdb/filter_policy.h
+    An API for configuring a bloom filter.
+
+include/rocksdb/memtablerep.h
+    An API for implementing a memtable.
+
+include/rocksdb/statistics.h
+    An API to retrieve various database statistics.
+
+include/rocksdb/transaction_log.h
+    An API to retrieve transaction logs from a database.
+
+
--- a/README.fb
+++ b/README.fb
@ -0,0 +1,3 @@
+* Detailed instructions on how to compile using fbcode and jemalloc
+
+* Latest release is 2.5.fb
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@ -0,0 +1,252 @@
+#!/bin/sh
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags.  Will contain:
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
+#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
+#       -DSNAPPY                     if the Snappy library is present
+#
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# Default to fbcode gcc on internal fb machines
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+    if [ -z "$USE_CLANG" ]; then
+        source $PWD/build_tools/fbcode.gcc471.sh
+    else
+        source $PWD/build_tools/fbcode.clang31.sh
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f $OUTPUT
+touch $OUTPUT
+
+if test -z "$CC"; then
+   CC=cc
+fi
+
+if test -z "$CXX"; then
+    CXX=g++
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+COMMON_FLAGS="${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_CXXFLAGS="${CXXFLAGS}"
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=true
+
+# generic port files (working on all platform by #ifdef) go directly in /port
+GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+	COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+	PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+$PWD/build_tools/build_detect_version
+
+# We want to make a list of all cc files within util, db, table, and helpers
+# except for the test and benchmark files. By default, find will output a list
+# of all files matching either rule, so we need to append -print to make the
+# prune take effect.
+DIRS="util db table utilities"
+
+set -f # temporarily disable globbing so that our patterns arent expanded
+PRUNE_TEST="-name *test*.cc -prune"
+PRUNE_BENCH="-name *_bench.cc -prune"
+PORTABLE_FILES=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "`
+PORTABLE_CPP=`cd $ROCKSDB_ROOT; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "`
+set +f # re-enable globbing
+
+# The sources consist of the portable files, plus the platform-specific port
+# file.
+echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> $OUTPUT
+echo "SOURCESCPP=$PORTABLE_CPP" >> $OUTPUT
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
+
+if [ "$CROSS_COMPILE" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    true
+else
+    # If -std=c++0x works, use <cstdatomic>.  Otherwise use port_posix.h.
+    $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <cstdatomic>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX -DLEVELDB_CSTDATOMIC_PRESENT"
+        PLATFORM_CXXFLAGS="-std=c++0x"
+    else
+        COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
+    fi
+
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS ${SNAPPY_LDFLAGS:-./snappy/libs/libsnappy.a}"
+    fi
+
+
+    # Test whether gflags library is installed
+    # http://code.google.com/p/gflags/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <gflags/gflags.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS"
+    fi
+
+    # Test whether zlib library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+    fi
+
+    # Test whether bzip library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <bzlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+    fi
+
+    # Test whether tcmalloc is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+    fi
+fi
+
+# shall we use HDFS?
+
+if test "$USE_HDFS"; then
+  if test -z "$JAVA_HOME"; then
+    echo "JAVA_HOME has to be set for HDFS usage."
+    exit 1
+  fi
+  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
+  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
+  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
+fi
+
+# if Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 "
+COMMON_FLAGS="$COMMON_FLAGS $USE_SSE"
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+echo "CC=$CC" >> $OUTPUT
+echo "CXX=$CXX" >> $OUTPUT
+echo "PLATFORM=$PLATFORM" >> $OUTPUT
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
+echo "VALGRIND_VER=$VALGRIND_VER" >> $OUTPUT
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> $OUTPUT
--- a/build_tools/build_detect_version
+++ b/build_tools/build_detect_version
@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in util/version.cc. This source file
+# is then built as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find the version of
+# the source that we used to build the executable file.
+#
+
+# create git version file
+VFILE=$ROCKSDB_ROOT/util/build_version.cc.tmp
+trap "rm $VFILE" EXIT
+
+# check to see if git is in the path
+which git > /dev/null
+
+if [ "$?" = 0 ]; then
+  env -i git rev-parse HEAD |
+  awk '
+  BEGIN {
+    print "#include \"build_version.h\"\n"
+  }
+  { print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:" $0"\";" }
+  ' >  ${VFILE}
+else
+  echo "git not found" |
+  awk '
+    BEGIN {
+      print "#include \"build_version.h\""
+    }
+    { print "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:git not found\";" }
+    ' > ${VFILE}
+fi
+
+echo "const char* rocksdb_build_git_datetime = \"rocksdb_build_git_datetime:$(date)\";" >> ${VFILE}
+echo "const char* rocksdb_build_compile_date = __DATE__;" >> ${VFILE}
+echo "const char* rocksdb_build_compile_time = __TIME__;" >> ${VFILE}
+
+OUTFILE=$ROCKSDB_ROOT/util/build_version.cc
+if [ ! -e $OUTFILE ] || ! cmp -s $VFILE $OUTFILE; then
+    cp $VFILE $OUTFILE
+fi
--- a/build_tools/fbcode.clang31.sh
+++ b/build_tools/fbcode.clang31.sh
@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of libevent
+LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
+LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang $CLANG_INCLUDES"
+CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.1/6ca8a59/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.1/c8f7279/lib/clang/3.1/include -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual"
+CXXFLAGS="$CFLAGS -nostdinc++ -std=gnu++0x"
+
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2"
+EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin"
+
+PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
+SNAPPY_LDFLAGS="$SNAPPY_LIBS"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS
--- a/build_tools/fbcode.gcc471.sh
+++ b/build_tools/fbcode.gcc471.sh
@ -0,0 +1,60 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile leveldb using
+# fbcode settings.  It uses the latest g++ compiler and also
+# uses jemalloc
+
+TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f
+TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native"
+TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1"
+TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3
+
+# location of libhdfs libraries
+if test "$USE_HDFS"; then
+  JAVA_HOME="/usr/local/jdk-6u22-64"
+  JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
+  GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1"
+  HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 "
+  HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib "
+  HDFSLIB+=" -ldl -lverify -ljava -ljvm "
+fi
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include"
+SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include"
+ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a"
+
+# location of libevent
+LIBEVENT_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/include"
+LIBEVENT_LIBS=" -L $TOOLCHAIN_LIB_BASE/libevent/libevent-1.4.14b/91ddd43/lib"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include"
+GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=" -msse -msse4.2 "
+
+CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $LIBEVENT_INCLUDE $GFLAGS_INCLUDE"
+AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
+RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
+
+CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/bin/gold -m64 -mtune=generic -fPIC"
+CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC"
+
+EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a"
+EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $LIBEVENT_LIBS $GFLAGS_LIBS"
+
+PLATFORM_LDFLAGS="-L$TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/lib -L$TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS"
+SNAPPY_LDFLAGS="$SNAPPY_LIBS $ZLIB_LIBS"
+
+VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/"
+
+export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED SNAPPY_LDFLAGS VALGRIND_VER
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@ -0,0 +1,100 @@
+#!/bin/bash
+
+set -e
+
+NUM=10000000
+
+if [ $# -eq 1 ];then
+  DATA_DIR=$1
+elif [ $# -eq 2 ];then
+  DATA_DIR=$1
+  STAT_FILE=$2
+fi
+
+# On the production build servers, set data and stat
+# files/directories not in /tmp or else the tempdir cleaning
+# scripts will make you very unhappy.
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
+STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
+
+function cleanup {
+  rm -rf $DATA_DIR
+  rm -f $STAT_FILE.fillseq
+  rm -f $STAT_FILE.readrandom
+  rm -f $STAT_FILE.overwrite
+}
+
+trap cleanup EXIT
+
+function send_to_ods {
+  key="$1"
+  value="$2"
+
+  if [ -z "$value" ];then
+    echo >&2 "ERROR: Key $key doesn't have a value."
+    return
+  fi
+  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
+    --connect-timeout 60
+}
+
+make clean
+make db_bench -j$(nproc)
+
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 2)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 100)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=128 > ${STAT_FILE}.readrandom
+
+OVERWRITE_OPS=$(awk '/overwrite/ {print $5}' $STAT_FILE.overwrite)
+FILLSEQ_OPS=$(awk '/fillseq/ {print $5}' $STAT_FILE.fillseq)
+READRANDOM_OPS=$(awk '/readrandom/ {print $5}' $STAT_FILE.readrandom)
+
+send_to_ods rocksdb.build.overwrite.qps $OVERWRITE_OPS
+send_to_ods rocksdb.build.fillseq.qps $FILLSEQ_OPS
+send_to_ods rocksdb.build.readrandom.qps $READRANDOM_OPS
--- a/build_tools/valgrind_test.sh
+++ b/build_tools/valgrind_test.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+#A shell script for Jenknis to run valgrind on rocksdb tests
+#Returns 0 on success when there are no failed tests 
+
+VALGRIND_DIR=build_tools/VALGRIND_LOGS
+make clean
+make -j$(nproc) valgrind_check
+NUM_FAILED_TESTS=$((`wc -l $VALGRIND_DIR/valgrind_failed_tests | awk '{print $1}'` - 1))
+if [ $NUM_FAILED_TESTS -lt 1 ]; then
+  echo No tests have valgrind errors
+  exit 0
+else
+  cat $VALGRIND_DIR/valgrind_failed_tests
+  exit 1
+fi
--- a/coverage/coverage_test.sh
+++ b/coverage/coverage_test.sh
@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode.gcc471.sh
+  GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  python $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  python $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Generate the html report. If we cannot find lcov in this machine, we'll simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov || true 2>/dev/null)
+if [ -z $LCOV ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ $LCOV_VERSION ]
+then
+  echo "Not supported lcov version. Expect lcov 1.1."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+     --capture  \
+     --directory $PWD \
+     --gcov-tool $GCOV \
+     --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
--- a/coverage/parse_gcov_output.py
+++ b/coverage/parse_gcov_output.py
@ -0,0 +1,118 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
+# the gcov report follows certain pattern. Each file will have two lines
+# of report, from which we can extract the file name, total lines and coverage
+# percentage.
+def parse_gcov_report(gcov_input):
+    per_file_coverage = {}
+    total_coverage = None
+
+    for line in sys.stdin:
+        line = line.strip()
+
+        # --First line of the coverage report (with file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we reach the last line of report,
+                # which contains the summarized coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line's pattern doesn't fall into the above categories. We
+        # can simply ignore them since they're either empty line or doesn't
+        # find executable lines of the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+def get_option_parser():
+    usage = "Parse the gcov output and generate more human-readable code " +\
+            "coverage report."
+    parser = OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files", "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display " +
+             "the coverage report only for interested source files. " +
+             "Otherwise we will display the coverage report for all " +
+             "source files."
+    )
+    return parser
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    # To print out auto-adjustable column, we need to know the longest
+    # length of file names.
+    max_file_name_length = max(
+        len(fname) for fname in per_file_coverage.keys()
+    )
+
+    # -- Print header
+    # size of separator is determined by 3 column sizes:
+    # file name, coverage percentage and lines.
+    header_template = \
+        "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print header_template % ("Filename", "Coverage", "Lines")
+    print separator
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print record_template % (fname, coverage, lines)
+
+    # -- Print footer
+    if total_coverage:
+        print separator
+        print record_template % ("Total", total_coverage[0], total_coverage[1])
+
+def report_coverage():
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = set(f.strip() for f in options.filenames.split(','))
+
+    # To make things simple, right now we only read gcov report from the input
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname]) for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we only interested in several files, it makes no sense to report
+        # the total_coverage
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        print >> sys.stderr, "Cannot find coverage info for the given files."
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
--- a/db/builder.cc
+++ b/db/builder.cc
@ -0,0 +1,226 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/merge_helper.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/block_based_table_builder.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class TableFactory;
+
+TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                              CompressionType compression_type) {
+  return options.table_factory->GetTableBuilder(options, file,
+                                                compression_type);
+}
+
+Status BuildTable(const std::string& dbname,
+                  Env* env,
+                  const Options& options,
+                  const EnvOptions& soptions,
+                  TableCache* table_cache,
+                  Iterator* iter,
+                  FileMetaData* meta,
+                  const Comparator* user_comparator,
+                  const SequenceNumber newest_snapshot,
+                  const SequenceNumber earliest_seqno_in_memtable,
+                  const bool enable_compression) {
+  Status s;
+  meta->file_size = 0;
+  meta->smallest_seqno = meta->largest_seqno = 0;
+  iter->SeekToFirst();
+
+  // If the sequence number of the smallest entry in the memtable is
+  // smaller than the most recent snapshot, then we do not trigger
+  // removal of duplicate/deleted keys as part of this builder.
+  bool purge = options.purge_redundant_kvs_while_flush;
+  if (earliest_seqno_in_memtable <= newest_snapshot) {
+    purge = false;
+  }
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    unique_ptr<WritableFile> file;
+    s = env->NewWritableFile(fname, &file, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder = GetTableBuilder(options, file.get(),
+                                            options.compression);
+
+    // the first key is the smallest key
+    Slice key = iter->key();
+    meta->smallest.DecodeFrom(key);
+    meta->smallest_seqno = GetInternalKeySeqno(key);
+    meta->largest_seqno = meta->smallest_seqno;
+
+    MergeHelper merge(user_comparator, options.merge_operator.get(),
+                      options.info_log.get(),
+                      true /* internal key corruption is not ok */);
+
+    if (purge) {
+      // Ugly walkaround to avoid compiler error for release build
+      bool ok __attribute__((unused)) = true;
+
+      // Will write to builder if current key != prev key
+      ParsedInternalKey prev_ikey;
+      std::string prev_key;
+      bool is_first_key = true;    // Also write if this is the very first key
+
+      while (iter->Valid()) {
+        bool iterator_at_next = false;
+
+        // Get current key
+        ParsedInternalKey this_ikey;
+        Slice key = iter->key();
+        Slice value = iter->value();
+
+        // In-memory key corruption is not ok;
+        // TODO: find a clean way to treat in memory key corruption
+        ok = ParseInternalKey(key, &this_ikey);
+        assert(ok);
+        assert(this_ikey.sequence >= earliest_seqno_in_memtable);
+
+        // If the key is the same as the previous key (and it is not the
+        // first key), then we skip it, since it is an older version.
+        // Otherwise we output the key and mark it as the "new" previous key.
+        if (!is_first_key && !user_comparator->Compare(prev_ikey.user_key,
+                                                       this_ikey.user_key)) {
+          // seqno within the same key are in decreasing order
+          assert(this_ikey.sequence < prev_ikey.sequence);
+        } else {
+          is_first_key = false;
+
+          if (this_ikey.type == kTypeMerge) {
+            // Handle merge-type keys using the MergeHelper
+            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
+            iterator_at_next = true;
+            if (merge.IsSuccess()) {
+              // Merge completed correctly.
+              // Add the resulting merge key/value and continue to next
+              builder->Add(merge.key(), merge.value());
+              prev_key.assign(merge.key().data(), merge.key().size());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            } else {
+              // Merge did not find a Put/Delete.
+              // Can not compact these merges into a kValueType.
+              // Write them out one-by-one. (Proceed back() to front())
+              const std::deque<std::string>& keys = merge.keys();
+              const std::deque<std::string>& values = merge.values();
+              assert(keys.size() == values.size() && keys.size() >= 1);
+              std::deque<std::string>::const_reverse_iterator key_iter;
+              std::deque<std::string>::const_reverse_iterator value_iter;
+              for (key_iter=keys.rbegin(), value_iter = values.rbegin();
+                   key_iter != keys.rend() && value_iter != values.rend();
+                   ++key_iter, ++value_iter) {
+
+                builder->Add(Slice(*key_iter), Slice(*value_iter));
+              }
+
+              // Sanity check. Both iterators should end at the same time
+              assert(key_iter == keys.rend() && value_iter == values.rend());
+
+              prev_key.assign(keys.front());
+              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+              assert(ok);
+            }
+          } else {
+            // Handle Put/Delete-type keys by simply writing them
+            builder->Add(key, value);
+            prev_key.assign(key.data(), key.size());
+            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
+            assert(ok);
+          }
+        }
+
+        if (!iterator_at_next) iter->Next();
+      }
+
+      // The last key is the largest key
+      meta->largest.DecodeFrom(Slice(prev_key));
+      SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
+      meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+      meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+
+    } else {
+      for (; iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        meta->largest.DecodeFrom(key);
+        builder->Add(key, iter->value());
+        SequenceNumber seqno = GetInternalKeySeqno(key);
+        meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
+        meta->largest_seqno = std::max(meta->largest_seqno, seqno);
+      }
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok() && !options.disableDataSync) {
+      if (options.use_fsync) {
+        StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
+        s = file->Fsync();
+      } else {
+        StopWatch sw(env, options.statistics, TABLE_SYNC_MICROS);
+        s = file->Sync();
+      }
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              soptions,
+                                              meta->number,
+                                              meta->file_size);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    // Keep it
+  } else {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
--- a/db/builder.h
+++ b/db/builder.h
@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "rocksdb/comparator.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+struct EnvOptions;
+class Iterator;
+class TableCache;
+class VersionEdit;
+class TableBuilder;
+class WritableFile;
+
+
+extern TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                     CompressionType compression_type);
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to meta->number.  On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname,
+                         Env* env,
+                         const Options& options,
+                         const EnvOptions& soptions,
+                         TableCache* table_cache,
+                         Iterator* iter,
+                         FileMetaData* meta,
+                         const Comparator* user_comparator,
+                         const SequenceNumber newest_snapshot,
+                         const SequenceNumber earliest_seqno_in_memtable,
+                         const bool enable_compression);
+
+}  // namespace rocksdb
--- a/db/c.cc
+++ b/db/c.cc
@ -0,0 +1,691 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/c.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+
+using rocksdb::Cache;
+using rocksdb::Comparator;
+using rocksdb::CompressionType;
+using rocksdb::DB;
+using rocksdb::Env;
+using rocksdb::FileLock;
+using rocksdb::FilterPolicy;
+using rocksdb::Iterator;
+using rocksdb::Logger;
+using rocksdb::NewBloomFilterPolicy;
+using rocksdb::NewLRUCache;
+using rocksdb::Options;
+using rocksdb::RandomAccessFile;
+using rocksdb::Range;
+using rocksdb::ReadOptions;
+using rocksdb::SequentialFile;
+using rocksdb::Slice;
+using rocksdb::Snapshot;
+using rocksdb::Status;
+using rocksdb::WritableFile;
+using rocksdb::WriteBatch;
+using rocksdb::WriteOptions;
+
+using std::shared_ptr;
+
+extern "C" {
+
+struct leveldb_t              { DB*               rep; };
+struct leveldb_iterator_t     { Iterator*         rep; };
+struct leveldb_writebatch_t   { WriteBatch        rep; };
+struct leveldb_snapshot_t     { const Snapshot*   rep; };
+struct leveldb_readoptions_t  { ReadOptions       rep; };
+struct leveldb_writeoptions_t { WriteOptions      rep; };
+struct leveldb_options_t      { Options           rep; };
+struct leveldb_seqfile_t      { SequentialFile*   rep; };
+struct leveldb_randomfile_t   { RandomAccessFile* rep; };
+struct leveldb_writablefile_t { WritableFile*     rep; };
+struct leveldb_filelock_t     { FileLock*         rep; };
+struct leveldb_logger_t       { shared_ptr<Logger>  rep; };
+struct leveldb_cache_t        { shared_ptr<Cache>   rep; };
+
+struct leveldb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  virtual ~leveldb_comparator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // No-ops since the C binding does not support key shortening methods.
+  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+  virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+struct leveldb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+
+  virtual ~leveldb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+    free(filter);
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+struct leveldb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != NULL);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == NULL) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
+leveldb_t* leveldb_open(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return NULL;
+  }
+  leveldb_t* result = new leveldb_t;
+  result->rep = db;
+  return result;
+}
+
+void leveldb_close(leveldb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void leveldb_put(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void leveldb_delete(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+
+void leveldb_write(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    leveldb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* leveldb_get(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = NULL;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+leveldb_iterator_t* leveldb_create_iterator(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options) {
+  leveldb_iterator_t* result = new leveldb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+const leveldb_snapshot_t* leveldb_create_snapshot(
+    leveldb_t* db) {
+  leveldb_snapshot_t* result = new leveldb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void leveldb_release_snapshot(
+    leveldb_t* db,
+    const leveldb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* leveldb_property_value(
+    leveldb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return NULL;
+  }
+}
+
+void leveldb_approximate_sizes(
+    leveldb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass NULL Slice if corresponding "const char*" is NULL
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
+void leveldb_destroy_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void leveldb_repair_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void leveldb_iter_destroy(leveldb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void leveldb_iter_next(leveldb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void leveldb_iter_prev(leveldb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+leveldb_writebatch_t* leveldb_writebatch_create() {
+  return new leveldb_writebatch_t;
+}
+
+void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
+  delete b;
+}
+
+void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+void leveldb_writebatch_put(
+    leveldb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void leveldb_writebatch_delete(
+    leveldb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void leveldb_writebatch_iterate(
+    leveldb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+leveldb_options_t* leveldb_options_create() {
+  return new leveldb_options_t;
+}
+
+void leveldb_options_destroy(leveldb_options_t* options) {
+  delete options;
+}
+
+void leveldb_options_set_comparator(
+    leveldb_options_t* opt,
+    leveldb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void leveldb_options_set_filter_policy(
+    leveldb_options_t* opt,
+    leveldb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void leveldb_options_set_create_if_missing(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void leveldb_options_set_error_if_exists(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void leveldb_options_set_paranoid_checks(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
+  opt->rep.env = (env ? env->rep : NULL);
+}
+
+void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
+  if (l) {
+    opt->rep.info_log = l->rep;
+  }
+}
+
+void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
+  if (c) {
+    opt->rep.block_cache = c->rep;
+  }
+}
+
+void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void leveldb_options_set_target_file_size_base(
+    leveldb_options_t* opt, uint64_t n) {
+  opt->rep.target_file_size_base = n;
+}
+
+void leveldb_options_set_target_file_size_multiplier(
+    leveldb_options_t* opt, int n) {
+  opt->rep.target_file_size_multiplier = n;
+}
+
+void leveldb_options_set_max_bytes_for_level_base(
+    leveldb_options_t* opt, uint64_t n) {
+  opt->rep.max_bytes_for_level_base = n;
+}
+
+void leveldb_options_set_max_bytes_for_level_multiplier(
+    leveldb_options_t* opt, int n) {
+  opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+void leveldb_options_set_expanded_compaction_factor(
+    leveldb_options_t* opt, int n) {
+  opt->rep.expanded_compaction_factor = n;
+}
+
+void leveldb_options_set_max_grandparent_overlap_factor(
+    leveldb_options_t* opt, int n) {
+  opt->rep.max_grandparent_overlap_factor = n;
+}
+
+void leveldb_options_set_num_levels(leveldb_options_t* opt, int n) {
+  opt->rep.num_levels = n;
+}
+
+void leveldb_options_set_level0_file_num_compaction_trigger(
+    leveldb_options_t* opt, int n) {
+  opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+void leveldb_options_set_level0_slowdown_writes_trigger(
+    leveldb_options_t* opt, int n) {
+  opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+void leveldb_options_set_level0_stop_writes_trigger(
+    leveldb_options_t* opt, int n) {
+  opt->rep.level0_stop_writes_trigger = n;
+}
+
+void leveldb_options_set_max_mem_compaction_level(
+    leveldb_options_t* opt, int n) {
+  opt->rep.max_mem_compaction_level = n;
+}
+
+void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+void leveldb_options_set_compression_per_level(leveldb_options_t* opt,
+                                               int* level_values,
+                                               size_t num_levels) {
+  opt->rep.compression_per_level.resize(num_levels);
+  for (size_t i = 0; i < num_levels; ++i) {
+    opt->rep.compression_per_level[i] =
+      static_cast<CompressionType>(level_values[i]);
+  }
+}
+
+void leveldb_options_set_compression_options(
+    leveldb_options_t* opt, int w_bits, int level, int strategy) {
+  opt->rep.compression_opts.window_bits = w_bits;
+  opt->rep.compression_opts.level = level;
+  opt->rep.compression_opts.strategy = strategy;
+}
+
+void leveldb_options_set_disable_data_sync(
+    leveldb_options_t* opt, bool disable_data_sync) {
+  opt->rep.disableDataSync = disable_data_sync;
+}
+
+void leveldb_options_set_use_fsync(
+    leveldb_options_t* opt, bool use_fsync) {
+  opt->rep.use_fsync = use_fsync;
+}
+
+void leveldb_options_set_db_stats_log_interval(
+    leveldb_options_t* opt, int db_stats_log_interval) {
+  opt->rep.db_stats_log_interval = db_stats_log_interval;
+}
+
+void leveldb_options_set_db_log_dir(
+    leveldb_options_t* opt, const char* db_log_dir) {
+  opt->rep.db_log_dir = db_log_dir;
+}
+
+void leveldb_options_set_WAL_ttl_seconds(leveldb_options_t* opt, uint64_t ttl) {
+  opt->rep.WAL_ttl_seconds = ttl;
+}
+
+void leveldb_options_set_WAL_size_limit_MB(
+    leveldb_options_t* opt, uint64_t limit) {
+  opt->rep.WAL_size_limit_MB = limit;
+}
+
+leveldb_comparator_t* leveldb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  leveldb_comparator_t* result = new leveldb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
+  delete cmp;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->name_ = name;
+  return result;
+}
+
+void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a leveldb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public leveldb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  wrapper->state_ = NULL;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+leveldb_readoptions_t* leveldb_readoptions_create() {
+  return new leveldb_readoptions_t;
+}
+
+void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
+  delete opt;
+}
+
+void leveldb_readoptions_set_verify_checksums(
+    leveldb_readoptions_t* opt,
+    unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+void leveldb_readoptions_set_fill_cache(
+    leveldb_readoptions_t* opt, unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+void leveldb_readoptions_set_snapshot(
+    leveldb_readoptions_t* opt,
+    const leveldb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : NULL);
+}
+
+leveldb_writeoptions_t* leveldb_writeoptions_create() {
+  return new leveldb_writeoptions_t;
+}
+
+void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
+  delete opt;
+}
+
+void leveldb_writeoptions_set_sync(
+    leveldb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
+  leveldb_cache_t* c = new leveldb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+void leveldb_cache_destroy(leveldb_cache_t* cache) {
+  delete cache;
+}
+
+leveldb_env_t* leveldb_create_default_env() {
+  leveldb_env_t* result = new leveldb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+void leveldb_env_destroy(leveldb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+}  // end extern "C"
--- a/db/c_test.c
+++ b/db/c_test.c
@ -0,0 +1,390 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "rocksdb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null"));
+    abort();
+  }
+}
+
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+static void CheckGet(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckIter(leveldb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = leveldb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = leveldb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+int main(int argc, char** argv) {
+  leveldb_t* db;
+  leveldb_comparator_t* cmp;
+  leveldb_cache_t* cache;
+  leveldb_env_t* env;
+  leveldb_options_t* options;
+  leveldb_readoptions_t* roptions;
+  leveldb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname),
+           "%s/leveldb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = leveldb_create_default_env();
+  cache = leveldb_cache_create_lru(100000);
+
+  options = leveldb_options_create();
+  leveldb_options_set_comparator(options, cmp);
+  leveldb_options_set_error_if_exists(options, 1);
+  leveldb_options_set_cache(options, cache);
+  leveldb_options_set_env(options, env);
+  leveldb_options_set_info_log(options, NULL);
+  leveldb_options_set_write_buffer_size(options, 100000);
+  leveldb_options_set_paranoid_checks(options, 1);
+  leveldb_options_set_max_open_files(options, 10);
+  leveldb_options_set_block_size(options, 1024);
+  leveldb_options_set_block_restart_interval(options, 8);
+  leveldb_options_set_compression(options, leveldb_no_compression);
+  leveldb_options_set_compression_options(options, -14, -1, 0);
+  int compression_levels[] = {leveldb_no_compression, leveldb_no_compression,
+                              leveldb_no_compression, leveldb_no_compression};
+  leveldb_options_set_compression_per_level(options, compression_levels, 4);
+
+  roptions = leveldb_readoptions_create();
+  leveldb_readoptions_set_verify_checksums(roptions, 1);
+  leveldb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = leveldb_writeoptions_create();
+  leveldb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  leveldb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  db = leveldb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  leveldb_options_set_create_if_missing(options, 1);
+  db = leveldb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  leveldb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  leveldb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    leveldb_writebatch_t* wb = leveldb_writebatch_create();
+    leveldb_writebatch_put(wb, "foo", 3, "a", 1);
+    leveldb_writebatch_clear(wb);
+    leveldb_writebatch_put(wb, "bar", 3, "b", 1);
+    leveldb_writebatch_put(wb, "box", 3, "c", 1);
+    leveldb_writebatch_delete(wb, "bar", 3);
+    leveldb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    leveldb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
+    CheckCondition(!leveldb_iter_valid(iter));
+    leveldb_iter_seek_to_first(iter);
+    CheckCondition(leveldb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    leveldb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    leveldb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    leveldb_iter_prev(iter);
+    CheckCondition(!leveldb_iter_valid(iter));
+    leveldb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    leveldb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    leveldb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    leveldb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    leveldb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = leveldb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = leveldb_property_value(db, "rocksdb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const leveldb_snapshot_t* snap;
+    snap = leveldb_create_snapshot(db);
+    leveldb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    leveldb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    leveldb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    leveldb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    // If we do not compact here, then the lazy deletion of
+    // files (https://reviews.facebook.net/D6123) would leave
+    // around deleted files and the repair process will find
+    // those files and put them back into the database.
+    leveldb_compact_range(db, NULL, 0, NULL, 0);
+    leveldb_close(db);
+    leveldb_options_set_create_if_missing(options, 0);
+    leveldb_options_set_error_if_exists(options, 0);
+    leveldb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = leveldb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    leveldb_options_set_create_if_missing(options, 1);
+    leveldb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    leveldb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = leveldb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+    } else {
+      policy = leveldb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    leveldb_close(db);
+    leveldb_destroy_db(options, dbname, &err);
+    leveldb_options_set_filter_policy(options, policy);
+    db = leveldb_open(options, dbname, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    leveldb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    if (phase == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    leveldb_options_set_filter_policy(options, NULL);
+    leveldb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  leveldb_close(db);
+  leveldb_options_destroy(options);
+  leveldb_readoptions_destroy(roptions);
+  leveldb_writeoptions_destroy(woptions);
+  leveldb_cache_destroy(cache);
+  leveldb_comparator_destroy(cmp);
+  leveldb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@ -0,0 +1,378 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  std::string dbname_;
+  shared_ptr<Cache> tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = nullptr;
+    options_.create_if_missing = true;
+    options_.block_size_deviation = 0; // make unit test pass for now
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    opt.block_size_deviation = 0;
+    opt.arena_block_size = 4096;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void RepairDB() {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Check(int min_expected, int max_expected) {
+    unsigned int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (unsigned int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type == filetype &&
+          int(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
+        offset = 0;
+      } else {
+        offset = sbuf.st_size + offset;
+      }
+    }
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+TEST(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  Reopen();
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  bool failed = false;
+  for (int i = 0; i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+    if (!s.ok()) {
+      failed = true;
+    }
+    ASSERT_TRUE(!failed || !s.ok());
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+  dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+
+  Corrupt(kTableFile, -2000, 500);
+  Reopen();
+  Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  const int last = dbi->MaxMemCompactionLevel();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 1048576;
+  Reopen(&options);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill levels >= 1 so memtable compaction outputs to level 1
+  for (int level = 1; level < dbi->NumberLevels(); level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
+    dbi->TEST_FlushMemTable();
+  }
+
+  Build(10);
+  dbi->TEST_FlushMemTable();
+  dbi->TEST_WaitForCompact();
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  bool failed = false;
+  for (int i = 0; i < 10000 && s.ok(); i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+    if (!s.ok()) {
+      failed = true;
+    }
+    // if one write failed, every subsequent write must fail, too
+    ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_FlushMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_FlushMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@ -0,0 +1,100 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <algorithm>
+#include <string>
+#include <stdint.h>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+Status DBImpl::DisableFileDeletions() {
+  MutexLock l(&mutex_);
+  disable_delete_obsolete_files_ = true;
+  Log(options_.info_log, "File Deletions Disabled");
+  return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions() {
+  DeletionState deletion_state;
+  {
+    MutexLock l(&mutex_);
+    disable_delete_obsolete_files_ = false;
+    Log(options_.info_log, "File Deletions Enabled");
+    FindObsoleteFiles(deletion_state, true);
+  }
+  PurgeObsoleteFiles(deletion_state);
+  LogFlush(options_.info_log);
+  return Status::OK();
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+                            uint64_t* manifest_file_size,
+                            bool flush_memtable) {
+
+  *manifest_file_size = 0;
+
+  if (flush_memtable) {
+    // flush all dirty data to disk.
+    Status status =  Flush(FlushOptions());
+    if (!status.ok()) {
+      Log(options_.info_log, "Cannot Flush data %s\n",
+          status.ToString().c_str());
+      return status;
+    }
+  }
+
+  MutexLock l(&mutex_);
+
+  // Make a set of all of the live *.sst files
+  std::set<uint64_t> live;
+  versions_->AddLiveFilesCurrentVersion(&live);
+
+  ret.clear();
+  ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST
+
+  // create names of the live files. The names are not absolute
+  // paths, instead they are relative to dbname_;
+  for (auto live_file : live) {
+    ret.push_back(TableFileName("", live_file));
+  }
+
+  ret.push_back(CurrentFileName(""));
+  ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber()));
+
+  // find length of manifest file while holding the mutex lock
+  *manifest_file_size = versions_->ManifestFileSize();
+
+  return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+  // First get sorted files in archive dir, then append sorted files from main
+  // dir to maintain sorted order
+
+  // list wal files in archive dir.
+  Status s;
+  std::string archivedir = ArchivalDirectory(options_.wal_dir);
+  if (env_->FileExists(archivedir)) {
+    s = AppendSortedWalsOfType(archivedir, files, kArchivedLogFile);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  // list wal files in main db dir.
+  return AppendSortedWalsOfType(options_.wal_dir, files, kAliveLogFile);
+}
+
+}
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -0,0 +1,497 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <atomic>
+#include <deque>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "db/version_edit.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+#include "memtablelist.h"
+
+namespace rocksdb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Merge(const WriteOptions&, const Slice& key,
+                       const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values);
+
+  // Returns false if key doesn't exist in the database and true if it may.
+  // If value_found is not passed in as null, then return the value if found in
+  // memory. On return, if value was found, then value_found will be set to true
+  // , otherwise false.
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, std::string* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false, int target_level = -1);
+  virtual int NumberLevels();
+  virtual int MaxMemCompactionLevel();
+  virtual int Level0StopWriteTrigger();
+  virtual Status Flush(const FlushOptions& options);
+  virtual Status DisableFileDeletions();
+  virtual Status EnableFileDeletions();
+  // All the returned filenames start with "/"
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true);
+  virtual Status GetSortedWalFiles(VectorLogPtr& files);
+  virtual SequenceNumber GetLatestSequenceNumber() const;
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter);
+  virtual Status DeleteFile(std::string name);
+
+  virtual void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [*begin, *end]
+  void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
+
+  // Force current memtable contents to be flushed.
+  Status TEST_FlushMemTable();
+
+  // Wait for memtable compaction
+  Status TEST_WaitForFlushMemTable();
+
+  // Wait for any compaction
+  Status TEST_WaitForCompact();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes();
+
+  // Simulate a db crash, no elegant closing of database.
+  void TEST_Destroy_DBImpl();
+
+  // Return the current manifest file no.
+  uint64_t TEST_Current_Manifest_FileNo();
+
+  // Trigger's a background call for testing.
+  void TEST_PurgeObsoleteteWAL();
+
+  // get total level0 file size. Only for testing.
+  uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);}
+
+  void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
+  {
+    default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL;
+  }
+
+  // needed for CleanupIteratorState
+
+  struct DeletionState {
+    inline bool HaveSomethingToDelete() const {
+      return all_files.size() ||
+        sst_delete_files.size() ||
+        log_delete_files.size();
+    }
+    // a list of all files that we'll consider deleting
+    // (every once in a while this is filled up with all files
+    // in the DB directory)
+    std::vector<std::string> all_files;
+
+    // the list of all live sst files that cannot be deleted
+    std::vector<uint64_t> sst_live;
+
+    // a list of sst files that we need to delete
+    std::vector<FileMetaData*> sst_delete_files;
+
+    // a list of log files that we need to delete
+    std::vector<uint64_t> log_delete_files;
+
+    // the current manifest_file_number, log_number and prev_log_number
+    // that corresponds to the set of files in 'live'.
+    uint64_t manifest_file_number, log_number, prev_log_number;
+
+    DeletionState() {
+      manifest_file_number = 0;
+      log_number = 0;
+      prev_log_number = 0;
+    }
+  };
+
+  // Returns the list of live files in 'live' and the list
+  // of all files in the filesystem in 'all_files'.
+  // If force == false and the last call was less than
+  // options_.delete_obsolete_files_period_micros microseconds ago,
+  // it will not fill up the deletion_state
+  void FindObsoleteFiles(DeletionState& deletion_state,
+                         bool force,
+                         bool no_full_scan = false);
+
+  // Diffs the files listed in filenames and those that do not
+  // belong to live files are posibly removed. Also, removes all the
+  // files in sst_delete_files and log_delete_files.
+  // It is not necessary to hold the mutex when invoking this method.
+  void PurgeObsoleteFiles(DeletionState& deletion_state);
+
+ protected:
+  Env* const env_;
+  const std::string dbname_;
+  unique_ptr<VersionSet> versions_;
+  const InternalKeyComparator internal_comparator_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+
+  MemTable* GetMemTable() {
+    return mem_;
+  }
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+ private:
+  friend class DB;
+  struct CompactionState;
+  struct Writer;
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit, MemTable* external_table = nullptr,
+      bool error_if_log_file_exist = false);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  const Status CreateArchivalDirectory();
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Flush the in-memory write buffer to storage.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status FlushMemTableToOutputFile(bool* madeProgress,
+                                   DeletionState& deletion_state);
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence,
+                        MemTable* external_table);
+
+  // The following two methods are used to flush a memtable to
+  // storage. The first one is used atdatabase RecoveryTime (when the
+  // database is opened) and is heavyweight because it holds the mutex
+  // for the entire period. The second method WriteLevel0Table supports
+  // concurrent flush memtables to storage.
+  Status WriteLevel0TableForRecovery(MemTable* mem, VersionEdit* edit);
+  Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
+                                uint64_t* filenumber);
+
+  uint64_t SlowdownAmount(int n, int top, int bottom);
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
+  WriteBatch* BuildBatchGroup(Writer** last_writer);
+
+  // Force current memtable contents to be flushed.
+  Status FlushMemTable(const FlushOptions& options);
+
+  // Wait for memtable flushed
+  Status WaitForFlushMemTable();
+
+  void MaybeScheduleLogDBDeployStats();
+  static void BGLogDBDeployStats(void* db);
+  void LogDBDeployStats();
+
+  void MaybeScheduleFlushOrCompaction();
+  static void BGWorkCompaction(void* db);
+  static void BGWorkFlush(void* db);
+  void BackgroundCallCompaction();
+  void BackgroundCallFlush();
+  Status BackgroundCompaction(bool* madeProgress,DeletionState& deletion_state);
+  Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state);
+  void CleanupCompaction(CompactionState* compact, Status status);
+  Status DoCompactionWork(CompactionState* compact,
+                          DeletionState& deletion_state);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+  void AllocateCompactionOutputFileNumbers(CompactionState* compact);
+  void ReleaseCompactionUnusedFileNumbers(CompactionState* compact);
+
+  void PurgeObsoleteWALFiles();
+
+  Status AppendSortedWalsOfType(const std::string& path,
+                                VectorLogPtr& log_files,
+                                WalFileType type);
+
+  // Requires: all_logs should be sorted with earliest log file first
+  // Retains all log files in all_logs which contain updates with seq no.
+  // Greater Than or Equal to the requested SequenceNumber.
+  Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+                                const SequenceNumber target);
+  //  return true if
+  bool CheckWalFileExistsAndEmpty(const WalFileType type,
+                                  const uint64_t number);
+
+  Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+                         WriteBatch* const result);
+
+  Status ReadFirstLine(const std::string& fname, WriteBatch* const batch);
+
+  void PrintStatistics();
+
+  // dump rocksdb.stats to LOG
+  void MaybeDumpStats();
+
+  // Return the minimum empty level that could hold the total data in the
+  // input level. Return the input level, if such level could not be found.
+  int FindMinimumEmptyLevelFitting(int level);
+
+  // Move the files in the input level to the target level.
+  // If target_level < 0, automatically calculate the minimum level that could
+  // hold the data set.
+  void ReFitLevel(int level, int target_level = -1);
+
+  // Constant after construction
+  const InternalFilterPolicy internal_filter_policy_;
+  bool owns_info_log_;
+
+  // table_cache_ provides its own synchronization
+  unique_ptr<TableCache> table_cache_;
+
+  // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when background work finishes
+  std::shared_ptr<MemTableRepFactory> mem_rep_factory_;
+  MemTable* mem_;
+  MemTableList imm_;             // Memtable that are not changing
+  uint64_t logfile_number_;
+  unique_ptr<log::Writer> log_;
+
+  std::string host_name_;
+
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+  WriteBatch tmp_batch_;
+
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // count how many background compaction been scheduled or is running?
+  int bg_compaction_scheduled_;
+
+  // number of background memtable flush jobs, submitted to the HIGH pool
+  int bg_flush_scheduled_;
+
+  // Has a background stats log thread scheduled?
+  bool bg_logstats_scheduled_;
+
+  // Information for a manual compaction
+  struct ManualCompaction {
+    int level;
+    bool done;
+    bool in_progress;           // compaction request being processed?
+    const InternalKey* begin;   // nullptr means beginning of key range
+    const InternalKey* end;     // nullptr means end of key range
+    InternalKey tmp_storage;    // Used to keep track of compaction progress
+  };
+  ManualCompaction* manual_compaction_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  std::unique_ptr<StatsLogger> logger_;
+
+  int64_t volatile last_log_ts;
+
+  // shall we disable deletion of obsolete files
+  bool disable_delete_obsolete_files_;
+
+  // last time when DeleteObsoleteFiles was invoked
+  uint64_t delete_obsolete_files_last_run_;
+
+  // last time when PurgeObsoleteWALFiles ran.
+  uint64_t purge_wal_files_last_run_;
+
+  // last time stats were dumped to LOG
+  std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+  // obsolete files will be deleted every this seconds if ttl deletion is
+  // enabled and archive size_limit is disabled.
+  uint64_t default_interval_to_delete_obsolete_WAL_;
+
+  // These count the number of microseconds for which MakeRoomForWrite stalls.
+  uint64_t stall_level0_slowdown_;
+  uint64_t stall_memtable_compaction_;
+  uint64_t stall_level0_num_files_;
+  std::vector<uint64_t> stall_leveln_slowdown_;
+  uint64_t stall_level0_slowdown_count_;
+  uint64_t stall_memtable_compaction_count_;
+  uint64_t stall_level0_num_files_count_;
+  std::vector<uint64_t> stall_leveln_slowdown_count_;
+
+  // Time at which this instance was started.
+  const uint64_t started_at_;
+
+  bool flush_on_destroy_; // Used when disableWAL is true.
+
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    uint64_t micros;
+
+    // Bytes read from level N during compaction between levels N and N+1
+    int64_t bytes_readn;
+
+    // Bytes read from level N+1 during compaction between levels N and N+1
+    int64_t bytes_readnp1;
+
+    // Total bytes written during compaction between levels N and N+1
+    int64_t bytes_written;
+
+    // Files read from level N during compaction between levels N and N+1
+    int     files_in_leveln;
+
+    // Files read from level N+1 during compaction between levels N and N+1
+    int     files_in_levelnp1;
+
+    // Files written during compaction between levels N and N+1
+    int     files_out_levelnp1;
+
+    // Number of compactions done
+    int     count;
+
+    CompactionStats() : micros(0), bytes_readn(0), bytes_readnp1(0),
+                        bytes_written(0), files_in_leveln(0),
+                        files_in_levelnp1(0), files_out_levelnp1(0),
+                        count(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_readn += c.bytes_readn;
+      this->bytes_readnp1 += c.bytes_readnp1;
+      this->bytes_written += c.bytes_written;
+      this->files_in_leveln += c.files_in_leveln;
+      this->files_in_levelnp1 += c.files_in_levelnp1;
+      this->files_out_levelnp1 += c.files_out_levelnp1;
+      this->count += 1;
+    }
+  };
+
+  std::vector<CompactionStats> stats_;
+
+  // Used to compute per-interval statistics
+  struct StatsSnapshot {
+    uint64_t bytes_read_;
+    uint64_t bytes_written_;
+    uint64_t bytes_new_;
+    double   seconds_up_;
+
+    StatsSnapshot() : bytes_read_(0), bytes_written_(0),
+                      bytes_new_(0), seconds_up_(0) {}
+  };
+
+  StatsSnapshot last_stats_;
+
+  static const int KEEP_LOG_FILE_NUM = 1000;
+  std::string db_absolute_path_;
+
+  // count of the number of contiguous delaying writes
+  int delayed_writes_;
+
+  // The options to access storage files
+  const EnvOptions storage_options_;
+
+  // A value of true temporarily disables scheduling of background work
+  bool bg_work_gate_closed_;
+
+  // Guard against multiple concurrent refitting
+  bool refitting_level_;
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  // dump the delayed_writes_ to the log file and reset counter.
+  void DelayLoggingAndReset();
+
+  // Return the earliest snapshot where seqno is visible.
+  // Store the snapshot right before that, if any, in prev_snapshot
+  inline SequenceNumber findEarliestVisibleSnapshot(
+    SequenceNumber in,
+    std::vector<SequenceNumber>& snapshots,
+    SequenceNumber* prev_snapshot);
+
+  // Function that Get and KeyMayExist call with no_io true or false
+  // Note: 'value_found' from KeyMayExist propagates here
+  Status GetImpl(const ReadOptions& options,
+                 const Slice& key,
+                 std::string* value,
+                 bool* value_found = nullptr);
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
+                               const Options& src);
+
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const Options& options, int level,
+                                   const bool enable_compression);
+
+}  // namespace rocksdb
--- a/db/db_impl_readonly.cc
+++ b/db/db_impl_readonly.cc
@ -0,0 +1,99 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/db_impl_readonly.h"
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/build_version.h"
+
+namespace rocksdb {
+
+DBImplReadOnly::DBImplReadOnly(const Options& options,
+    const std::string& dbname)
+    : DBImpl(options, dbname) {
+  Log(options_.info_log, "Opening the db in read only mode");
+}
+
+DBImplReadOnly::~DBImplReadOnly() {
+}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  Status s;
+  MemTable* mem = GetMemTable();
+  Version* current = versions_->current();
+  SequenceNumber snapshot = versions_->LastSequence();
+  std::deque<std::string> merge_operands;
+  LookupKey lkey(key, snapshot);
+  if (mem->Get(lkey, value, &s, &merge_operands, options_)) {
+  } else {
+    Version::GetStats stats;
+    current->Get(options, lkey, value, &s, &merge_operands, &stats, options_);
+  }
+  return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  return NewDBIterator(
+    &dbname_, env_, options_,  user_comparator(),internal_iter,
+      (options.snapshot != nullptr
+      ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+      : latest_snapshot));
+}
+
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+                DB** dbptr, bool error_if_log_file_exist) {
+  *dbptr = nullptr;
+
+  DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
+  impl->mutex_.Lock();
+  VersionEdit edit(impl->NumberLevels());
+  Status s = impl->Recover(&edit, impl->GetMemTable(),
+                           error_if_log_file_exist);
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    delete impl;
+  }
+  return s;
+}
+
+}
--- a/db/db_impl_readonly.h
+++ b/db/db_impl_readonly.h
@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#pragma once
+#include "db/db_impl.h"
+
+#include <deque>
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/stats_logger.h"
+
+namespace rocksdb {
+
+class DBImplReadOnly : public DBImpl {
+public:
+  DBImplReadOnly(const Options& options, const std::string& dbname);
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ virtual Status Get(const ReadOptions& options,
+                    const Slice& key,
+                    std::string* value);
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ virtual Iterator* NewIterator(const ReadOptions&);
+
+ virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Merge(const WriteOptions&, const Slice& key,
+                      const Slice& value) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Delete(const WriteOptions&, const Slice& key) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual void CompactRange(const Slice* begin, const Slice* end,
+                           bool reduce_level = false, int target_level = -1) {
+ }
+ virtual Status DisableFileDeletions() {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status EnableFileDeletions() {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+                             uint64_t* manifest_file_size,
+                             bool flush_memtable = true) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Flush(const FlushOptions& options) {
+   return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&);
+ void operator=(const DBImplReadOnly&);
+};
+
+}
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@ -0,0 +1,481 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+#include <stdexcept>
+#include <deque>
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  // The following is grossly complicated. TODO: clean it up
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward, the internal iterator is positioned at
+  //     the exact entry that yields this->key(), this->value()
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  DBIter(const std::string* dbname, Env* env, const Options& options,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s)
+      : dbname_(dbname),
+        env_(env),
+        logger_(options.info_log),
+        user_comparator_(cmp),
+        user_merge_operator_(options.merge_operator.get()),
+        iter_(iter),
+        sequence_(s),
+        direction_(kForward),
+        valid_(false),
+        current_entry_is_merged_(false),
+        statistics_(options.statistics) {
+    RecordTick(statistics_, NO_ITERATORS, 1);
+    max_skip_ = options.max_sequential_skip_in_iterations;
+  }
+  virtual ~DBIter() {
+    RecordTick(statistics_, NO_ITERATORS, -1);
+    delete iter_;
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return saved_key_;
+  }
+  virtual Slice value() const {
+    assert(valid_);
+    return (direction_ == kForward && !current_entry_is_merged_) ?
+      iter_->value() : saved_value_;
+  }
+  virtual Status status() const {
+    if (status_.ok()) {
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  virtual void Next();
+  virtual void Prev();
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+
+ private:
+  void FindNextUserEntry(bool skipping);
+  void FindPrevUserEntry();
+  bool ParseKey(ParsedInternalKey* key);
+  void MergeValuesNewToOld();
+
+  inline void SaveKey(const Slice& k, std::string* dst) {
+    dst->assign(k.data(), k.size());
+  }
+
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  const std::string* const dbname_;
+  Env* const env_;
+  shared_ptr<Logger> logger_;
+  const Comparator* const user_comparator_;
+  const MergeOperator* const user_merge_operator_;
+  Iterator* const iter_;
+  SequenceNumber const sequence_;
+
+  Status status_;
+  std::string saved_key_;     // == current key when direction_==kReverse
+  std::string saved_value_;   // == current raw value when direction_==kReverse
+  std::string skip_key_;
+  Direction direction_;
+  bool valid_;
+  bool current_entry_is_merged_;
+  std::shared_ptr<Statistics> statistics_;
+  uint64_t max_skip_;
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    Log(logger_, "corrupted internal key in DBIter: %s",
+        iter_->key().ToString(true).c_str());
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ == kReverse) {  // Switch directions?
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (!iter_->Valid()) {
+      iter_->SeekToFirst();
+    } else {
+      iter_->Next();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.clear();
+      return;
+    }
+  }
+
+  // If the current value is merged, we might already hit end of iter_
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(true /* skipping the current user key */);
+}
+
+
+// PRE: saved_key_ has the current user key if skipping
+// POST: saved_key_ should have the next user key if valid_,
+//       if the current entry is a result of merge
+//           current_entry_is_merged_ => true
+//           saved_value_             => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+//       a delete marker
+void DBIter::FindNextUserEntry(bool skipping) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_->Valid());
+  assert(direction_ == kForward);
+  current_entry_is_merged_ = false;
+  uint64_t num_skipped = 0;
+  do {
+    ParsedInternalKey ikey;
+    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      if (skipping &&
+          user_comparator_->Compare(ikey.user_key, saved_key_) <= 0) {
+        num_skipped++; // skip this entry
+        BumpPerfCount(&perf_context.internal_key_skipped_count);
+      } else {
+        skipping = false;
+        switch (ikey.type) {
+          case kTypeDeletion:
+            // Arrange to skip all upcoming entries for this key since
+            // they are hidden by this deletion.
+            SaveKey(ikey.user_key, &saved_key_);
+            skipping = true;
+            num_skipped = 0;
+            BumpPerfCount(&perf_context.internal_delete_skipped_count);
+            break;
+          case kTypeValue:
+            valid_ = true;
+            SaveKey(ikey.user_key, &saved_key_);
+            return;
+          case kTypeMerge:
+            // By now, we are sure the current ikey is going to yield a value
+            SaveKey(ikey.user_key, &saved_key_);
+            current_entry_is_merged_ = true;
+            valid_ = true;
+            MergeValuesNewToOld();  // Go to a different state machine
+            return;
+          case kTypeLogData:
+            assert(false);
+            break;
+        }
+      }
+    }
+    // If we have sequentially iterated via numerous keys and still not
+    // found the next user-key, then it is better to seek so that we can
+    // avoid too many key comparisons. We seek to the last occurence of
+    // our current key by looking for sequence number 0.
+    if (skipping && num_skipped > max_skip_) {
+      num_skipped = 0;
+      std::string last_key;
+      AppendInternalKey(&last_key,
+        ParsedInternalKey(Slice(saved_key_), 0, kValueTypeForSeek));
+      iter_->Seek(last_key);
+      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+    } else {
+      iter_->Next();
+    }
+  } while (iter_->Valid());
+  valid_ = false;
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_->key() points to the first merge type entry
+//      saved_key_ stores the user key
+// POST: saved_value_ has the merged value for the user key
+//       iter_ points to the next entry (or invalid)
+void DBIter::MergeValuesNewToOld() {
+  if (!user_merge_operator_) {
+    Log(logger_, "Options::merge_operator is null.");
+    throw std::logic_error("DBIter::MergeValuesNewToOld() with"
+                           " Options::merge_operator null");
+  }
+
+  // Start the merge process by pushing the first operand
+  std::deque<std::string> operands;
+  operands.push_front(iter_->value().ToString());
+
+  std::string merge_result;   // Temporary string to hold merge result later
+  ParsedInternalKey ikey;
+  for (iter_->Next(); iter_->Valid(); iter_->Next()) {
+    if (!ParseKey(&ikey)) {
+      // skip corrupted key
+      continue;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, saved_key_) != 0) {
+      // hit the next user key, stop right here
+      break;
+    }
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete with the same user key, stop right here
+      // iter_ is positioned after delete
+      iter_->Next();
+      break;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put, merge the put value with operands and store the
+      // final result in saved_value_. We are done!
+      // ignore corruption if there is any.
+      const Slice value = iter_->value();
+      user_merge_operator_->FullMerge(ikey.user_key, &value, operands,
+                                      &saved_value_, logger_.get());
+      // iter_ is positioned after put
+      iter_->Next();
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge, add the value as an operand and run associative merge.
+      // when complete, add result to operands and continue.
+      const Slice& value = iter_->value();
+      operands.push_front(value.ToString());
+      while(operands.size() >= 2) {
+        // Call user associative-merge until it returns false
+        if (user_merge_operator_->PartialMerge(ikey.user_key,
+                                               Slice(operands[0]),
+                                               Slice(operands[1]),
+                                               &merge_result,
+                                               logger_.get())) {
+          operands.pop_front();
+          swap(operands.front(), merge_result);
+        } else {
+          // Associative merge returns false ==> stack the operands
+          break;
+        }
+      }
+
+    }
+  }
+
+  // we either exhausted all internal keys under this user key, or hit
+  // a deletion marker.
+  // feed null as the existing value to the merge operator, such that
+  // client can differentiate this scenario and do things accordingly.
+  user_merge_operator_->FullMerge(saved_key_, nullptr, operands,
+                                  &saved_value_, logger_.get());
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+
+  // Throw an exception now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "Prev not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::Prev backward iteration not supported"
+                           " if merge_operator is provided");
+  }
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+    while (true) {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        valid_ = false;
+        saved_key_.clear();
+        ClearSavedValue();
+        return;
+      }
+      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                    saved_key_) < 0) {
+        break;
+      }
+    }
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+void DBIter::FindPrevUserEntry() {
+  assert(direction_ == kReverse);
+  uint64_t num_skipped = 0;
+
+  ValueType value_type = kTypeDeletion;
+  if (iter_->Valid()) {
+    do {
+      ParsedInternalKey ikey;
+      bool saved_key_cleared = false;
+      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+        if ((value_type != kTypeDeletion) &&
+            user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
+          // We encountered a non-deleted value in entries for previous keys,
+          break;
+        }
+        value_type = ikey.type;
+        if (value_type == kTypeDeletion) {
+          saved_key_.clear();
+          ClearSavedValue();
+          saved_key_cleared = true;
+        } else {
+          Slice raw_value = iter_->value();
+          if (saved_value_.capacity() > raw_value.size() + 1048576) {
+            std::string empty;
+            swap(empty, saved_value_);
+          }
+          SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+          saved_value_.assign(raw_value.data(), raw_value.size());
+        }
+      }
+      num_skipped++;
+      // If we have sequentially iterated via numerous keys and still not
+      // found the prev user-key, then it is better to seek so that we can
+      // avoid too many key comparisons. We seek to the first occurence of
+      // our current key by looking for max sequence number.
+      if (!saved_key_cleared && num_skipped > max_skip_) {
+        num_skipped = 0;
+        std::string last_key;
+        AppendInternalKey(&last_key,
+          ParsedInternalKey(Slice(saved_key_), kMaxSequenceNumber,
+                            kValueTypeForSeek));
+        iter_->Seek(last_key);
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        iter_->Prev();
+      }
+    } while (iter_->Valid());
+  }
+
+  if (value_type == kTypeDeletion) {
+    // End
+    valid_ = false;
+    saved_key_.clear();
+    ClearSavedValue();
+    direction_ = kForward;
+  } else {
+    valid_ = true;
+  }
+}
+
+void DBIter::Seek(const Slice& target) {
+  direction_ = kForward;
+  ClearSavedValue();
+  saved_key_.clear();
+  AppendInternalKey(
+      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+  iter_->Seek(saved_key_);
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /*not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  iter_->SeekToFirst();
+  if (iter_->Valid()) {
+    FindNextUserEntry(false /* not skipping */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToLast() {
+  // Throw an exception for now if merge_operator is provided
+  // TODO: support backward iteration
+  if (user_merge_operator_) {
+    Log(logger_, "SeekToLast not supported yet if merge_operator is provided");
+    throw std::logic_error("DBIter::SeekToLast: backward iteration not"
+                           " supported if merge_operator is provided");
+  }
+
+  direction_ = kReverse;
+  ClearSavedValue();
+  iter_->SeekToLast();
+  FindPrevUserEntry();
+}
+
+}  // anonymous namespace
+
+Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, options, user_key_comparator,
+                    internal_iter, sequence);
+}
+
+}  // namespace rocksdb
--- a/db/db_iter.h
+++ b/db/db_iter.h
@ -0,0 +1,28 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Options& options,
+    const Comparator *user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}  // namespace rocksdb
--- a/db/db_statistics.h
+++ b/db/db_statistics.h
@ -0,0 +1,65 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cassert>
+#include <stdlib.h>
+#include <vector>
+#include <memory>
+
+#include "rocksdb/statistics.h"
+#include "util/histogram.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+
+namespace rocksdb {
+
+class DBStatistics: public Statistics {
+ public:
+  DBStatistics() : allTickers_(TICKER_ENUM_MAX),
+                   allHistograms_(HISTOGRAM_ENUM_MAX) { }
+
+  virtual ~DBStatistics() {}
+
+  virtual long getTickerCount(Tickers tickerType) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    return allTickers_[tickerType].getCount();
+  }
+
+  virtual void setTickerCount(Tickers tickerType, uint64_t count) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    allTickers_[tickerType].setTickerCount(count);
+  }
+
+  virtual void recordTick(Tickers tickerType, uint64_t count) {
+    assert(tickerType < TICKER_ENUM_MAX);
+    allTickers_[tickerType].recordTick(count);
+  }
+
+  virtual void measureTime(Histograms histogramType, uint64_t value) {
+    assert(histogramType < HISTOGRAM_ENUM_MAX);
+    allHistograms_[histogramType].Add(value);
+  }
+
+  virtual void histogramData(Histograms histogramType,
+                             HistogramData * const data) {
+    assert(histogramType < HISTOGRAM_ENUM_MAX);
+    allHistograms_[histogramType].Data(data);
+  }
+
+  std::vector<Ticker> allTickers_;
+  std::vector<HistogramImpl> allHistograms_;
+};
+
+std::shared_ptr<Statistics> CreateDBStatistics() {
+  return std::make_shared<DBStatistics>();
+}
+
+} // namespace rocksdb
--- a/db/db_stats_logger.cc
+++ b/db/db_stats_logger.cc
@ -0,0 +1,93 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+
+namespace rocksdb {
+
+void DBImpl::MaybeScheduleLogDBDeployStats() {
+
+  // There is a lock in the actual logger.
+  if (!logger_ || options_.db_stats_log_interval < 0
+      || host_name_.empty()) {
+    return;
+  }
+
+  if(bg_logstats_scheduled_ || shutting_down_.Acquire_Load()) {
+    // Already scheduled
+  } else {
+    int64_t current_ts = 0;
+    Status st = env_->GetCurrentTime(&current_ts);
+    if (!st.ok()) {
+      return;
+    }
+    if ((current_ts - last_log_ts) < options_.db_stats_log_interval) {
+      return;
+    }
+    last_log_ts = current_ts;
+    bg_logstats_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGLogDBDeployStats, this);
+  }
+}
+
+void DBImpl::BGLogDBDeployStats(void* db) {
+  DBImpl* db_inst = reinterpret_cast<DBImpl*>(db);
+  db_inst->LogDBDeployStats();
+}
+
+void DBImpl::LogDBDeployStats() {
+  mutex_.Lock();
+
+  if (shutting_down_.Acquire_Load()) {
+    bg_logstats_scheduled_ = false;
+    bg_cv_.SignalAll();
+    mutex_.Unlock();
+    return;
+  }
+
+  char tmp_ver[100];
+  sprintf(tmp_ver, "%d.%d", kMajorVersion, kMinorVersion);
+  std::string version_info(tmp_ver);
+
+  uint64_t file_total_size = 0;
+  uint32_t file_total_num = 0;
+  for (int i = 0; i < versions_->NumberLevels(); i++) {
+    file_total_num += versions_->NumLevelFiles(i);
+    file_total_size += versions_->NumLevelBytes(i);
+  }
+
+  VersionSet::LevelSummaryStorage scratch;
+  const char* file_num_summary = versions_->LevelSummary(&scratch);
+  std::string file_num_per_level(file_num_summary);
+  std::string data_size_per_level(file_num_summary);
+
+  mutex_.Unlock();
+
+  int64_t unix_ts;
+  env_->GetCurrentTime(&unix_ts);
+
+  logger_->Log_Deploy_Stats(version_info, host_name_,
+      db_absolute_path_, file_total_size, file_total_num, file_num_per_level,
+      data_size_per_level, unix_ts);
+
+  mutex_.Lock();
+  bg_logstats_scheduled_ = false;
+  bg_cv_.SignalAll();
+  mutex_.Unlock();
+}
+
+}
--- a/db/db_test.cc
+++ b/db/db_test.cc
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@ -0,0 +1,147 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/perf_context_imp.h"
+
+namespace rocksdb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString(bool hex) const {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString(hex);
+  result += buf;
+  return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed)) {
+    result = parsed.DebugString(hex);
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));
+  }
+  return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+  return name_.c_str();
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  BumpPerfCount(&perf_context.user_key_comparison_count);
+  if (r == 0) {
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() < user_start.size() &&
+      user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (tmp.size() < user_key.size() &&
+      user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  size_t usize = user_key.size();
+  size_t needed = usize + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  dst = EncodeVarint32(dst, usize + 8);
+  kstart_ = dst;
+  memcpy(dst, user_key.data(), usize);
+  dst += usize;
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+}  // namespace rocksdb
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -0,0 +1,229 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1,
+  kTypeMerge = 0x2,
+  kTypeLogData = 0x3
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeMerge;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString(bool hex = false) const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;
+  std::string name_;
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c),
+    name_("rocksdb.InternalKeyComparator:" +
+          std::string(user_comparator_->Name())) {
+  }
+
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  std::string DebugString(bool hex = false) const;
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
+}
+
+// Update the sequence number in the internal key
+inline void UpdateInternalKey(char* internal_key,
+                              const size_t internal_key_size,
+                              uint64_t seq, ValueType t) {
+  assert(internal_key_size >= 8);
+  char* seqtype = internal_key + internal_key_size - 8;
+  uint64_t newval = (seq << 8) | t;
+  EncodeFixed64(seqtype, newval);
+}
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+  const size_t n = internal_key.size();
+  assert(n >= 8);
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  return num >> 8;
+}
+
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+  // Initialize *this for looking up user_key at a snapshot with
+  // the specified sequence number.
+  LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+  ~LookupKey();
+
+  // Return a key suitable for lookup in a MemTable.
+  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+  // Return an internal key (suitable for passing to an internal iterator)
+  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+  // Return the user key
+  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+  // We construct a char array of the form:
+  //    klength  varint32               <-- start_
+  //    userkey  char[klength]          <-- kstart_
+  //    tag      uint64
+  //                                    <-- end_
+  // The array is a suitable MemTable key.
+  // The suffix starting with "userkey" can be used as an InternalKey.
+  const char* start_;
+  const char* kstart_;
+  const char* end_;
+  char space_[200];      // Avoid allocation for short keys
+
+  // No copying allowed
+  LookupKey(const LookupKey&);
+  void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+  if (start_ != space_) delete[] start_;
+}
+
+}  // namespace rocksdb
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@ -0,0 +1,117 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/deletefile_test.cc
+++ b/db/deletefile_test.cc
@ -0,0 +1,283 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include <vector>
+#include <stdlib.h>
+#include <map>
+#include <string>
+
+namespace rocksdb {
+
+class DeleteFileTest {
+ public:
+  std::string dbname_;
+  Options options_;
+  DB* db_;
+  Env* env_;
+  int numlevels_;
+
+  DeleteFileTest() {
+    db_ = nullptr;
+    env_ = Env::Default();
+    options_.write_buffer_size = 1024*1024*1000;
+    options_.target_file_size_base = 1024*1024*1000;
+    options_.max_bytes_for_level_base = 1024*1024*1000;
+    options_.WAL_ttl_seconds = 300; // Used to test log files
+    options_.WAL_size_limit_MB = 1024; // Used to test log files
+    dbname_ = test::TmpDir() + "/deletefile_test";
+    options_.wal_dir = dbname_ + "/wal_files";
+    DestroyDB(dbname_, options_);
+    numlevels_ = 7;
+    ASSERT_OK(ReopenDB(true));
+  }
+
+  Status ReopenDB(bool create) {
+    delete db_;
+    if (create) {
+      DestroyDB(dbname_, options_);
+    }
+    db_ = nullptr;
+    options_.create_if_missing = create;
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+  }
+
+  void AddKeys(int numkeys, int startkey = 0) {
+    WriteOptions options;
+    options.sync = false;
+    ReadOptions roptions;
+    for (int i = startkey; i < (numkeys + startkey) ; i++) {
+      std::string temp = std::to_string(i);
+      Slice key(temp);
+      Slice value(temp);
+      ASSERT_OK(db_->Put(options, key, value));
+    }
+  }
+
+  int numKeysInLevels(
+    std::vector<LiveFileMetaData> &metadata,
+    std::vector<int> *keysperlevel = nullptr) {
+
+    if (keysperlevel != nullptr) {
+      keysperlevel->resize(numlevels_);
+    }
+
+    int numKeys = 0;
+    for (size_t i = 0; i < metadata.size(); i++) {
+      int startkey = atoi(metadata[i].smallestkey.c_str());
+      int endkey = atoi(metadata[i].largestkey.c_str());
+      int numkeysinfile = (endkey - startkey + 1);
+      numKeys += numkeysinfile;
+      if (keysperlevel != nullptr) {
+        (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+      }
+      fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+              metadata[i].level, metadata[i].name.c_str(),
+              metadata[i].smallestkey.c_str(),
+              metadata[i].largestkey.c_str());
+    }
+    return numKeys;
+  }
+
+  void CreateTwoLevels() {
+    AddKeys(50000, 10000);
+    DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+
+    AddKeys(50000, 10000);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  }
+
+  void CheckFileTypeCounts(std::string& dir,
+                            int required_log,
+                            int required_sst,
+                            int required_manifest) {
+    std::vector<std::string> filenames;
+    env_->GetChildren(dir, &filenames);
+
+    int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+    for (auto file : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(file, &number, &type)) {
+        log_cnt += (type == kLogFile);
+        sst_cnt += (type == kTableFile);
+        manifest_cnt += (type == kDescriptorFile);
+      }
+    }
+    ASSERT_EQ(required_log, log_cnt);
+    ASSERT_EQ(required_sst, sst_cnt);
+    ASSERT_EQ(required_manifest, manifest_cnt);
+  }
+
+};
+
+TEST(DeleteFileTest, AddKeysAndQueryLevels) {
+  CreateTwoLevels();
+  std::vector<LiveFileMetaData> metadata;
+  std::vector<int> keysinlevel;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level1file = "";
+  int level1keycount = 0;
+  std::string level2file = "";
+  int level2keycount = 0;
+  int level1index = 0;
+  int level2index = 1;
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 2) {
+    level1index = 1;
+    level2index = 0;
+  }
+
+  level1file = metadata[level1index].name;
+  int startkey = atoi(metadata[level1index].smallestkey.c_str());
+  int endkey = atoi(metadata[level1index].largestkey.c_str());
+  level1keycount = (endkey - startkey + 1);
+  level2file = metadata[level2index].name;
+  startkey = atoi(metadata[level2index].smallestkey.c_str());
+  endkey = atoi(metadata[level2index].largestkey.c_str());
+  level2keycount = (endkey - startkey + 1);
+
+  // COntrolled setup. Levels 1 and 2 should both have 50K files.
+  // This is a little fragile as it depends on the current
+  // compaction heuristics.
+  ASSERT_EQ(level1keycount, 50000);
+  ASSERT_EQ(level2keycount, 50000);
+
+  Status status = db_->DeleteFile("0.sst");
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // intermediate level files cannot be deleted.
+  status = db_->DeleteFile(level1file);
+  ASSERT_TRUE(status.IsInvalidArgument());
+
+  // Lowest level file deletion should succeed.
+  ASSERT_OK(db_->DeleteFile(level2file));
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, PurgeObsoleteFilesTest) {
+  CreateTwoLevels();
+  // there should be only one (empty) log file because CreateTwoLevels()
+  // flushes the memtables to disk
+  CheckFileTypeCounts(options_.wal_dir, 1, 0, 0);
+  // 2 ssts, 1 manifest
+  CheckFileTypeCounts(dbname_, 0, 2, 1);
+  std::string first("0"), last("999999");
+  Slice first_slice(first), last_slice(last);
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 1 sst after compaction
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  // this time, we keep an iterator alive
+  ReopenDB(true);
+  Iterator *itr = 0;
+  CreateTwoLevels();
+  itr = db_->NewIterator(ReadOptions());
+  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  // 3 sst after compaction with live iterator
+  CheckFileTypeCounts(dbname_, 0, 3, 1);
+  delete itr;
+  // 1 sst after iterator deletion
+  CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteFileWithIterator) {
+  CreateTwoLevels();
+  ReadOptions options;
+  Iterator* it = db_->NewIterator(options);
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+
+  std::string level2file = "";
+
+  ASSERT_EQ((int)metadata.size(), 2);
+  if (metadata[0].level == 1) {
+    level2file = metadata[1].name;
+  } else {
+    level2file = metadata[0].name;
+  }
+
+  Status status = db_->DeleteFile(level2file);
+  fprintf(stdout, "Deletion status %s: %s\n",
+          level2file.c_str(), status.ToString().c_str());
+  ASSERT_TRUE(status.ok());
+  it->SeekToFirst();
+  int numKeysIterated = 0;
+  while(it->Valid()) {
+    numKeysIterated++;
+    it->Next();
+  }
+  ASSERT_EQ(numKeysIterated, 50000);
+  delete it;
+  CloseDB();
+}
+
+TEST(DeleteFileTest, DeleteLogFiles) {
+  AddKeys(10, 0);
+  VectorLogPtr logfiles;
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  // Take the last log file which is expected to be alive and try to delete it
+  // Should not succeed because live logs are not allowed to be deleted
+  std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+  ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  fprintf(stdout, "Deleting alive log file %s\n",
+          alive_log->PathName().c_str());
+  ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys
+  // Call Flush again to flush out memtable and move alive log to archived log
+  // and try to delete the archived log file
+  FlushOptions fopts;
+  db_->Flush(fopts);
+  AddKeys(10, 0);
+  db_->Flush(fopts);
+  db_->GetSortedWalFiles(logfiles);
+  ASSERT_GT(logfiles.size(), 0UL);
+  std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+  ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
+        archived_log->PathName()));
+  fprintf(stdout, "Deleting archived log file %s\n",
+          archived_log->PathName().c_str());
+  ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
+        archived_log->PathName()));
+  CloseDB();
+}
+
+} //namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
+
--- a/db/filename.cc
+++ b/db/filename.cc
@ -0,0 +1,266 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+// Given a path, flatten the path name by replacing all chars not in
+// {[0-9,a-z,A-Z,-,_,.]} with _. And append '\0' at the end.
+// Return the number of chars stored in dest not including the trailing '\0'.
+static int FlattenPath(const std::string& path, char* dest, int len) {
+  int write_idx = 0;
+  int i = 0;
+  int src_len = path.size();
+
+  while (i < src_len && write_idx < len - 1) {
+    if ((path[i] >= 'a' && path[i] <= 'z') ||
+        (path[i] >= '0' && path[i] <= '9') ||
+        (path[i] >= 'A' && path[i] <= 'Z') ||
+        path[i] == '-' ||
+        path[i] == '.' ||
+        path[i] == '_'){
+      dest[write_idx++] = path[i];
+    } else {
+      if (i > 0)
+        dest[write_idx++] = '_';
+    }
+    i++;
+  }
+
+  dest[write_idx] = '\0';
+  return write_idx;
+}
+
+// A utility routine: write "data" to the named file and Sync() it.
+extern Status WriteStringToFileSync(Env* env, const Slice& data,
+                                    const std::string& fname);
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string ArchivalDirectory(const std::string& dir) {
+  return dir + "/" + ARCHIVAL_DIR;
+}
+std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name + "/" + ARCHIVAL_DIR, number, "log");
+}
+
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  assert(number >= 0);
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path, const std::string& log_dir) {
+  if (log_dir.empty())
+    return dbname + "/LOG";
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path, const std::string& log_dir) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
+
+  if (log_dir.empty())
+    return dbname + "/LOG.old." + buf;
+
+  char flatten_db_path[256];
+  FlattenPath(db_path, flatten_db_path, 256);
+  return log_dir + "/" + flatten_db_path + "_LOG.old." + buf;
+}
+
+std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/METADB-%llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string IdentityFileName(const std::string& dbname) {
+  return dbname + "/IDENTITY";
+}
+
+// Owned filenames have the form:
+//    dbname/IDENTITY
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old.[0-9]+
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst)
+//    dbname/METADB-[0-9]+
+//    Disregards / at the beginning
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type,
+                   WalFileType* log_type) {
+  Slice rest(fname);
+  if (fname.length() > 1 && fname[0] == '/') {
+    rest.remove_prefix(1);
+  }
+  if (rest == "IDENTITY") {
+    *number = 0;
+    *type = kIdentityFile;
+  } else if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("LOG.old.")) {
+    uint64_t ts_suffix;
+    // sizeof also counts the trailing '\0'.
+    rest.remove_prefix(sizeof("LOG.old.") - 1);
+    if (!ConsumeDecimalNumber(&rest, &ts_suffix)) {
+      return false;
+    }
+    *number = ts_suffix;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else if (rest.starts_with("METADB-")) {
+    rest.remove_prefix(strlen("METADB-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kMetaDatabase;
+    *number = num;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    bool archive_dir_found = false;
+    if (rest.starts_with(ARCHIVAL_DIR)) {
+      if (rest.size() <= ARCHIVAL_DIR.size()) {
+        return false;
+      }
+      rest.remove_prefix(ARCHIVAL_DIR.size() + 1); // Add 1 to remove / also
+      if (log_type) {
+        *log_type = kArchivedLogFile;
+      }
+      archive_dir_found = true;
+    }
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    Slice suffix = rest;
+    if (suffix == Slice(".log")) {
+      *type = kLogFile;
+      if (log_type && !archive_dir_found) {
+        *log_type = kAliveLogFile;
+      }
+    } else if (archive_dir_found) {
+      return false; // Archive dir can contain only log files
+    } else if (suffix == Slice(".sst")) {
+      *type = kTableFile;
+    } else if (suffix == Slice(".dbtmp")) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+Status SetIdentityFile(Env* env, const std::string& dbname) {
+  std::string id = env->GenerateUniqueId();
+  assert(!id.empty());
+  // Reserve the filename dbname/000000.dbtmp for the temporary identity file
+  std::string tmp = TempFileName(dbname, 0);
+  Status s = WriteStringToFileSync(env, id, tmp);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, IdentityFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+}  // namespace rocksdb
--- a/db/filename.h
+++ b/db/filename.h
@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#pragma once
+#include <stdint.h>
+#include <string>
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Env;
+
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile,  // Either the current one, or an old one
+  kMetaDatabase,
+  kIdentityFile
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+static const std::string ARCHIVAL_DIR = "archive";
+
+extern std::string ArchivalDirectory(const std::string& dbname);
+
+//  Return the name of the archived log file with the specified number
+//  in the db named by "dbname". The result will be prefixed with "dbname".
+extern std::string ArchivedLogFileName(const std::string& dbname,
+                                       uint64_t num);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
+    const std::string& db_path="", const std::string& log_dir="");
+
+// Return the name to use for a metadatabase. The result will be prefixed with
+// "dbname".
+extern std::string MetaDatabaseName(const std::string& dbname,
+                                    uint64_t number);
+
+// Return the name of the Identity file which stores a unique number for the db
+// that will get regenerated if the db loses all its data and is recreated fresh
+// either from a backup-image or empty
+extern std::string IdentityFileName(const std::string& dbname);
+
+// If filename is a rocksdb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          FileType* type,
+                          WalFileType* log_type = nullptr);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number);
+
+// Make the IDENTITY file for the db
+extern Status SetIdentityFile(Env* env, const std::string& dbname);
+
+}  // namespace rocksdb
--- a/db/filename_test.cc
+++ b/db/filename_test.cc
@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class FileNameTest { };
+
+TEST(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   kLogFile },
+    { "0.log",              0,     kLogFile },
+    { "0.sst",              0,     kTableFile },
+    { "CURRENT",            0,     kCurrentFile },
+    { "LOCK",               0,     kDBLockFile },
+    { "MANIFEST-2",         2,     kDescriptorFile },
+    { "MANIFEST-7",         7,     kDescriptorFile },
+    { "METADB-2",           2,     kMetaDatabase },
+    { "METADB-7",           7,     kMetaDatabase },
+    { "LOG",                0,     kInfoLogFile },
+    { "LOG.old",            0,     kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "META",
+    "METADB",
+    "METADB-",
+    "XMETADB-3",
+    "METADB-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };
+}
+
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0U, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192U, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200U, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999U, number);
+  ASSERT_EQ(kTempFile, type);
+
+  fname = MetaDatabaseName("met", 100);
+  ASSERT_EQ("met/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100U, number);
+  ASSERT_EQ(kMetaDatabase, type);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/log_format.h
+++ b/db/log_format.h
@ -0,0 +1,36 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+namespace rocksdb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@ -0,0 +1,264 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+               bool checksum, uint64_t initial_offset)
+    : file_(std::move(file)),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+bool Reader::SkipToInitialBlock() {
+  size_t offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+  // Don't search a block if we'd be in the trailer
+  if (offset_in_block > kBlockSize - 6) {
+    offset_in_block = 0;
+    block_start_location += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = block_start_location;
+
+  // Skip to start of first block that can contain the initial record
+  if (block_start_location > 0) {
+    Status skip_status = file_->Skip(block_start_location);
+    if (!skip_status.ok()) {
+      ReportDrop(block_start_location, skip_status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  if (last_record_offset_ < initial_offset_) {
+    if (!SkipToInitialBlock()) {
+      return false;
+    }
+  }
+
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    switch (record_type) {
+      case kFullType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(1)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(2)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "partial record without end(3)");
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+uint64_t Reader::LastRecordOffset() {
+  return last_record_offset_;
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != nullptr &&
+      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < (size_t)kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        end_of_buffer_offset_ += buffer_.size();
+        if (!status.ok()) {
+          buffer_.clear();
+          ReportDrop(kBlockSize, status);
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < (size_t)kBlockSize) {
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else {
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "truncated record at end of file");
+        return kEof;
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      size_t drop_size = buffer_.size();
+      buffer_.clear();
+      ReportCorruption(drop_size, "bad record length");
+      return kBadRecord;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "checksum mismatch");
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+
+    // Skip physical record that started before initial_offset_
+    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+        initial_offset_) {
+      result->clear();
+      return kBadRecord;
+    }
+
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_reader.h
+++ b/db/log_reader.h
@ -0,0 +1,124 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class SequentialFile;
+using std::unique_ptr;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-nullptr, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  //
+  // The Reader will start reading at the first record located at physical
+  // position >= initial_offset within the file.
+  Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+         bool checksum, uint64_t initial_offset);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+  // returns true if the reader has encountered an eof condition.
+  bool IsEOF() {
+    return eof_;
+  }
+
+  // when we know more data has been written to the file. we can use this
+  // function to force the reader to look again in the file.
+  void UnmarkEOF() {
+    eof_ = false;
+  }
+
+  SequentialFile* file() { return file_.get(); }
+
+ private:
+  const unique_ptr<SequentialFile> file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // Offset at which to start looking for the first record to return
+  uint64_t const initial_offset_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    // * The record is below constructor's initial_offset (No drop is reported)
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Skips all blocks that are completely before "initial_offset_".
+  //
+  // Returns true on success. Handles reporting.
+  bool SkipToInitialBlock();
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_test.cc
+++ b/db/log_test.cc
@ -0,0 +1,528 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;
+    bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  std::string& dest_contents() {
+    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  const std::string& dest_contents() const {
+    auto dest = dynamic_cast<const StringDest*>(writer_.file());
+    assert(dest);
+    return dest->contents_;
+  }
+
+  void reset_source_contents() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    assert(src);
+    src->contents_ = dest_contents();
+  }
+
+  unique_ptr<StringDest> dest_holder_;
+  unique_ptr<StringSource> source_holder_;
+  ReportCollector report_;
+  bool reading_;
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : dest_holder_(new StringDest),
+              source_holder_(new StringSource),
+              reading_(false),
+              writer_(std::move(dest_holder_)),
+              reader_(std::move(source_holder_), &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_contents().size();
+  }
+
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      reset_source_contents();
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_contents()[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_contents()[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_contents().resize(dest_contents().size() - bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_contents()[header_offset], crc);
+  }
+
+  void ForceError() {
+    auto src = dynamic_cast<StringSource*>(reader_.file());
+    src->force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    unique_ptr<StringSource> source(new StringSource);
+    source->contents_ = dest_contents();
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 WrittenBytes() + offset_past_end));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+  }
+
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    unique_ptr<StringSource> source(new StringSource);
+    source->contents_ = dest_contents();
+    unique_ptr<Reader> offset_reader(
+      new Reader(std::move(source), &report_, true/*checksum*/,
+                 initial_offset));
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+  }
+
+};
+
+size_t LogTest::initial_offset_record_sizes_[] =
+    {10000,  // Two sizable records in first block
+     10000,
+     2 * log::kBlockSize - 1000,  // Span three blocks
+     1};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] =
+    {0,
+     kHeaderSize + 10000,
+     2 * (kHeaderSize + 10000),
+     2 * (kHeaderSize + 10000) +
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+
+
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0U, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)(kHeaderSize - 1), DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ((unsigned int)(kHeaderSize + 2), DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(10U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3U, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const unsigned int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+TEST(LogTest, ReadStart) {
+  CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {
+  CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {
+  CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {
+  CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {
+  CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {
+  CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {
+  CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {
+  CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {
+  CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+}  // namespace log
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/log_writer.cc
+++ b/db/log_writer.cc
@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace log {
+
+Writer::Writer(unique_ptr<WritableFile>&& dest)
+    : dest_(std::move(dest)),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/log_writer.h
+++ b/db/log_writer.h
@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <memory>
+#include <stdint.h>
+#include "db/log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class WritableFile;
+
+using std::unique_ptr;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(unique_ptr<WritableFile>&& dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+  WritableFile* file() { return dest_.get(); }
+  const WritableFile* file() const { return dest_.get(); }
+
+ private:
+  unique_ptr<WritableFile> dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}  // namespace log
+}  // namespace rocksdb
--- a/db/memtable.cc
+++ b/db/memtable.cc
@ -0,0 +1,330 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <memory>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/murmurhash.h"
+
+namespace std {
+template <>
+struct hash<rocksdb::Slice> {
+  size_t operator()(const rocksdb::Slice& slice) const {
+    return MurmurHash(slice.data(), slice.size(), 0);
+  }
+};
+}
+
+namespace rocksdb {
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+                   std::shared_ptr<MemTableRepFactory> table_factory,
+                   int numlevel,
+                   const Options& options)
+    : comparator_(cmp),
+      refs_(0),
+      arena_impl_(options.arena_block_size),
+      table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)),
+      flush_in_progress_(false),
+      flush_completed_(false),
+      file_number_(0),
+      edit_(numlevel),
+      first_seqno_(0),
+      mem_next_logfile_number_(0),
+      mem_logfile_number_(0),
+      locks_(options.inplace_update_support
+             ? options.inplace_update_num_locks
+             : 0) { }
+
+MemTable::~MemTable() {
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+  return arena_impl_.ApproximateMemoryUsage() +
+    table_->ApproximateMemoryUsage();
+}
+
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(aptr);
+  Slice b = GetLengthPrefixedSlice(bptr);
+  return comparator.Compare(a, b);
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+  Slice slice = GetLengthPrefixedSlice(key);
+  return Slice(slice.data(), slice.size() - 8);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  MemTableIterator(MemTableRep* table, const ReadOptions& options)
+    : iter_() {
+    if (options.prefix) {
+      iter_ = table->GetPrefixIterator(*options.prefix);
+    } else if (options.prefix_seek) {
+      iter_ = table->GetDynamicPrefixIterator();
+    } else {
+      iter_ = table->GetIterator();
+    }
+  }
+
+  virtual bool Valid() const { return iter_->Valid(); }
+  virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+  virtual Slice key() const {
+    return GetLengthPrefixedSlice(iter_->key());
+  }
+  virtual Slice value() const {
+    Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  std::shared_ptr<MemTableRep::Iterator> iter_;
+  std::string tmp_;       // For passing to EncodeKey
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator(const ReadOptions& options) {
+  return new MemTableIterator(table_.get(), options);
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+  return &locks_[std::hash<Slice>()(key) % locks_.size()];
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = arena_impl_.Allocate(encoded_len);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((p + val_size) - buf == (unsigned)encoded_len);
+  table_->Insert(buf);
+
+  // The first sequence number inserted into the memtable
+  assert(first_seqno_ == 0 || s > first_seqno_);
+  if (first_seqno_ == 0) {
+    first_seqno_ = s;
+  }
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
+                   std::deque<std::string>* operands, const Options& options) {
+  Slice memkey = key.memtable_key();
+  std::shared_ptr<MemTableRep::Iterator> iter(
+    table_->GetIterator(key.user_key()));
+  iter->Seek(memkey.data());
+
+  // It is the caller's responsibility to allocate/delete operands list
+  assert(operands != nullptr);
+
+  bool merge_in_progress = s->IsMergeInProgress();
+  auto merge_operator = options.merge_operator.get();
+  auto logger = options.info_log;
+  std::string merge_result;
+
+  for (; iter->Valid(); iter->Next()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->ReadLock();
+          }
+          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+          *s = Status::OK();
+          if (merge_in_progress) {
+            assert(merge_operator);
+            if (!merge_operator->FullMerge(key.user_key(), &v, *operands,
+                                           value, logger.get())) {
+              RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
+              *s = Status::Corruption("Error: Could not perform merge.");
+            }
+          } else {
+            value->assign(v.data(), v.size());
+          }
+          if (options.inplace_update_support) {
+            GetLock(key.user_key())->Unlock();
+          }
+          return true;
+        }
+        case kTypeDeletion: {
+          if (merge_in_progress) {
+            assert(merge_operator);
+            *s = Status::OK();
+            if (!merge_operator->FullMerge(key.user_key(), nullptr, *operands,
+                                           value, logger.get())) {
+              RecordTick(options.statistics, NUMBER_MERGE_FAILURES);
+              *s = Status::Corruption("Error: Could not perform merge.");
+            }
+          } else {
+            *s = Status::NotFound(Slice());
+          }
+          return true;
+        }
+        case kTypeMerge: {
+          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+          merge_in_progress = true;
+          operands->push_front(v.ToString());
+          while(operands->size() >= 2) {
+            // Attempt to associative merge. (Returns true if successful)
+            if (merge_operator->PartialMerge(key.user_key(),
+                                             Slice((*operands)[0]),
+                                             Slice((*operands)[1]),
+                                             &merge_result,
+                                             logger.get())) {
+              operands->pop_front();
+              swap(operands->front(), merge_result);
+            } else {
+              // Stack them because user can't associative merge
+              break;
+            }
+          }
+          break;
+        }
+        case kTypeLogData:
+          assert(false);
+          break;
+      }
+    } else {
+      // exit loop if user key does not match
+      break;
+    }
+  }
+
+  // No change to value, since we have not yet found a Put/Delete
+
+  if (merge_in_progress) {
+    *s = Status::MergeInProgress("");
+  }
+  return false;
+}
+
+bool MemTable::Update(SequenceNumber seq, ValueType type,
+                      const Slice& key,
+                      const Slice& value) {
+  LookupKey lkey(key, seq);
+  Slice memkey = lkey.memtable_key();
+
+  std::shared_ptr<MemTableRep::Iterator> iter(
+    table_.get()->GetIterator(lkey.user_key()));
+  iter->Seek(memkey.data());
+
+  if (iter->Valid()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength-8]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter->key();
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          uint32_t vlength;
+          GetVarint32Ptr(key_ptr + key_length,
+                         key_ptr + key_length+5, &vlength);
+          // Update value, if newValue size  <= curValue size
+          if (value.size() <= vlength) {
+            char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+                                     value.size());
+            WriteLock wl(GetLock(lkey.user_key()));
+            memcpy(p, value.data(), value.size());
+            assert(
+              (p + value.size()) - entry ==
+              (unsigned) (VarintLength(key_length) +
+                          key_length +
+                          VarintLength(value.size()) +
+                          value.size())
+            );
+            return true;
+          }
+        }
+        default:
+          // If the latest value is kTypeDeletion, kTypeMerge or kTypeLogData
+          // then we probably don't have enough space to update in-place
+          // Maybe do something later
+          // Return false, and do normal Add()
+          return false;
+      }
+    }
+  }
+
+  // Key doesn't exist
+  return false;
+}
+}  // namespace rocksdb
--- a/db/memtable.h
+++ b/db/memtable.h
@ -0,0 +1,172 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <memory>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "db/version_set.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "util/arena_impl.h"
+
+namespace rocksdb {
+
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+  struct KeyComparator : public MemTableRep::KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    virtual int operator()(const char* a, const char* b) const;
+  };
+
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(
+    const InternalKeyComparator& comparator,
+    std::shared_ptr<MemTableRepFactory> table_factory,
+    int numlevel = 7,
+    const Options& options = Options());
+
+  // Increase reference count.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.  Delete if no more references exist.
+  void Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      delete this;
+    }
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/dbformat.{h,cc} module.
+  //
+  // If options.prefix is supplied, it is passed to the underlying MemTableRep
+  // as a hint that the iterator only need to support access to keys with that
+  // specific prefix.
+  // If options.prefix is not supplied and options.prefix_seek is set, the
+  // iterator is not bound to a specific prefix. However, the semantics of
+  // Seek is changed - the result might only include keys with the same prefix
+  // as the seek-key.
+  Iterator* NewIterator(const ReadOptions& options = ReadOptions());
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // If memtable contains Merge operation as the most recent entry for a key,
+  //   and the merge process does not stop (not reaching a value or delete),
+  //   prepend the current merge operand to *operands.
+  //   store MergeInProgress in s, and return false.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           std::deque<std::string>* operands, const Options& options);
+
+  // Update the value and return status ok,
+  //   if key exists in current memtable
+  //     if new sizeof(new_value) <= sizeof(old_value) &&
+  //       old_value for that key is a put i.e. kTypeValue
+  //     else return false, and status - NotUpdatable()
+  //   else return false, and status - NotFound()
+  bool Update(SequenceNumber seq, ValueType type,
+              const Slice& key,
+              const Slice& value);
+
+  // Returns the edits area that is needed for flushing the memtable
+  VersionEdit* GetEdits() { return &edit_; }
+
+  // Returns the sequence number of the first element that was inserted
+  // into the memtable
+  SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
+
+  // Returns the next active logfile number when this memtable is about to
+  // be flushed to storage
+  uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+  // Sets the next active logfile number when this memtable is about to
+  // be flushed to storage
+  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+  // Returns the logfile number that can be safely deleted when this
+  // memstore is flushed to storage
+  uint64_t GetLogNumber() { return mem_logfile_number_; }
+
+  // Sets the logfile number that can be safely deleted when this
+  // memstore is flushed to storage
+  void SetLogNumber(uint64_t num) { mem_logfile_number_ = num; }
+
+  // Notify the underlying storage that no more items will be added
+  void MarkImmutable() { table_->MarkReadOnly(); }
+
+ private:
+  ~MemTable();  // Private since only Unref() should be used to delete it
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+  friend class MemTableList;
+
+  KeyComparator comparator_;
+  int refs_;
+  ArenaImpl arena_impl_;
+  shared_ptr<MemTableRep> table_;
+
+  // These are used to manage memtable flushes to storage
+  bool flush_in_progress_; // started the flush
+  bool flush_completed_;   // finished the flush
+  uint64_t file_number_;    // filled up after flush is complete
+
+  // The udpates to be applied to the transaction log when this
+  // memtable is flushed to storage.
+  VersionEdit edit_;
+
+  // The sequence number of the kv that was inserted first
+  SequenceNumber first_seqno_;
+
+  // The log files earlier than this number can be deleted.
+  uint64_t mem_next_logfile_number_;
+
+  // The log file that backs this memtable (to be deleted when
+  // memtable flush is done)
+  uint64_t mem_logfile_number_;
+
+  // rw locks for inplace updates
+  std::vector<port::RWMutex> locks_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+
+  // Get the lock associated for the key
+  port::RWMutex* GetLock(const Slice& key);
+};
+
+}  // namespace rocksdb
--- a/db/memtablelist.cc
+++ b/db/memtablelist.cc
@ -0,0 +1,215 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/memtablelist.h"
+
+#include <string>
+#include "rocksdb/db.h"
+#include "db/memtable.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableListIterator;
+class VersionSet;
+
+using std::list;
+
+// Increase reference count on all underling memtables
+void MemTableList::RefAll() {
+  for (auto &memtable : memlist_) {
+    memtable->Ref();
+  }
+}
+
+// Drop reference count on all underling memtables
+void MemTableList::UnrefAll() {
+  for (auto &memtable : memlist_) {
+    memtable->Unref();
+  }
+}
+
+// Returns the total number of memtables in the list
+int MemTableList::size() {
+  assert(num_flush_not_started_ <= size_);
+  return size_;
+}
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) {
+  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
+      (num_flush_not_started_ >= min_write_buffer_number_to_merge)) {
+    assert(imm_flush_needed.NoBarrier_Load() != nullptr);
+    return true;
+  }
+  return false;
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
+  for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) {
+    MemTable* m = *it;
+    if (!m->flush_in_progress_) {
+      assert(!m->flush_completed_);
+      num_flush_not_started_--;
+      if (num_flush_not_started_ == 0) {
+        imm_flush_needed.Release_Store(nullptr);
+      }
+      m->flush_in_progress_ = true; // flushing will start very soon
+      ret->push_back(m);
+    }
+  }
+  flush_requested_ = false; // start-flush request is complete
+}
+
+// Record a successful flush in the manifest file
+Status MemTableList::InstallMemtableFlushResults(
+                      const std::vector<MemTable*> &mems,
+                      VersionSet* vset, Status flushStatus,
+                      port::Mutex* mu, Logger* info_log,
+                      uint64_t file_number,
+                      std::set<uint64_t>& pending_outputs) {
+  mu->AssertHeld();
+
+  // If the flush was not successful, then just reset state.
+  // Maybe a suceeding attempt to flush will be successful.
+  if (!flushStatus.ok()) {
+    for (MemTable* m : mems) {
+      assert(m->flush_in_progress_);
+      assert(m->file_number_ == 0);
+
+      m->flush_in_progress_ = false;
+      m->flush_completed_ = false;
+      m->edit_.Clear();
+      num_flush_not_started_++;
+      imm_flush_needed.Release_Store((void *)1);
+      pending_outputs.erase(file_number);
+    }
+    return flushStatus;
+  }
+
+  // flush was sucessful
+  for (size_t i = 0; i < mems.size(); ++i) {
+    // All the edits are associated with the first memtable of this batch.
+    assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+    mems[i]->flush_completed_ = true;
+    mems[i]->file_number_ = file_number;
+  }
+
+  // if some other thread is already commiting, then return
+  Status s;
+  if (commit_in_progress_) {
+    return s;
+  }
+
+  // Only a single thread can be executing this piece of code
+  commit_in_progress_ = true;
+
+  // scan all memtables from the earliest, and commit those
+  // (in that order) that have finished flushing. Memetables
+  // are always committed in the order that they were created.
+  while (!memlist_.empty() && s.ok()) {
+    MemTable* m = memlist_.back(); // get the last element
+    if (!m->flush_completed_) {
+      break;
+    }
+
+    Log(info_log,
+        "Level-0 commit table #%lu started",
+        (unsigned long)m->file_number_);
+
+    // this can release and reacquire the mutex.
+    s = vset->LogAndApply(&m->edit_, mu);
+
+    // All the later memtables that have the same filenum
+    // are part of the same batch. They can be committed now.
+    uint64_t mem_id = 1;  // how many memtables has been flushed.
+    do {
+      if (s.ok()) { // commit new state
+        Log(info_log,
+            "Level-0 commit table #%lu: memtable #%lu done",
+            (unsigned long)m->file_number_,
+            (unsigned long)mem_id);
+        memlist_.remove(m);
+        assert(m->file_number_ > 0);
+
+        // pending_outputs can be cleared only after the newly created file
+        // has been written to a committed version so that other concurrently
+        // executing compaction threads do not mistakenly assume that this
+        // file is not live.
+        pending_outputs.erase(m->file_number_);
+        m->Unref();
+        size_--;
+      } else {
+        //commit failed. setup state so that we can flush again.
+        Log(info_log,
+            "Level-0 commit table #%lu: memtable #%lu failed",
+            (unsigned long)m->file_number_,
+            (unsigned long)mem_id);
+        m->flush_completed_ = false;
+        m->flush_in_progress_ = false;
+        m->edit_.Clear();
+        num_flush_not_started_++;
+        pending_outputs.erase(m->file_number_);
+        m->file_number_ = 0;
+        imm_flush_needed.Release_Store((void *)1);
+        s = Status::IOError("Unable to commit flushed memtable");
+      }
+      ++mem_id;
+    } while (!memlist_.empty() && (m = memlist_.back()) &&
+             m->file_number_ == file_number);
+  }
+  commit_in_progress_ = false;
+  return s;
+}
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m) {
+  assert(size_ >= num_flush_not_started_);
+  size_++;
+  memlist_.push_front(m);
+  m->MarkImmutable();
+  num_flush_not_started_++;
+  if (num_flush_not_started_ == 1) {
+    imm_flush_needed.Release_Store((void *)1);
+  }
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateMemoryUsage() {
+  size_t size = 0;
+  for (auto &memtable : memlist_) {
+    size += memtable->ApproximateMemoryUsage();
+  }
+  return size;
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s,
+                       std::deque<std::string>* operands,
+                       const Options& options) {
+  for (auto &memtable : memlist_) {
+    if (memtable->Get(key, value, s, operands, options)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void MemTableList::GetMemTables(std::vector<MemTable*>* output) {
+  for (auto &memtable : memlist_) {
+    output->push_back(memtable);
+  }
+}
+
+}  // namespace rocksdb
--- a/db/memtablelist.h
+++ b/db/memtablelist.h
@ -0,0 +1,105 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <string>
+#include <list>
+#include <deque>
+#include "rocksdb/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "memtable.h"
+
+namespace rocksdb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableListIterator;
+
+//
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there are more than one immutable memtable, their
+// flushes can occur concurrently.  However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+class MemTableList {
+ public:
+  // A list of memtables.
+  MemTableList() : size_(0), num_flush_not_started_(0),
+    commit_in_progress_(false),
+    flush_requested_(false) {
+    imm_flush_needed.Release_Store(nullptr);
+  }
+  ~MemTableList() {};
+
+  // so that backgrund threads can detect non-nullptr pointer to
+  // determine whether this is anything more to start flushing.
+  port::AtomicPointer imm_flush_needed;
+
+  // Increase reference count on all underling memtables
+  void RefAll();
+
+  // Drop reference count on all underling memtables
+  void UnrefAll();
+
+  // Returns the total number of memtables in the list
+  int size();
+
+  // Returns true if there is at least one memtable on which flush has
+  // not yet started.
+  bool IsFlushPending(int min_write_buffer_number_to_merge);
+
+  // Returns the earliest memtables that needs to be flushed. The returned
+  // memtables are guaranteed to be in the ascending order of created time.
+  void PickMemtablesToFlush(std::vector<MemTable*>* mems);
+
+  // Commit a successful flush in the manifest file
+  Status InstallMemtableFlushResults(const std::vector<MemTable*> &m,
+                      VersionSet* vset, Status flushStatus,
+                      port::Mutex* mu, Logger* info_log,
+                      uint64_t file_number,
+                      std::set<uint64_t>& pending_outputs);
+
+  // New memtables are inserted at the front of the list.
+  // Takes ownership of the referenced held on *m by the caller of Add().
+  void Add(MemTable* m);
+
+  // Returns an estimate of the number of bytes of data in use.
+  size_t ApproximateMemoryUsage();
+
+  // Search all the memtables starting from the most recent one.
+  // Return the most recent value found, if any.
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           std::deque<std::string>* operands, const Options& options);
+
+  // Returns the list of underlying memtables.
+  void GetMemTables(std::vector<MemTable*>* list);
+
+  // Request a flush of all existing memtables to storage
+  void FlushRequested() { flush_requested_ = true; }
+
+  // Copying allowed
+  // MemTableList(const MemTableList&);
+  // void operator=(const MemTableList&);
+
+ private:
+  std::list<MemTable*> memlist_;
+  int size_;
+
+  // the number of elements that still need flushing
+  int num_flush_not_started_;
+
+  // committing in progress
+  bool commit_in_progress_;
+
+  // Requested a flush of all memtables to storage
+  bool flush_requested_;
+
+};
+
+}  // namespace rocksdb
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@ -0,0 +1,197 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "merge_helper.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include <string>
+#include <stdio.h>
+
+namespace rocksdb {
+
+// PRE:  iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+//       keys_, operands_ are updated to reflect the merge result.
+//       keys_ stores the list of keys encountered while merging.
+//       operands_ stores the list of merge operands encountered while merging.
+//       keys_[i] corresponds to operands_[i] for each i.
+void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
+                             bool at_bottom, shared_ptr<Statistics> stats) {
+  // Get a copy of the internal key, before it's invalidated by iter->Next()
+  // Also maintain the list of merge operands seen.
+  keys_.clear();
+  operands_.clear();
+  keys_.push_front(iter->key().ToString());
+  operands_.push_front(iter->value().ToString());
+
+  success_ = false;   // Will become true if we hit Put/Delete or bottom
+
+  // We need to parse the internal key again as the parsed key is
+  // backed by the internal key!
+  // Assume no internal key corruption as it has been successfully parsed
+  // by the caller.
+  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
+  ParsedInternalKey orig_ikey;
+  ParseInternalKey(keys_.back(), &orig_ikey);
+
+  bool hit_the_next_user_key = false;
+  ParsedInternalKey ikey;
+  std::string merge_result;  // Temporary value for merge results
+  for (iter->Next(); iter->Valid(); iter->Next()) {
+    assert(operands_.size() >= 1);        // Should be invariants!
+    assert(keys_.size() == operands_.size());
+
+    if (!ParseInternalKey(iter->key(), &ikey)) {
+      // stop at corrupted key
+      if (assert_valid_internal_key_) {
+        assert(!"corrupted internal key is not expected");
+      }
+      break;
+    }
+
+    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
+      // hit a different user key, stop right here
+      hit_the_next_user_key = true;
+      break;
+    }
+
+    if (stop_before && ikey.sequence <= stop_before) {
+      // hit an entry that's visible by the previous snapshot, can't touch that
+      break;
+    }
+
+    // At this point we are guaranteed that we need to process this key.
+
+    if (kTypeDeletion == ikey.type) {
+      // hit a delete
+      //   => merge nullptr with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Return a success if the merge passes.
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, nullptr,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry (before doing anything else)
+      iter->Next();
+      return;
+    }
+
+    if (kTypeValue == ikey.type) {
+      // hit a put
+      //   => merge the put value with operands_
+      //   => store result in operands_.back() (and update keys_.back())
+      //   => change the entry type to kTypeValue for keys_.back()
+      // We are done! Success!
+      const Slice value = iter->value();
+      success_ = user_merge_operator_->FullMerge(ikey.user_key, &value,
+                                                 operands_, &merge_result,
+                                                 logger_);
+
+      // We store the result in keys_.back() and operands_.back()
+      // if nothing went wrong (i.e.: no operand corruption on disk)
+      if (success_) {
+        std::string& key = keys_.back();  // The original key encountered
+        orig_ikey.type = kTypeValue;
+        UpdateInternalKey(&key[0], key.size(),
+                          orig_ikey.sequence, orig_ikey.type);
+        swap(operands_.back(), merge_result);
+      } else {
+        RecordTick(stats, NUMBER_MERGE_FAILURES);
+      }
+
+      // move iter to the next entry
+      iter->Next();
+      return;
+    }
+
+    if (kTypeMerge == ikey.type) {
+      // hit a merge
+      //   => merge the operand into the front of the operands_ list
+      //   => use the user's associative merge function to determine how.
+      //   => then continue because we haven't yet seen a Put/Delete.
+      assert(!operands_.empty()); // Should have at least one element in it
+
+      keys_.push_front(iter->key().ToString());
+      operands_.push_front(iter->value().ToString());
+      while (operands_.size() >= 2) {
+        // Returns false when the merge_operator can no longer process it
+        if (user_merge_operator_->PartialMerge(ikey.user_key,
+                                               Slice(operands_[0]),
+                                               Slice(operands_[1]),
+                                               &merge_result,
+                                               logger_)) {
+          // Merging of operands (associative merge) was successful.
+          // Replace these frontmost two operands with the merge result
+          keys_.pop_front();
+          operands_.pop_front();
+          swap(operands_.front(), merge_result);
+        } else {
+          // Merging of operands (associative merge) returned false.
+          // The user merge_operator does not know how to merge these operands.
+          // So we just stack them up until we find a Put/Delete or end of key.
+          break;
+        }
+      }
+      continue;
+    }
+  }
+
+  // We are sure we have seen this key's entire history if we are at the
+  // last level and exhausted all internal keys of this user key.
+  // NOTE: !iter->Valid() does not necessarily mean we hit the
+  // beginning of a user key, as versions of a user key might be
+  // split into multiple files (even files on the same level)
+  // and some files might not be included in the compaction/merge.
+  //
+  // There are also cases where we have seen the root of history of this
+  // key without being sure of it. Then, we simply miss the opportunity
+  // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+  // sure that all merge-operands on the same level get compacted together,
+  // this will simply lead to these merge operands moving to the next level.
+  //
+  // So, we only perform the following logic (to merge all operands together
+  // without a Put/Delete) if we are certain that we have seen the end of key.
+  bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom;
+  if (surely_seen_the_beginning) {
+    // do a final merge with nullptr as the existing value and say
+    // bye to the merge type (it's now converted to a Put)
+    assert(kTypeMerge == orig_ikey.type);
+    assert(operands_.size() >= 1);
+    assert(operands_.size() == keys_.size());
+    success_ = user_merge_operator_->FullMerge(orig_ikey.user_key, nullptr,
+                                               operands_, &merge_result,
+                                               logger_);
+
+    if (success_) {
+      std::string& key = keys_.back();  // The original key encountered
+      orig_ikey.type = kTypeValue;
+      UpdateInternalKey(&key[0], key.size(),
+                        orig_ikey.sequence, orig_ikey.type);
+
+      // The final value() is always stored in operands_.back()
+      swap(operands_.back(),merge_result);
+    } else {
+      RecordTick(stats, NUMBER_MERGE_FAILURES);
+      // Do nothing if not success_. Leave keys() and operands() as they are.
+    }
+  }
+}
+
+} // namespace rocksdb
--- a/db/merge_helper.h
+++ b/db/merge_helper.h
@ -0,0 +1,102 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef MERGE_HELPER_H
+#define MERGE_HELPER_H
+
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include <string>
+#include <deque>
+
+namespace rocksdb {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+
+class MergeHelper {
+ public:
+  MergeHelper(const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator,
+              Logger* logger,
+              bool assert_valid_internal_key)
+      : user_comparator_(user_comparator),
+        user_merge_operator_(user_merge_operator),
+        logger_(logger),
+        assert_valid_internal_key_(assert_valid_internal_key),
+        keys_(),
+        operands_(),
+        success_(false) {}
+
+  // Merge entries until we hit
+  //     - a corrupted key
+  //     - a Put/Delete,
+  //     - a different user key,
+  //     - a specific sequence number (snapshot boundary),
+  //  or - the end of iteration
+  // iter: (IN)  points to the first merge type entry
+  //       (OUT) points to the first entry not included in the merge process
+  // stop_before: (IN) a sequence number that merge should not cross.
+  //                   0 means no restriction
+  // at_bottom:   (IN) true if the iterator covers the bottem level, which means
+  //                   we could reach the start of the history of this user key.
+  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
+                  bool at_bottom = false, shared_ptr<Statistics> stats=nullptr);
+
+  // Query the merge result
+  // These are valid until the next MergeUntil call
+  // If the merging was successful:
+  //   - IsSuccess() will be true
+  //   - key() will have the latest sequence number of the merges.
+  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - value() will be the result of merging all the operands together
+  //   - The user should ignore keys() and values().
+  //
+  //   IMPORTANT 1: the key type could change after the MergeUntil call.
+  //        Put/Delete + Merge + ... + Merge => Put
+  //        Merge + ... + Merge => Merge
+  //
+  // If the merge operator is not associative, and if a Put/Delete is not found
+  // then the merging will be unsuccessful. In this case:
+  //   - IsSuccess() will be false
+  //   - keys() contains the list of internal keys seen in order of iteration.
+  //   - values() contains the list of values (merges) seen in the same order.
+  //              values() is parallel to keys() so that the first entry in
+  //              keys() is the key associated with the first entry in values()
+  //              and so on. These lists will be the same length.
+  //              All of these pairs will be merges over the same user key.
+  //              See IMPORTANT 2 note below.
+  //   - The user should ignore key() and value().
+  //
+  //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+  //                So keys().back() was the first key seen by iterator.
+  // TODO: Re-style this comment to be like the first one
+  bool IsSuccess() { return success_; }
+  Slice key() { assert(success_); return Slice(keys_.back()); }
+  Slice value() { assert(success_); return Slice(operands_.back()); }
+  const std::deque<std::string>& keys() { assert(!success_); return keys_; }
+  const std::deque<std::string>& values() {
+    assert(!success_); return operands_;
+  }
+
+ private:
+  const Comparator* user_comparator_;
+  const MergeOperator* user_merge_operator_;
+  Logger* logger_;
+  bool assert_valid_internal_key_; // enforce no internal key corruption?
+
+  // the scratch area that holds the result of MergeUntil
+  // valid up to the next MergeUntil call
+  std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
+  std::deque<std::string> operands_;  // Parallel with keys_; stores the values
+  bool success_;
+};
+
+} // namespace rocksdb
+
+#endif
--- a/db/merge_operator.cc
+++ b/db/merge_operator.cc
@ -0,0 +1,53 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace rocksdb {
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::FullMerge(
+    const Slice& key,
+    const Slice* existing_value,
+    const std::deque<std::string>& operand_list,
+    std::string* new_value,
+    Logger* logger) const {
+
+  // Simply loop through the operands
+  Slice temp_existing;
+  std::string temp_value;
+  for (const auto& operand : operand_list) {
+    Slice value(operand);
+    if (!Merge(key, existing_value, value, &temp_value, logger)) {
+      return false;
+    }
+    swap(temp_value, *new_value);
+    temp_existing = Slice(*new_value);
+    existing_value = &temp_existing;
+  }
+
+  // The result will be in *new_value. All merges succeeded.
+  return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(
+    const Slice& key,
+    const Slice& left_operand,
+    const Slice& right_operand,
+    std::string* new_value,
+    Logger* logger) const {
+
+  return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace rocksdb
--- a/db/merge_test.cc
+++ b/db/merge_test.cc
@ -0,0 +1,275 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <assert.h>
+#include <memory>
+#include <iostream>
+
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "utilities/merge_operators.h"
+#include "util/testharness.h"
+#include "utilities/utility_db.h"
+
+using namespace std;
+using namespace rocksdb;
+
+
+std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false) {
+  DB* db;
+  StackableDB* sdb;
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  Status s;
+  DestroyDB(dbname, Options());
+  if (ttl) {
+    cout << "Opening database with TTL\n";
+    s = UtilityDB::OpenTtlDB(options, dbname, &sdb);
+    db = sdb;
+  } else {
+    s = DB::Open(options, dbname, &db);
+  }
+  if (!s.ok()) {
+    cerr << s.ToString() << endl;
+    assert(false);
+  }
+  return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
+class Counters {
+
+ protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_;
+
+ public:
+  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : db_(db),
+        put_option_(),
+        get_option_(),
+        delete_option_(),
+        default_(defaultCount) {
+    assert(db_);
+  }
+
+  virtual ~Counters() {}
+
+  // public interface of Counters.
+  // All four functions return false
+  // if the underlying level db operation failed.
+
+  // mapped to a levedb Put
+  bool set(const string& key, uint64_t value) {
+    // just treat the internal rep of int64 as the string
+    Slice slice((char *)&value, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Delete
+  bool remove(const string& key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const string& key, uint64_t *value) {
+    string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // 'add' is implemented as get -> modify -> set
+  // An alternative is a single merge operation, see MergeBasedCounters
+  virtual bool add(const string& key, uint64_t value) {
+    uint64_t base = default_;
+    return get(key, &base) && set(key, base + value);
+  }
+
+
+  // convenience functions for testing
+  void assert_set(const string& key, uint64_t value) {
+    assert(set(key, value));
+  }
+
+  void assert_remove(const string& key) {
+    assert(remove(key));
+  }
+
+  uint64_t assert_get(const string& key) {
+    uint64_t value = default_;
+    assert(get(key, &value));
+    return value;
+  }
+
+  void assert_add(const string& key, uint64_t value) {
+    assert(add(key, value));
+  }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+  WriteOptions merge_option_; // for merge
+
+ public:
+  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+      : Counters(db, defaultCount),
+        merge_option_() {
+  }
+
+  // mapped to a rocksdb Merge operation
+  virtual bool add(const string& key, uint64_t value) override {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      cerr << s.ToString() << endl;
+      return false;
+    }
+  }
+};
+
+void dumpDb(DB* db) {
+  auto it = unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    uint64_t value = DecodeFixed64(it->value().data());
+    cout << it->key().ToString() << ": "  << value << endl;
+  }
+  assert(it->status().ok());  // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+
+  FlushOptions o;
+  o.wait = true;
+
+  counters.assert_set("a", 1);
+
+  if (test_compaction) db->Flush(o);
+
+  assert(counters.assert_get("a") == 1);
+
+  counters.assert_remove("b");
+
+  // defaut value is 0 if non-existent
+  assert(counters.assert_get("b") == 0);
+
+  counters.assert_add("a", 2);
+
+  if (test_compaction) db->Flush(o);
+
+  // 1+2 = 3
+  assert(counters.assert_get("a")== 3);
+
+  dumpDb(db);
+
+  std::cout << "1\n";
+
+  // 1+...+49 = ?
+  uint64_t sum = 0;
+  for (int i = 1; i < 50; i++) {
+    counters.assert_add("b", i);
+    sum += i;
+  }
+  assert(counters.assert_get("b") == sum);
+
+  std::cout << "2\n";
+  dumpDb(db);
+
+  std::cout << "3\n";
+
+  if (test_compaction) {
+    db->Flush(o);
+
+    cout << "Compaction started ...\n";
+    db->CompactRange(nullptr, nullptr);
+    cout << "Compaction ended\n";
+
+    dumpDb(db);
+
+    assert(counters.assert_get("a")== 3);
+    assert(counters.assert_get("b") == sum);
+  }
+}
+
+void runTest(int argc, const string& dbname, const bool use_ttl = false) {
+  auto db = OpenDb(dbname, use_ttl);
+
+  {
+    cout << "Test read-modify-write counters... \n";
+    Counters counters(db, 0);
+    testCounters(counters, db.get(), true);
+  }
+
+  bool compact = false;
+  if (argc > 1) {
+    compact = true;
+    cout << "Turn on Compaction\n";
+  }
+
+  {
+    cout << "Test merge-based counters... \n";
+    MergeBasedCounters counters(db, 0);
+    testCounters(counters, db.get(), compact);
+  }
+
+  DestroyDB(dbname, Options());
+}
+
+int main(int argc, char *argv[]) {
+  //TODO: Make this test like a general rocksdb unit-test
+  runTest(argc, test::TmpDir() + "/merge_testdb");
+  runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
+  return 0;
+}
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@ -0,0 +1,328 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "/usr/include/valgrind/callgrind.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test";
+
+namespace rocksdb {
+
+std::shared_ptr<DB> OpenDb() {
+    DB* db;
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    if (FLAGS_use_set_based_memetable) {
+      auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0);
+      options.memtable_factory =
+        std::make_shared<rocksdb::PrefixHashRepFactory>(prefix_extractor);
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest { };
+
+TEST(PerfContextTest, SeekIntoDeletion) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    db->Put(write_options, key, value);
+  }
+
+  for (int i = 0; i < FLAGS_total_keys -1 ; ++i) {
+    std::string key = "k" + std::to_string(i);
+    db->Delete(write_options, key);
+  }
+
+  HistogramImpl hist_get;
+  HistogramImpl hist_get_time;
+  for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value;
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    auto status = db->Get(read_options, key, &value);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    ASSERT_TRUE(status.IsNotFound());
+    hist_get.Add(perf_context.user_key_comparison_count);
+    hist_get_time.Add(elapsed_nanos);
+  }
+
+  std::cout << "Get uesr key comparison: \n" << hist_get.ToString()
+            << "Get time: \n" << hist_get_time.ToString();
+
+  HistogramImpl hist_seek_to_first;
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  perf_context.Reset();
+  StopWatchNano timer(Env::Default(), true);
+  iter->SeekToFirst();
+  hist_seek_to_first.Add(perf_context.user_key_comparison_count);
+  auto elapsed_nanos = timer.ElapsedNanos();
+
+  std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString()
+            << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n"
+            << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n"
+            << "elapsed: " << elapsed_nanos << "\n";
+
+  HistogramImpl hist_seek;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    std::string key = "k" + std::to_string(i);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    auto elapsed_nanos = timer.ElapsedNanos();
+    hist_seek.Add(perf_context.user_key_comparison_count);
+    std::cout << "seek cmp: " << perf_context.user_key_comparison_count
+              << " ikey skipped " << perf_context.internal_key_skipped_count
+              << " idelete skipped " << perf_context.internal_delete_skipped_count
+              << " elapsed: " << elapsed_nanos << "ns\n";
+
+    perf_context.Reset();
+    ASSERT_TRUE(iter->Valid());
+    StopWatchNano timer2(Env::Default(), true);
+    iter->Next();
+    auto elapsed_nanos2 = timer2.ElapsedNanos();
+    std::cout << "next cmp: " << perf_context.user_key_comparison_count
+              << "elapsed: " << elapsed_nanos2 << "ns\n";
+  }
+
+  std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString();
+}
+
+TEST(PerfContextTest, StopWatchNanoOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatchNano timer(Env::Default(), true);
+  for (auto& timing : timings) {
+    timing = timer.ElapsedNanos(true /* reset */);
+  }
+
+  HistogramImpl histogram;
+  for (const auto timing : timings) {
+    histogram.Add(timing);
+  }
+
+  std::cout << histogram.ToString();
+}
+
+TEST(PerfContextTest, StopWatchOverhead) {
+  // profile the timer cost by itself!
+  const int kTotalIterations = 1000000;
+  std::vector<uint64_t> timings(kTotalIterations);
+
+  StopWatch timer(Env::Default());
+  for (auto& timing : timings) {
+    timing = timer.ElapsedMicros();
+  }
+
+  HistogramImpl histogram;
+  uint64_t prev_timing = 0;
+  for (const auto timing : timings) {
+    histogram.Add(timing - prev_timing);
+    prev_timing = timing;
+  }
+
+  std::cout << histogram.ToString();
+}
+
+void ProfileKeyComparison() {
+  DestroyDB(kDbName, Options());    // Start this test with a fresh DB
+
+  auto db = OpenDb();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  HistogramImpl hist_put;
+  HistogramImpl hist_get;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    db->Put(write_options, key, value);
+    hist_put.Add(perf_context.user_key_comparison_count);
+
+    perf_context.Reset();
+    db->Get(read_options, key, &value);
+    hist_get.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Put uesr key comparison: \n" << hist_put.ToString()
+            << "Get uesr key comparison: \n" << hist_get.ToString();
+
+}
+
+TEST(PerfContextTest, KeyComparisonCount) {
+  SetPerfLevel(kEnableCount);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kDisable);
+  ProfileKeyComparison();
+
+  SetPerfLevel(kEnableTime);
+  ProfileKeyComparison();
+}
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, worst-case Seek Key comparison
+// is close to the total number of keys (linear), when there is only one
+// memtable. When there are two memtables, even the avg Seek Key comparison
+// starts to become linear to the input size.
+
+TEST(PerfContextTest, SeekKeyComparison) {
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+
+  std::vector<int> keys;
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    keys.push_back(i);
+  }
+
+  if (FLAGS_random_key) {
+    std::random_shuffle(keys.begin(), keys.end());
+  }
+
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_wal_time;
+  HistogramImpl hist_time_diff;
+
+  SetPerfLevel(kEnableTime);
+  StopWatchNano timer(Env::Default());
+  for (const int i : keys) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    perf_context.Reset();
+    timer.Start();
+    db->Put(write_options, key, value);
+    auto put_time = timer.ElapsedNanos();
+    hist_put_time.Add(put_time);
+    hist_wal_time.Add(perf_context.wal_write_time);
+    hist_time_diff.Add(put_time - perf_context.wal_write_time);
+  }
+
+  std::cout << "Put time:\n" << hist_put_time.ToString()
+            << "WAL time:\n" << hist_wal_time.ToString()
+            << "time diff:\n" << hist_time_diff.ToString();
+
+  HistogramImpl hist_seek;
+  HistogramImpl hist_next;
+
+  for (int i = 0; i < FLAGS_total_keys; ++i) {
+    std::string key = "k" + std::to_string(i);
+    std::string value = "v" + std::to_string(i);
+
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+    perf_context.Reset();
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->value().ToString(), value);
+    hist_seek.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+  for (iter->SeekToFirst(); iter->Valid();) {
+    perf_context.Reset();
+    iter->Next();
+    hist_next.Add(perf_context.user_key_comparison_count);
+  }
+
+  std::cout << "Seek:\n" << hist_seek.ToString()
+            << "Next:\n" << hist_next.ToString();
+}
+
+}
+
+int main(int argc, char** argv) {
+
+  for (int i = 1; i < argc; i++) {
+    int n;
+    char junk;
+
+    if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    }
+
+    if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+      FLAGS_total_keys = n;
+    }
+
+    if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_random_key = n;
+    }
+
+    if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+        (n == 0 || n == 1)) {
+      FLAGS_use_set_based_memetable = n;
+    }
+
+  }
+
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
--- a/db/prefix_filter_iterator.h
+++ b/db/prefix_filter_iterator.h
@ -0,0 +1,73 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Wrap an underlying iterator, but exclude any results not starting
+// with a given prefix.  Seeking to keys not beginning with the prefix
+// is invalid, and SeekToLast is not implemented (that would be
+// non-trivial), but otherwise this iterator will behave just like the
+// underlying iterator would if there happened to be no non-matching
+// keys in the dataset.
+
+#pragma once
+#include "rocksdb/iterator.h"
+
+namespace rocksdb {
+
+class PrefixFilterIterator : public Iterator {
+ private:
+  Iterator* iter_;
+  const Slice &prefix_;
+  const SliceTransform *prefix_extractor_;
+  Status status_;
+
+ public:
+  PrefixFilterIterator(Iterator* iter,
+                       const Slice &prefix,
+                       const SliceTransform* prefix_extractor)
+                             : iter_(iter), prefix_(prefix),
+                               prefix_extractor_(prefix_extractor),
+                               status_(Status::OK()) {
+    if (prefix_extractor == nullptr) {
+      status_ = Status::InvalidArgument("A prefix filter may not be used "
+                                        "unless a function is also defined "
+                                        "for extracting prefixes");
+    } else if (!prefix_extractor_->InRange(prefix)) {
+      status_ = Status::InvalidArgument("Must provide a slice for prefix which"
+                                        "is a prefix for some key");
+    }
+  }
+  ~PrefixFilterIterator() {
+    delete iter_;
+  }
+  Slice key() const { return iter_->key(); }
+  Slice value() const { return iter_->value(); }
+  Status status() const {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return iter_->status();
+  }
+  void Next() { iter_->Next(); }
+  void Prev() { iter_->Prev(); }
+  void Seek(const Slice& k) {
+    if (prefix_extractor_->Transform(k) == prefix_) {
+      iter_->Seek(k);
+    } else {
+      status_ = Status::InvalidArgument("Seek must begin with target prefix");
+    }
+  }
+  void SeekToFirst() {
+    Seek(prefix_);
+  }
+  void SeekToLast() {
+    status_ = Status::NotSupported("SeekToLast is incompatible with prefixes");
+  }
+  bool Valid() const {
+    return (status_.ok() && iter_->Valid() &&
+            prefix_extractor_->Transform(iter_->key()) == prefix_);
+  }
+};
+
+}  // namespace rocksdb
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@ -0,0 +1,319 @@
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/perf_context.h"
+#include "util/histogram.h"
+#include "util/stop_watch.h"
+#include "util/testharness.h"
+
+DEFINE_bool(use_prefix_hash_memtable, true, "");
+DEFINE_bool(use_nolock_version, true, "");
+DEFINE_bool(trigger_deadlock, false,
+            "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_uint64(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 1000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 10, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 1000000000, "");
+DEFINE_int64(max_write_buffer_number, 8, "");
+DEFINE_int64(min_write_buffer_number_to_merge, 7, "");
+
+// Path to the database on file system
+const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test";
+
+namespace rocksdb {
+
+struct TestKey {
+  uint64_t prefix;
+  uint64_t sorted;
+
+  TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {}
+};
+
+// return a slice backed by test_key
+inline Slice TestKeyToSlice(const TestKey& test_key) {
+  return Slice((const char*)&test_key, sizeof(test_key));
+}
+
+inline const TestKey* SliceToTestKey(const Slice& slice) {
+  assert(slice.size() == sizeof(TestKey));
+  return (const TestKey*)slice.data();
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const TestKey* key_a = SliceToTestKey(a);
+    const TestKey* key_b = SliceToTestKey(b);
+    if (key_a->prefix != key_b->prefix) {
+      if (key_a->prefix < key_b->prefix) return -1;
+      if (key_a->prefix > key_b->prefix) return 1;
+      assert(false);
+    } else {
+      if (key_a->sorted < key_b->sorted) return -1;
+      if (key_a->sorted > key_b->sorted) return 1;
+      if (key_a->sorted == key_b->sorted) return 0;
+      assert(false);
+    }
+    assert(false);
+    return 0;
+  }
+
+  virtual const char* Name() const override {
+    return "TestKeyComparator";
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {}
+
+};
+
+class PrefixTest {
+ public:
+  std::shared_ptr<DB> OpenDb() {
+    DB* db;
+
+    options.create_if_missing = true;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options.min_write_buffer_number_to_merge =
+      FLAGS_min_write_buffer_number_to_merge;
+
+    options.comparator = new TestKeyComparator();
+    if (FLAGS_use_prefix_hash_memtable) {
+      auto prefix_extractor = NewFixedPrefixTransform(8);
+      options.prefix_extractor = prefix_extractor;
+      if (FLAGS_use_nolock_version) {
+        options.memtable_factory.reset(NewHashSkipListRepFactory(
+            prefix_extractor, FLAGS_bucket_count));
+      } else {
+        options.memtable_factory =
+          std::make_shared<rocksdb::PrefixHashRepFactory>(
+            prefix_extractor, FLAGS_bucket_count, FLAGS_num_locks);
+      }
+    }
+
+    Status s = DB::Open(options, kDbName,  &db);
+    ASSERT_OK(s);
+    return std::shared_ptr<DB>(db);
+  }
+  ~PrefixTest() {
+    delete options.comparator;
+  }
+ protected:
+  Options options;
+};
+
+TEST(PrefixTest, DynamicPrefixIterator) {
+
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::vector<uint64_t> prefixes;
+  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+    prefixes.push_back(i);
+  }
+
+  if (FLAGS_random_prefix) {
+    std::random_shuffle(prefixes.begin(), prefixes.end());
+  }
+
+  // insert x random prefix, each with y continuous element.
+  for (auto prefix : prefixes) {
+     for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+      TestKey test_key(prefix, sorted);
+
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(sorted);
+
+      ASSERT_OK(db->Put(write_options, key, value));
+    }
+  }
+
+  // test seek existing keys
+  HistogramImpl hist_seek_time;
+  HistogramImpl hist_seek_comparison;
+
+  if (FLAGS_use_prefix_hash_memtable) {
+    read_options.prefix_seek = true;
+  }
+  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+  for (auto prefix : prefixes) {
+    TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+    Slice key = TestKeyToSlice(test_key);
+    std::string value = "v" + std::to_string(0);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    uint64_t total_keys = 0;
+    for (iter->Seek(key); iter->Valid(); iter->Next()) {
+      if (FLAGS_trigger_deadlock) {
+        std::cout << "Behold the deadlock!\n";
+        db->Delete(write_options, iter->key());
+      }
+      auto test_key = SliceToTestKey(iter->key());
+      if (test_key->prefix != prefix) break;
+      total_keys++;
+    }
+    hist_seek_time.Add(timer.ElapsedNanos());
+    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2);
+  }
+
+  std::cout << "Seek key comparison: \n"
+            << hist_seek_comparison.ToString()
+            << "Seek time: \n"
+            << hist_seek_time.ToString();
+
+  // test non-existing keys
+  HistogramImpl hist_no_seek_time;
+  HistogramImpl hist_no_seek_comparison;
+
+  for (auto prefix = FLAGS_total_prefixes;
+       prefix < FLAGS_total_prefixes + 100;
+       prefix++) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    hist_no_seek_time.Add(timer.ElapsedNanos());
+    hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  std::cout << "non-existing Seek key comparison: \n"
+            << hist_no_seek_comparison.ToString()
+            << "non-existing Seek time: \n"
+            << hist_no_seek_time.ToString();
+}
+
+TEST(PrefixTest, PrefixHash) {
+
+  DestroyDB(kDbName, Options());
+  auto db = OpenDb();
+  WriteOptions write_options;
+  ReadOptions read_options;
+
+  std::vector<uint64_t> prefixes;
+  for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+    prefixes.push_back(i);
+  }
+
+  if (FLAGS_random_prefix) {
+    std::random_shuffle(prefixes.begin(), prefixes.end());
+  }
+
+  // insert x random prefix, each with y continuous element.
+  HistogramImpl hist_put_time;
+  HistogramImpl hist_put_comparison;
+
+  for (auto prefix : prefixes) {
+     for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+      TestKey test_key(prefix, sorted);
+
+      Slice key = TestKeyToSlice(test_key);
+      std::string value = "v" + std::to_string(sorted);
+
+      perf_context.Reset();
+      StopWatchNano timer(Env::Default(), true);
+      ASSERT_OK(db->Put(write_options, key, value));
+      hist_put_time.Add(timer.ElapsedNanos());
+      hist_put_comparison.Add(perf_context.user_key_comparison_count);
+    }
+  }
+
+  std::cout << "Put key comparison: \n" << hist_put_comparison.ToString()
+            << "Put time: \n" << hist_put_time.ToString();
+
+
+  // test seek existing keys
+  HistogramImpl hist_seek_time;
+  HistogramImpl hist_seek_comparison;
+
+  for (auto prefix : prefixes) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+    std::string value = "v" + std::to_string(0);
+
+    Slice key_prefix;
+    if (FLAGS_use_prefix_hash_memtable) {
+      key_prefix = options.prefix_extractor->Transform(key);
+      read_options.prefix = &key_prefix;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    uint64_t total_keys = 0;
+    for (iter->Seek(key); iter->Valid(); iter->Next()) {
+      if (FLAGS_trigger_deadlock) {
+        std::cout << "Behold the deadlock!\n";
+        db->Delete(write_options, iter->key());
+      }
+      auto test_key = SliceToTestKey(iter->key());
+      if (test_key->prefix != prefix) break;
+      total_keys++;
+    }
+    hist_seek_time.Add(timer.ElapsedNanos());
+    hist_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_EQ(total_keys, FLAGS_items_per_prefix);
+  }
+
+  std::cout << "Seek key comparison: \n"
+            << hist_seek_comparison.ToString()
+            << "Seek time: \n"
+            << hist_seek_time.ToString();
+
+  // test non-existing keys
+  HistogramImpl hist_no_seek_time;
+  HistogramImpl hist_no_seek_comparison;
+
+  for (auto prefix = FLAGS_total_prefixes;
+       prefix < FLAGS_total_prefixes + 100;
+       prefix++) {
+    TestKey test_key(prefix, 0);
+    Slice key = TestKeyToSlice(test_key);
+
+    if (FLAGS_use_prefix_hash_memtable) {
+      Slice key_prefix = options.prefix_extractor->Transform(key);
+      read_options.prefix = &key_prefix;
+    }
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+    perf_context.Reset();
+    StopWatchNano timer(Env::Default(), true);
+    iter->Seek(key);
+    hist_no_seek_time.Add(timer.ElapsedNanos());
+    hist_no_seek_comparison.Add(perf_context.user_key_comparison_count);
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  std::cout << "non-existing Seek key comparison: \n"
+            << hist_no_seek_comparison.ToString()
+            << "non-existing Seek time: \n"
+            << hist_no_seek_time.ToString();
+}
+
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  std::cout << kDbName << "\n";
+
+  rocksdb::test::RunAllTests();
+  return 0;
+}
--- a/db/repair.cc
+++ b/db/repair.cc
@ -0,0 +1,390 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
+    edit_ = new VersionEdit(options.num_levels);
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    delete edit_;
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(options_.info_log,
+          "**** Repaired rocksdb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber min_sequence;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
+  Options const options_;
+  TableCache* table_cache_;
+  VersionEdit* edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+  const EnvOptions storage_options_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      std::shared_ptr<Logger> info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    unique_ptr<SequentialFile> lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(std::move(lfile), &reporter, false/*do not checksum*/,
+                       0/*initial_offset*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable* mem = new MemTable(icmp_, options_.memtable_factory,
+      options_.num_levels);
+    mem->Ref();
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+
+    // Do not record a version edit for this conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem->NewIterator();
+    status = BuildTable(dbname_, env_, options_, storage_options_,
+                        table_cache_, iter, &meta,
+                        icmp_.user_comparator(), 0, 0, true);
+    delete iter;
+    mem->Unref();
+    mem = nullptr;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (size_t i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), storage_options_, t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->min_sequence = 0;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence < t->min_sequence) {
+          t->min_sequence = parsed.sequence;
+        }
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    unique_ptr<WritableFile> file;
+    Status status = env_->NewWritableFile(tmp, &file, storage_options_);
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_->SetComparatorName(icmp_.user_comparator()->Name());
+    edit_->SetLogNumber(0);
+    edit_->SetNextFile(next_file_number_);
+    edit_->SetLastSequence(max_sequence);
+
+    for (size_t i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_->AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest,
+                    t.min_sequence, t.max_sequence);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(std::move(file));
+      std::string record;
+      edit_->EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != nullptr) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}  // namespace
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}  // namespace rocksdb
--- a/db/simple_table_db_test.cc
+++ b/db/simple_table_db_test.cc
@ -0,0 +1,793 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <algorithm>
+#include <set>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/db_statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using std::unique_ptr;
+
+namespace rocksdb {
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+// SimpleTable requires the input key size to be fixed 16 bytes, value cannot
+// be longer than 150000 bytes and stored data on disk in this format:
+// +--------------------------------------------+  <= key1 offset
+// | key1            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value1                                     |
+// |                                            |
+// +----------------------------------------+---+  <= key2 offset
+// | key2            | value_size (4 bytes) |   |
+// +----------------------------------------+   |
+// | value2                                     |
+// |                                            |
+// |        ......                              |
+// +-----------------+--------------------------+   <= index_block_offset
+// | key1            | key1 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key2            | key2 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// | key3            | key3 offset (8 bytes)    |
+// +-----------------+--------------------------+
+// |        ......                              |
+// +-----------------+------------+-------------+
+// | index_block_offset (8 bytes) |
+// +------------------------------+
+
+// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built
+// as production quality.
+class SimpleTableReader: public TableReader {
+public:
+  // Attempt to open the table that is stored in bytes [0..file_size)
+  // of "file", and read the metadata entries necessary to allow
+  // retrieving data from the table.
+  //
+  // If successful, returns ok and sets "*table" to the newly opened
+  // table.  The client should delete "*table" when no longer needed.
+  // If there was an error while initializing the table, sets "*table"
+  // to nullptr and returns a non-ok status.  Does not take ownership of
+  // "*source", but the client must ensure that "source" remains live
+  // for the duration of the returned table's lifetime.
+  //
+  // *file must remain live while this Table is in use.
+  static Status Open(const Options& options, const EnvOptions& soptions,
+                     unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+                     unique_ptr<TableReader>* table_reader);
+
+  bool PrefixMayMatch(const Slice& internal_prefix) override;
+
+  Iterator* NewIterator(const ReadOptions&) override;
+
+  Status Get(
+      const ReadOptions&, const Slice& key, void* arg,
+      bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
+      void (*mark_key_may_exist)(void*) = nullptr) override;
+
+  uint64_t ApproximateOffsetOf(const Slice& key) override;
+
+  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key) override;
+
+  void SetupForCompaction() override;
+
+  TableStats& GetTableStats() override;
+
+  ~SimpleTableReader();
+
+private:
+  struct Rep;
+  Rep* rep_;
+
+  explicit SimpleTableReader(Rep* rep) {
+    rep_ = rep;
+  }
+  friend class TableCache;
+  friend class SimpleTableIterator;
+
+  Status GetOffset(const Slice& target, uint64_t* offset);
+
+  // No copying allowed
+  explicit SimpleTableReader(const TableReader&) = delete;
+  void operator=(const TableReader&) = delete;
+};
+
+// Iterator to iterate SimpleTable
+class SimpleTableIterator: public Iterator {
+public:
+  explicit SimpleTableIterator(SimpleTableReader* table);
+  ~SimpleTableIterator();
+
+  bool Valid() const;
+
+  void SeekToFirst();
+
+  void SeekToLast();
+
+  void Seek(const Slice& target);
+
+  void Next();
+
+  void Prev();
+
+  Slice key() const;
+
+  Slice value() const;
+
+  Status status() const;
+
+private:
+  SimpleTableReader* table_;
+  uint64_t offset_;
+  uint64_t next_offset_;
+  Slice key_;
+  Slice value_;
+  char tmp_str_[4];
+  char* key_str_;
+  char* value_str_;
+  int value_str_len_;
+  Status status_;
+  // No copying allowed
+  SimpleTableIterator(const SimpleTableIterator&) = delete;
+  void operator=(const Iterator&) = delete;
+};
+
+struct SimpleTableReader::Rep {
+  ~Rep() {
+  }
+  Rep(const EnvOptions& storage_options, uint64_t index_start_offset,
+      int num_entries) :
+      soptions(storage_options), index_start_offset(index_start_offset),
+      num_entries(num_entries) {
+  }
+
+  Options options;
+  const EnvOptions& soptions;
+  Status status;
+  unique_ptr<RandomAccessFile> file;
+  uint64_t index_start_offset;
+  int num_entries;
+  TableStats table_stats;
+
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+};
+
+SimpleTableReader::~SimpleTableReader() {
+  delete rep_;
+}
+
+Status SimpleTableReader::Open(const Options& options,
+                               const EnvOptions& soptions,
+                               unique_ptr<RandomAccessFile> && file,
+                               uint64_t size,
+                               unique_ptr<TableReader>* table_reader) {
+  char footer_space[Rep::offset_length];
+  Slice footer_input;
+  Status s = file->Read(size - Rep::offset_length, Rep::offset_length,
+                        &footer_input, footer_space);
+  if (s.ok()) {
+    uint64_t index_start_offset = DecodeFixed64(footer_space);
+
+    int num_entries = (size - Rep::offset_length - index_start_offset)
+        / (Rep::GetInternalKeyLength() + Rep::offset_length);
+    SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions,
+                                                             index_start_offset,
+                                                             num_entries);
+
+    rep->file = std::move(file);
+    rep->options = options;
+    table_reader->reset(new SimpleTableReader(rep));
+  }
+  return s;
+}
+
+void SimpleTableReader::SetupForCompaction() {
+}
+
+TableStats& SimpleTableReader::GetTableStats() {
+  return rep_->table_stats;
+}
+
+bool SimpleTableReader::PrefixMayMatch(const Slice& internal_prefix) {
+  return true;
+}
+
+Iterator* SimpleTableReader::NewIterator(const ReadOptions& options) {
+  return new SimpleTableIterator(this);
+}
+
+Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
+  uint32_t left = 0;
+  uint32_t right = rep_->num_entries - 1;
+  char key_chars[Rep::GetInternalKeyLength()];
+  Slice tmp_slice;
+
+  uint32_t target_offset = 0;
+  while (left <= right) {
+    uint32_t mid = (left + right + 1) / 2;
+
+    uint64_t offset_to_read = rep_->index_start_offset
+        + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid;
+    Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(),
+                                &tmp_slice, key_chars);
+    if (!s.ok()) {
+      return s;
+    }
+
+    int compare_result = rep_->options.comparator->Compare(tmp_slice, target);
+
+    if (compare_result < 0) {
+      if (left == right) {
+        target_offset = right + 1;
+        break;
+      }
+      left = mid;
+    } else {
+      if (left == right) {
+        target_offset = left;
+        break;
+      }
+      right = mid - 1;
+    }
+  }
+
+  if (target_offset >= (uint32_t) rep_->num_entries) {
+    *offset = rep_->index_start_offset;
+    return Status::OK();
+  }
+
+  char value_offset_chars[Rep::offset_length];
+
+  int64_t offset_for_value_offset = rep_->index_start_offset
+      + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset
+      + Rep::GetInternalKeyLength();
+  Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length,
+                              &tmp_slice, value_offset_chars);
+  if (s.ok()) {
+    *offset = DecodeFixed64(value_offset_chars);
+  }
+  return s;
+}
+
+Status SimpleTableReader::Get(
+    const ReadOptions& options, const Slice& k, void* arg,
+    bool (*saver)(void*, const Slice&, const Slice&, bool),
+    void (*mark_key_may_exist)(void*)) {
+  Status s;
+  SimpleTableIterator* iter = new SimpleTableIterator(this);
+  for (iter->Seek(k); iter->Valid(); iter->Next()) {
+    if (!(*saver)(arg, iter->key(), iter->value(), true)) {
+      break;
+    }
+  }
+  s = iter->status();
+  delete iter;
+  return s;
+}
+
+bool SimpleTableReader::TEST_KeyInCache(const ReadOptions& options,
+                                        const Slice& key) {
+  return false;
+}
+
+uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) {
+  return 0;
+}
+
+SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) :
+    table_(table) {
+  key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()];
+  value_str_len_ = -1;
+  SeekToFirst();
+}
+
+SimpleTableIterator::~SimpleTableIterator() {
+ delete[] key_str_;
+ if (value_str_len_ >= 0) {
+   delete[] value_str_;
+ }
+}
+
+bool SimpleTableIterator::Valid() const {
+  return offset_ < table_->rep_->index_start_offset;
+}
+
+void SimpleTableIterator::SeekToFirst() {
+  next_offset_ = 0;
+  Next();
+}
+
+void SimpleTableIterator::SeekToLast() {
+  assert(false);
+}
+
+void SimpleTableIterator::Seek(const Slice& target) {
+  Status s = table_->GetOffset(target, &next_offset_);
+  if (!s.ok()) {
+    status_ = s;
+  }
+  Next();
+}
+
+void SimpleTableIterator::Next() {
+  offset_ = next_offset_;
+  if (offset_ >= table_->rep_->index_start_offset) {
+    return;
+  }
+  Slice result;
+  int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength();
+
+  Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result,
+                                      key_str_);
+  next_offset_ += internal_key_size;
+  key_ = result;
+
+  Slice value_size_slice;
+  s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_);
+  next_offset_ += 4;
+  uint32_t value_size = DecodeFixed32(tmp_str_);
+
+  Slice value_slice;
+  if ((int) value_size > value_str_len_) {
+    if (value_str_len_ >= 0) {
+      delete[] value_str_;
+    }
+    value_str_ = new char[value_size];
+    value_str_len_ = value_size;
+  }
+  s = table_->rep_->file->Read(next_offset_, value_size, &value_slice,
+                               value_str_);
+  next_offset_ += value_size;
+  value_ = value_slice;
+}
+
+void SimpleTableIterator::Prev() {
+  assert(false);
+}
+
+Slice SimpleTableIterator::key() const {
+  Log(table_->rep_->options.info_log, "key!!!!");
+  return key_;
+}
+
+Slice SimpleTableIterator::value() const {
+  return value_;
+}
+
+Status SimpleTableIterator::status() const {
+  return status_;
+}
+
+class SimpleTableBuilder: public TableBuilder {
+public:
+  // Create a builder that will store the contents of the table it is
+  // building in *file.  Does not close the file.  It is up to the
+  // caller to close the file after calling Finish(). The output file
+  // will be part of level specified by 'level'.  A value of -1 means
+  // that the caller does not know which level the output file will reside.
+  SimpleTableBuilder(const Options& options, WritableFile* file,
+                     CompressionType compression_type);
+
+  // REQUIRES: Either Finish() or Abandon() has been called.
+  ~SimpleTableBuilder();
+
+  // Add key,value to the table being constructed.
+  // REQUIRES: key is after any previously added key according to comparator.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Add(const Slice& key, const Slice& value) override;
+
+  // Return non-ok iff some error has been detected.
+  Status status() const override;
+
+  // Finish building the table.  Stops using the file passed to the
+  // constructor after this function returns.
+  // REQUIRES: Finish(), Abandon() have not been called
+  Status Finish() override;
+
+  // Indicate that the contents of this builder should be abandoned.  Stops
+  // using the file passed to the constructor after this function returns.
+  // If the caller is not going to call Finish(), it must call Abandon()
+  // before destroying this builder.
+  // REQUIRES: Finish(), Abandon() have not been called
+  void Abandon() override;
+
+  // Number of calls to Add() so far.
+  uint64_t NumEntries() const override;
+
+  // Size of the file generated so far.  If invoked after a successful
+  // Finish() call, returns the size of the final generated file.
+  uint64_t FileSize() const override;
+
+private:
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  SimpleTableBuilder(const SimpleTableBuilder&) = delete;
+  void operator=(const SimpleTableBuilder&) = delete;
+};
+
+struct SimpleTableBuilder::Rep {
+  Options options;
+  WritableFile* file;
+  uint64_t offset = 0;
+  Status status;
+
+  uint64_t num_entries = 0;
+
+  bool closed = false;  // Either Finish() or Abandon() has been called.
+
+  const static int user_key_size = 16;
+  const static int offset_length = 8;
+  const static int key_footer_len = 8;
+
+  static int GetInternalKeyLength() {
+    return user_key_size + key_footer_len;
+  }
+
+  std::string index;
+
+  Rep(const Options& opt, WritableFile* f) :
+      options(opt), file(f) {
+  }
+  ~Rep() {
+  }
+};
+
+SimpleTableBuilder::SimpleTableBuilder(const Options& options,
+                                       WritableFile* file,
+                                       CompressionType compression_type) :
+    rep_(new SimpleTableBuilder::Rep(options, file)) {
+}
+
+SimpleTableBuilder::~SimpleTableBuilder() {
+  delete (rep_);
+}
+
+void SimpleTableBuilder::Add(const Slice& key, const Slice& value) {
+  assert((int ) key.size() == Rep::GetInternalKeyLength());
+
+  // Update index
+  rep_->index.append(key.data(), key.size());
+  PutFixed64(&(rep_->index), rep_->offset);
+
+  // Write key-value pair
+  rep_->file->Append(key);
+  rep_->offset += Rep::GetInternalKeyLength();
+
+  std::string size;
+  int value_size = value.size();
+  PutFixed32(&size, value_size);
+  Slice sizeSlice(size);
+  rep_->file->Append(sizeSlice);
+  rep_->file->Append(value);
+  rep_->offset += value_size + 4;
+
+  rep_->num_entries++;
+}
+
+Status SimpleTableBuilder::status() const {
+  return Status::OK();
+}
+
+Status SimpleTableBuilder::Finish() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  r->closed = true;
+
+  uint64_t index_offset = rep_->offset;
+  Slice index_slice(rep_->index);
+  rep_->file->Append(index_slice);
+  rep_->offset += index_slice.size();
+
+  std::string index_offset_str;
+  PutFixed64(&index_offset_str, index_offset);
+  Slice foot_slice(index_offset_str);
+  rep_->file->Append(foot_slice);
+  rep_->offset += foot_slice.size();
+
+  return Status::OK();
+}
+
+void SimpleTableBuilder::Abandon() {
+  rep_->closed = true;
+}
+
+uint64_t SimpleTableBuilder::NumEntries() const {
+  return rep_->num_entries;
+}
+
+uint64_t SimpleTableBuilder::FileSize() const {
+  return rep_->offset;
+}
+
+class SimpleTableFactory: public TableFactory {
+public:
+  ~SimpleTableFactory() {
+  }
+  SimpleTableFactory() {
+  }
+  const char* Name() const override {
+    return "SimpleTable";
+  }
+  Status GetTableReader(const Options& options, const EnvOptions& soptions,
+                        unique_ptr<RandomAccessFile> && file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const;
+
+  TableBuilder* GetTableBuilder(const Options& options, WritableFile* file,
+                                CompressionType compression_type) const;
+};
+
+Status SimpleTableFactory::GetTableReader(
+    const Options& options, const EnvOptions& soptions,
+    unique_ptr<RandomAccessFile> && file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+
+  return SimpleTableReader::Open(options, soptions, std::move(file), file_size,
+                                 table_reader);
+}
+
+TableBuilder* SimpleTableFactory::GetTableBuilder(
+    const Options& options, WritableFile* file,
+    CompressionType compression_type) const {
+  return new SimpleTableBuilder(options, file, compression_type);
+}
+
+class SimpleTableDBTest {
+protected:
+public:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  SimpleTableDBTest() :
+      env_(Env::Default()) {
+    dbname_ = test::TmpDir() + "/simple_table_db_test";
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
+  ~SimpleTableDBTest() {
+    delete db_;
+    ASSERT_OK(DestroyDB(dbname_, Options()));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.table_factory.reset(new SimpleTableFactory());
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = nullptr) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(Options* options = nullptr) {
+    //Destroy using last options
+    Destroy(&last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(Options* options) {
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, *options));
+  }
+
+  Status PureReopen(Options* options, DB** db) {
+    return DB::Open(*options, dbname_, db);
+  }
+
+  Status TryReopen(Options* options = nullptr) {
+    delete db_;
+    db_ = nullptr;
+    Options opts;
+    if (options != nullptr) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const Slice& k, const Slice& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < db_->NumberLevels(); level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST(SimpleTableDBTest, Empty) {
+  ASSERT_TRUE(db_ != nullptr);
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+TEST(SimpleTableDBTest, ReadWrite) {
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  ASSERT_EQ("v1", Get("0000000000000foo"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("0000000000000foo", "v3"));
+  ASSERT_EQ("v3", Get("0000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+}
+
+TEST(SimpleTableDBTest, Flush) {
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  ASSERT_OK(Put("0000000000000bar", "v2"));
+  ASSERT_OK(Put("0000000000000foo", "v3"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v3", Get("0000000000000foo"));
+  ASSERT_EQ("v2", Get("0000000000000bar"));
+}
+
+TEST(SimpleTableDBTest, Flush2) {
+  ASSERT_OK(Put("0000000000000bar", "b"));
+  ASSERT_OK(Put("0000000000000foo", "v1"));
+  dbfull()->TEST_FlushMemTable();
+
+  ASSERT_OK(Put("0000000000000foo", "v2"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v2", Get("0000000000000foo"));
+
+  ASSERT_OK(Put("0000000000000eee", "v3"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v3", Get("0000000000000eee"));
+
+  ASSERT_OK(Delete("0000000000000bar"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+  ASSERT_OK(Put("0000000000000eee", "v5"));
+  dbfull()->TEST_FlushMemTable();
+  ASSERT_EQ("v5", Get("0000000000000eee"));
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key_______%06d", i);
+  return std::string(buf);
+}
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+TEST(SimpleTableDBTest, CompactionTrigger) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100 << 10; //100KB
+  options.num_levels = 3;
+  options.max_mem_compaction_level = 0;
+  options.level0_file_num_compaction_trigger = 3;
+  Reopen(&options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+      num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(RandomString(&rnd, 10000));
+      ASSERT_OK(Put(Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+  }
+
+  //generate one more file in level-0, and should trigger level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(RandomString(&rnd, 10000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/skiplist.h
+++ b/db/skiplist.h
@ -0,0 +1,416 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+//
+
+#pragma once
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/random.h"
+
+namespace rocksdb {
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Change the underlying skiplist used for this iterator
+    // This enables us not changing the iterator without deallocating
+    // an old one and then allocating a new one
+    void SetList(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  // Used for optimizing sequential insert patterns
+  Node* prev_[kMaxHeight];
+  int   prev_height_;
+
+  inline int GetMaxHeight() const {
+    return static_cast<int>(
+        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return nullptr if there is no such node.
+  //
+  // If prev is non-nullptr, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
+  SetList(list);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SetList(const SkipList* list) {
+  list_ = list;
+  node_ = nullptr;
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  return node_ != nullptr;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, nullptr);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = nullptr;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // nullptr n is considered infinite
+  return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  // Use prev as an optimization hint and fallback to slow path
+  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
+    Node* x = prev[0];
+    Node* next = x->Next(0);
+    if ((x == head_) || KeyIsAfterNode(key, x)) {
+      // Adjust all relevant insertion points to the previous entry
+      for (int i = 1; i < prev_height_; i++) {
+        prev[i] = x;
+      }
+      return next;
+    }
+  }
+  // Normal lookup
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    // Make sure the lists are sorted.
+    // If x points to head_ or next points nullptr, it is trivially satisfied.
+    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != nullptr) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == nullptr || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == nullptr) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      prev_height_(1),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, nullptr);
+    prev_[i] = head_;
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* x = FindGreaterOrEqual(key, prev_);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == nullptr || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev_[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (nullptr), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since nullptr sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev_[i]->NoBarrier_Next(i));
+    prev_[i]->SetNext(i, x);
+  }
+  prev_[0] = x;
+  prev_height_ = height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, nullptr);
+  if (x != nullptr && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace rocksdb
--- a/db/skiplist_test.cc
+++ b/db/skiplist_test.cc
@ -0,0 +1,383 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "rocksdb/env.h"
+#include "util/arena_impl.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+typedef uint64_t Key;
+
+struct TestComparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  ArenaImpl arena_impl;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena_impl);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, TestComparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  ArenaImpl arena_impl;
+  TestComparator cmp;
+  SkipList<Key, TestComparator> list(cmp, &arena_impl);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1U);
+    } else {
+      ASSERT_EQ(keys.count(i), 0U);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, TestComparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (unsigned int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  ArenaImpl arena_impl_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, TestComparator> list_;
+
+ public:
+  ConcurrentTest() : list_(TestComparator(), &arena_impl_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (unsigned int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, TestComparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0U) ||
+                    (gen(pos) > (uint64_t)initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(nullptr),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-nullptr arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/snapshot.h
+++ b/db/snapshot.h
@ -0,0 +1,86 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  void getAll(std::vector<SequenceNumber>& ret) {
+    if (empty()) return;
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s ->next_;
+    }
+  }
+
+  // get the sequence number of the most recent snapshot
+  const SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace rocksdb
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -0,0 +1,173 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
+  delete table_reader;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       const EnvOptions& storage_options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      storage_options_(storage_options),
+      cache_(
+        NewLRUCache(entries, options->table_cache_numshardbits,
+                    options->table_cache_remove_scan_count_limit)) {
+}
+
+TableCache::~TableCache() {
+}
+
+Status TableCache::FindTable(const EnvOptions& toptions,
+                             uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle, bool* table_io,
+                             const bool no_io) {
+  Status s;
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  *handle = cache_->Lookup(key);
+  if (*handle == nullptr) {
+    if (no_io) { // Dont do IO and return a not-found status
+      return Status::Incomplete("Table not found in table_cache, no_io is set");
+    }
+    if (table_io != nullptr) {
+      *table_io = true;    // we had to do IO from storage
+    }
+    std::string fname = TableFileName(dbname_, file_number);
+    unique_ptr<RandomAccessFile> file;
+    unique_ptr<TableReader> table_reader;
+    s = env_->NewRandomAccessFile(fname, &file, toptions);
+    RecordTick(options_->statistics, NO_FILE_OPENS);
+    if (s.ok()) {
+      if (options_->advise_random_on_open) {
+        file->Hint(RandomAccessFile::RANDOM);
+      }
+      StopWatch sw(env_, options_->statistics, TABLE_OPEN_IO_MICROS);
+      s = options_->table_factory->GetTableReader(*options_, toptions,
+                                                  std::move(file), file_size,
+                                                  &table_reader);
+    }
+
+    if (!s.ok()) {
+      assert(table_reader == nullptr);
+      RecordTick(options_->statistics, NO_FILE_ERRORS);
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      assert(file.get() == nullptr);
+      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
+    }
+  }
+  return s;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  const EnvOptions& toptions,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  TableReader** table_reader_ptr,
+                                  bool for_compaction) {
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = nullptr;
+  }
+
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(toptions, file_number, file_size, &handle,
+                       nullptr, options.read_tier == kBlockCacheTier);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  TableReader* table_reader =
+    reinterpret_cast<TableReader*>(cache_->Value(handle));
+  Iterator* result = table_reader->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_.get(), handle);
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = table_reader;
+  }
+
+  if (for_compaction) {
+    table_reader->SetupForCompaction();
+  }
+
+  return result;
+}
+
+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       bool (*saver)(void*, const Slice&, const Slice&, bool),
+                       bool* table_io,
+                       void (*mark_key_may_exist)(void*)) {
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(storage_options_, file_number, file_size,
+                       &handle, table_io,
+                       options.read_tier == kBlockCacheTier);
+  if (s.ok()) {
+    TableReader* t =
+      reinterpret_cast<TableReader*>(cache_->Value(handle));
+    s = t->Get(options, k, arg, saver, mark_key_may_exist);
+    cache_->Release(handle);
+  } else if (options.read_tier && s.IsIncomplete()) {
+    // Couldnt find Table in cache but treat as kFound if no_io set
+    (*mark_key_may_exist)(arg);
+    return Status::OK();
+  }
+  return s;
+}
+
+bool TableCache::PrefixMayMatch(const ReadOptions& options,
+                                uint64_t file_number,
+                                uint64_t file_size,
+                                const Slice& internal_prefix,
+                                bool* table_io) {
+  Cache::Handle* handle = nullptr;
+  Status s = FindTable(storage_options_, file_number,
+                       file_size, &handle, table_io);
+  bool may_match = true;
+  if (s.ok()) {
+    TableReader* t =
+      reinterpret_cast<TableReader*>(cache_->Value(handle));
+    may_match = t->PrefixMayMatch(internal_prefix);
+    cache_->Release(handle);
+  }
+  return may_match;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+}  // namespace rocksdb
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -0,0 +1,78 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "rocksdb/env.h"
+#include "rocksdb/cache.h"
+#include "port/port.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options,
+             const EnvOptions& storage_options, int entries);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-nullptr, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or nullptr if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        const EnvOptions& toptions,
+                        uint64_t file_number,
+                        uint64_t file_size,
+                        TableReader** table_reader_ptr = nullptr,
+                        bool for_compaction = false);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value) repeatedly until
+  // it returns false.
+  Status Get(const ReadOptions& options,
+             uint64_t file_number,
+             uint64_t file_size,
+             const Slice& k,
+             void* arg,
+             bool (*handle_result)(void*, const Slice&, const Slice&, bool),
+             bool* table_io,
+             void (*mark_key_may_exist)(void*) = nullptr);
+
+  // Determine whether the table may contain the specified prefix.  If
+  // the table index of blooms are not in memory, this may cause an I/O
+  bool PrefixMayMatch(const ReadOptions& options, uint64_t file_number,
+                      uint64_t file_size, const Slice& internal_prefix,
+                      bool* table_io);
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  const EnvOptions& storage_options_;
+  std::shared_ptr<Cache> cache_;
+
+  Status FindTable(const EnvOptions& toptions, uint64_t file_number,
+                   uint64_t file_size, Cache::Handle**, bool* table_io=nullptr,
+                   const bool no_io = false);
+};
+
+}  // namespace rocksdb
--- a/db/table_stats_collector.cc
+++ b/db/table_stats_collector.cc
@ -0,0 +1,55 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/table_stats_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+Status InternalKeyStatsCollector::Add(const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  if (ikey.type == ValueType::kTypeDeletion) {
+    ++deleted_keys_;
+  }
+
+  return Status::OK();
+}
+
+Status InternalKeyStatsCollector::Finish(
+    TableStats::UserCollectedStats* stats) {
+  assert(stats);
+  assert(stats->find(InternalKeyTableStatsNames::kDeletedKeys) == stats->end());
+  std::string val;
+
+  PutVarint64(&val, deleted_keys_);
+  stats->insert(std::make_pair(InternalKeyTableStatsNames::kDeletedKeys, val));
+
+  return Status::OK();
+}
+
+Status UserKeyTableStatsCollector::Add(const Slice& key, const Slice& value) {
+  ParsedInternalKey ikey;
+  if (!ParseInternalKey(key, &ikey)) {
+    return Status::InvalidArgument("Invalid internal key");
+  }
+
+  return collector_->Add(ikey.user_key, value);
+}
+
+Status UserKeyTableStatsCollector::Finish(
+    TableStats::UserCollectedStats* stats) {
+  return collector_->Finish(stats);
+}
+
+const std::string InternalKeyTableStatsNames::kDeletedKeys
+  = "rocksdb.deleted.keys";
+
+}  // namespace rocksdb
--- a/db/table_stats_collector.h
+++ b/db/table_stats_collector.h
@ -0,0 +1,58 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include "rocksdb/table_stats.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+struct InternalKeyTableStatsNames {
+  static const std::string kDeletedKeys;
+};
+
+// Collecting the statistics for internal keys. Visible only by internal
+// rocksdb modules.
+class InternalKeyStatsCollector : public TableStatsCollector {
+ public:
+  virtual Status Add(const Slice& key, const Slice& value);
+  virtual Status Finish(TableStats::UserCollectedStats* stats);
+  virtual const char* Name() const { return "InternalKeyStatsCollector"; }
+
+ private:
+  uint64_t deleted_keys_ = 0;
+};
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contains meta information of a given entry.
+//
+// This class extracts user key from the encoded internal key when Add() is
+// invoked.
+class UserKeyTableStatsCollector : public TableStatsCollector {
+ public:
+  explicit UserKeyTableStatsCollector(TableStatsCollector* collector):
+    UserKeyTableStatsCollector(
+        std::shared_ptr<TableStatsCollector>(collector)
+    ) {
+  }
+
+  explicit UserKeyTableStatsCollector(
+      std::shared_ptr<TableStatsCollector> collector) : collector_(collector) {
+  }
+  virtual ~UserKeyTableStatsCollector() { }
+  virtual Status Add(const Slice& key, const Slice& value);
+  virtual Status Finish(TableStats::UserCollectedStats* stats);
+  virtual const char* Name() const { return collector_->Name(); }
+
+ protected:
+  std::shared_ptr<TableStatsCollector> collector_;
+};
+
+}  // namespace rocksdb
--- a/db/table_stats_collector_test.cc
+++ b/db/table_stats_collector_test.cc
@ -0,0 +1,261 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/table_stats_collector.h"
+#include "rocksdb/table_stats.h"
+#include "rocksdb/table.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class TableStatsTest {
+ private:
+  unique_ptr<TableReader> table_reader_;
+};
+
+// TODO(kailiu) the following classes should be moved to some more general
+// places, so that other tests can also make use of them.
+// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
+// and therefore enable us to quickly setup the tests.
+class FakeWritableFile : public WritableFile {
+ public:
+  ~FakeWritableFile() { }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class FakeRandomeAccessFile : public RandomAccessFile {
+ public:
+  explicit FakeRandomeAccessFile(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~FakeRandomeAccessFile() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                       char* scratch) const {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+
+class DumbLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) { }
+  virtual size_t GetLogFileSize() const { return 0; }
+};
+
+// Utilities test functions
+void MakeBuilder(
+    Options options,
+    std::unique_ptr<FakeWritableFile>* writable,
+    std::unique_ptr<TableBuilder>* builder) {
+  writable->reset(new FakeWritableFile);
+  if (!options.flush_block_policy_factory) {
+    options.SetUpDefaultFlushBlockPolicyFactory();
+  }
+  builder->reset(
+      options.table_factory->GetTableBuilder(options, writable->get(),
+                                             options.compression));
+}
+
+void OpenTable(
+    const Options& options,
+    const std::string& contents,
+    std::unique_ptr<TableReader>* table_reader) {
+  std::unique_ptr<RandomAccessFile> file(new FakeRandomeAccessFile(contents));
+  auto s = options.table_factory->GetTableReader(
+      options,
+      EnvOptions(),
+      std::move(file),
+      contents.size(),
+      table_reader
+  );
+  ASSERT_OK(s);
+}
+
+// Collects keys that starts with "A" in a table.
+class RegularKeysStartWithA: public TableStatsCollector {
+ public:
+   const char* Name() const { return "RegularKeysStartWithA"; }
+
+   Status Finish(TableStats::UserCollectedStats* stats) {
+     std::string encoded;
+     PutVarint32(&encoded, count_);
+     *stats = TableStats::UserCollectedStats {
+       { "TableStatsTest", "Rocksdb" },
+       { "Count", encoded }
+     };
+     return Status::OK();
+   }
+
+   Status Add(const Slice& user_key, const Slice& value) {
+     // simply asssume all user keys are not empty.
+     if (user_key.data()[0] == 'A') {
+       ++count_;
+     }
+     return Status::OK();
+   }
+
+ private:
+  uint32_t count_ = 0;
+};
+
+TEST(TableStatsTest, CustomizedTableStatsCollector) {
+  Options options;
+
+  // make sure the entries will be inserted with order.
+  std::map<std::string, std::string> kvs = {
+    {"About",     "val5"},  // starts with 'A'
+    {"Abstract",  "val2"},  // starts with 'A'
+    {"Around",    "val7"},  // starts with 'A'
+    {"Beyond",    "val3"},
+    {"Builder",   "val1"},
+    {"Cancel",    "val4"},
+    {"Find",      "val6"},
+  };
+
+  // Test stats collectors with internal keys or regular keys
+  for (bool encode_as_internal : { true, false }) {
+    // -- Step 1: build table
+    auto collector = new RegularKeysStartWithA();
+    if (encode_as_internal) {
+      options.table_stats_collectors = {
+        std::make_shared<UserKeyTableStatsCollector>(collector)
+      };
+    } else {
+      options.table_stats_collectors.resize(1);
+      options.table_stats_collectors[0].reset(collector);
+    }
+    std::unique_ptr<TableBuilder> builder;
+    std::unique_ptr<FakeWritableFile> writable;
+    MakeBuilder(options, &writable, &builder);
+
+    for (const auto& kv : kvs) {
+      if (encode_as_internal) {
+        InternalKey ikey(kv.first, 0, ValueType::kTypeValue);
+        builder->Add(ikey.Encode(), kv.second);
+      } else {
+        builder->Add(kv.first, kv.second);
+      }
+    }
+    ASSERT_OK(builder->Finish());
+
+    // -- Step 2: Open table
+    std::unique_ptr<TableReader> table_reader;
+    OpenTable(options, writable->contents(), &table_reader);
+    const auto& stats = table_reader->GetTableStats().user_collected_stats;
+
+    ASSERT_EQ("Rocksdb", stats.at("TableStatsTest"));
+
+    uint32_t starts_with_A = 0;
+    Slice key(stats.at("Count"));
+    ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+    ASSERT_EQ(3u, starts_with_A);
+  }
+}
+
+TEST(TableStatsTest, InternalKeyStatsCollector) {
+  InternalKey keys[] = {
+    InternalKey("A", 0, ValueType::kTypeValue),
+    InternalKey("B", 0, ValueType::kTypeValue),
+    InternalKey("C", 0, ValueType::kTypeValue),
+    InternalKey("W", 0, ValueType::kTypeDeletion),
+    InternalKey("X", 0, ValueType::kTypeDeletion),
+    InternalKey("Y", 0, ValueType::kTypeDeletion),
+    InternalKey("Z", 0, ValueType::kTypeDeletion),
+  };
+
+  for (bool sanitized : { false, true }) {
+    std::unique_ptr<TableBuilder> builder;
+    std::unique_ptr<FakeWritableFile> writable;
+    Options options;
+    if (sanitized) {
+      options.table_stats_collectors = {
+        std::make_shared<RegularKeysStartWithA>()
+      };
+      // with sanitization, even regular stats collector will be able to
+      // handle internal keys.
+      auto comparator = options.comparator;
+      // HACK: Set options.info_log to avoid writing log in
+      // SanitizeOptions().
+      options.info_log = std::make_shared<DumbLogger>();
+      options = SanitizeOptions(
+          "db",  // just a place holder
+          nullptr,  // with skip internal key comparator
+          nullptr,  // don't care filter policy
+          options
+      );
+      options.comparator = comparator;
+    } else {
+      options.table_stats_collectors = {
+        std::make_shared<InternalKeyStatsCollector>()
+      };
+    }
+
+    MakeBuilder(options, &writable, &builder);
+    for (const auto& k : keys) {
+      builder->Add(k.Encode(), "val");
+    }
+
+    ASSERT_OK(builder->Finish());
+
+    std::unique_ptr<TableReader> table_reader;
+    OpenTable(options, writable->contents(), &table_reader);
+    const auto& stats = table_reader->GetTableStats().user_collected_stats;
+
+    uint64_t deleted = 0;
+    Slice key(stats.at(InternalKeyTableStatsNames::kDeletedKeys));
+    ASSERT_TRUE(GetVarint64(&key, &deleted));
+    ASSERT_EQ(4u, deleted);
+
+    if (sanitized) {
+      uint32_t starts_with_A = 0;
+      Slice key(stats.at("Count"));
+      ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+      ASSERT_EQ(1u, starts_with_A);
+    }
+  }
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/transaction_log_impl.cc
+++ b/db/transaction_log_impl.cc
@ -0,0 +1,264 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+
+namespace rocksdb {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+                           const std::string& dir,
+                           const Options* options,
+                           const EnvOptions& soptions,
+                           const SequenceNumber seq,
+                           std::unique_ptr<VectorLogPtr> files,
+                           DBImpl const * const dbimpl) :
+    dir_(dir),
+    options_(options),
+    soptions_(soptions),
+    startingSequenceNumber_(seq),
+    files_(std::move(files)),
+    started_(false),
+    isValid_(false),
+    currentFileIndex_(0),
+    currentBatchSeq_(0),
+    currentLastSeq_(0),
+    dbimpl_(dbimpl) {
+  assert(files_ != nullptr);
+  assert(dbimpl_ != nullptr);
+
+  reporter_.env = options_->env;
+  reporter_.info_log = options_->info_log.get();
+  SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+    const LogFile* logFile,
+    unique_ptr<SequentialFile>* file) {
+  Env* env = options_->env;
+  if (logFile->Type() == kArchivedLogFile) {
+    std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+    return env->NewSequentialFile(fname, file, soptions_);
+  } else {
+    std::string fname = LogFileName(dir_, logFile->LogNumber());
+    Status status = env->NewSequentialFile(fname, file, soptions_);
+    if (!status.ok()) {
+      //  If cannot open file in DB directory.
+      //  Try the archive dir, as it could have moved in the meanwhile.
+      fname = ArchivedLogFileName(dir_, logFile->LogNumber());
+      status = env->NewSequentialFile(fname, file, soptions_);
+      if (!status.ok()) {
+        return Status::IOError("Requested file not present in the dir");
+      }
+    }
+    return status;
+  }
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch()  {
+  assert(isValid_);  //  cannot call in a non valid state.
+  BatchResult result;
+  result.sequence = currentBatchSeq_;
+  result.writeBatchPtr = std::move(currentBatch_);
+  return result;
+}
+
+Status TransactionLogIteratorImpl::status() {
+  return currentStatus_;
+}
+
+bool TransactionLogIteratorImpl::Valid() {
+  return started_ && isValid_;
+}
+
+bool TransactionLogIteratorImpl::RestrictedRead(
+    Slice* record,
+    std::string* scratch) {
+  // Don't read if no more complete entries to read from logs
+  if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) {
+    return false;
+  }
+  return currentLogReader_->ReadRecord(record, scratch);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(
+    uint64_t startFileIndex,
+    bool strict) {
+  std::string scratch;
+  Slice record;
+  started_ = false;
+  isValid_ = false;
+  if (files_->size() <= startFileIndex) {
+    return;
+  }
+  Status s = OpenLogReader(files_->at(startFileIndex).get());
+  if (!s.ok()) {
+    currentStatus_ = s;
+    return;
+  }
+  while (RestrictedRead(&record, &scratch)) {
+    if (record.size() < 12) {
+      reporter_.Corruption(
+        record.size(), Status::Corruption("very small log record"));
+      continue;
+    }
+    UpdateCurrentWriteBatch(record);
+    if (currentLastSeq_ >= startingSequenceNumber_) {
+      if (strict && currentBatchSeq_ != startingSequenceNumber_) {
+        currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                            "seek to required sequence number");
+        reporter_.Info(currentStatus_.ToString().c_str());
+        return;
+      } else if (strict) {
+        reporter_.Info("Could seek required sequence number. Iterator will "
+                       "continue.");
+      }
+      isValid_ = true;
+      started_ = true; // set started_ as we could seek till starting sequence
+      return;
+    } else {
+      isValid_ = false;
+    }
+  }
+
+  // Could not find start sequence in first file. Normally this must be the
+  // only file. Otherwise log the error and let the iterator return next entry
+  // If strict is set, we want to seek exactly till the start sequence and it
+  // should have been present in the file we scanned above
+  if (strict) {
+    currentStatus_ = Status::Corruption("Gap in sequence number. Could not "
+                                        "seek to required sequence number");
+    reporter_.Info(currentStatus_.ToString().c_str());
+  } else if (files_->size() != 1) {
+    currentStatus_ = Status::Corruption("Start sequence was not found, "
+                                        "skipping to the next available");
+    reporter_.Info(currentStatus_.ToString().c_str());
+    // Let NextImpl find the next available entry. started_ remains false
+    // because we don't want to check for gaps while moving to start sequence
+    NextImpl(true);
+  }
+}
+
+void TransactionLogIteratorImpl::Next() {
+  return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+  std::string scratch;
+  Slice record;
+  isValid_ = false;
+  if (!internal && !started_) {
+    // Runs every time until we can seek to the start sequence
+    return SeekToStartSequence();
+  }
+  while(true) {
+    assert(currentLogReader_);
+    if (currentLogReader_->IsEOF()) {
+      currentLogReader_->UnmarkEOF();
+    }
+    while (RestrictedRead(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter_.Corruption(
+          record.size(), Status::Corruption("very small log record"));
+        continue;
+      } else {
+        // started_ should be true if called by application
+        assert(internal || started_);
+        // started_ should be false if called internally
+        assert(!internal || !started_);
+        UpdateCurrentWriteBatch(record);
+        if (internal && !started_) {
+          started_ = true;
+        }
+        return;
+      }
+    }
+
+    // Open the next file
+    if (currentFileIndex_ < files_->size() - 1) {
+      ++currentFileIndex_;
+      Status status =OpenLogReader(files_->at(currentFileIndex_).get());
+      if (!status.ok()) {
+        isValid_ = false;
+        currentStatus_ = status;
+        return;
+      }
+    } else {
+      isValid_ = false;
+      if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) {
+        currentStatus_ = Status::OK();
+      } else {
+        currentStatus_ = Status::IOError("NO MORE DATA LEFT");
+      }
+      return;
+    }
+  }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+    const WriteBatch* batch,
+    const SequenceNumber expectedSeq) {
+  assert(batch);
+  SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+  if (batchSeq != expectedSeq) {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, "
+             "Last flushed seq=%lu.Log iterator will reseek the correct "
+             "batch.",
+             (unsigned long)batchSeq,
+             (unsigned long)expectedSeq,
+             (unsigned long)dbimpl_->GetLatestSequenceNumber());
+    reporter_.Info(buf);
+    return false;
+  }
+  return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+  std::unique_ptr<WriteBatch> batch(new WriteBatch());
+  WriteBatchInternal::SetContents(batch.get(), record);
+
+  SequenceNumber expectedSeq = currentLastSeq_ + 1;
+  // If the iterator has started, then confirm that we get continuous batches
+  if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) {
+    // Seek to the batch having expected sequence number
+    if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) {
+      // Expected batch must lie in the previous log file
+      // Avoid underflow.
+      if (currentFileIndex_ != 0) {
+        currentFileIndex_--;
+      }
+    }
+    startingSequenceNumber_ = expectedSeq;
+    // currentStatus_ will be set to Ok if reseek succeeds
+    currentStatus_ = Status::NotFound("Gap in sequence numbers");
+    return SeekToStartSequence(currentFileIndex_, true);
+  }
+
+  currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get());
+  currentLastSeq_ = currentBatchSeq_ +
+                    WriteBatchInternal::Count(batch.get()) - 1;
+  // currentBatchSeq_ can only change here
+  assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber());
+
+  currentBatch_ = move(batch);
+  isValid_ = true;
+  currentStatus_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
+  unique_ptr<SequentialFile> file;
+  Status status = OpenLogFile(logFile, &file);
+  if (!status.ok()) {
+    return status;
+  }
+  assert(file);
+  currentLogReader_.reset(
+    new log::Reader(std::move(file), &reporter_, true, 0)
+  );
+  return Status::OK();
+}
+}  //  namespace rocksdb
--- a/db/transaction_log_impl.h
+++ b/db/transaction_log_impl.h
@ -0,0 +1,118 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+#include "db/db_impl.h"
+#include "db/log_reader.h"
+#include "db/filename.h"
+
+namespace rocksdb {
+
+struct LogReporter : public log::Reader::Reporter {
+  Env* env;
+  Logger* info_log;
+  virtual void Corruption(size_t bytes, const Status& s) {
+    Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str());
+  }
+  virtual void Info(const char* s) {
+    Log(info_log, "%s", s);
+  }
+};
+
+class LogFileImpl : public LogFile {
+ public:
+  LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+              uint64_t sizeBytes) :
+    logNumber_(logNum),
+    type_(logType),
+    startSequence_(startSeq),
+    sizeFileBytes_(sizeBytes) {
+  }
+
+  std::string PathName() const {
+    if (type_ == kArchivedLogFile) {
+      return ArchivedLogFileName("", logNumber_);
+    }
+    return LogFileName("", logNumber_);
+  }
+
+  uint64_t LogNumber() const { return logNumber_; }
+
+  WalFileType Type() const { return type_; }
+
+  SequenceNumber StartSequence() const { return startSequence_; }
+
+  uint64_t SizeFileBytes() const { return sizeFileBytes_; }
+
+  bool operator < (const LogFile& that) const {
+    return LogNumber() < that.LogNumber();
+  }
+
+ private:
+  uint64_t logNumber_;
+  WalFileType type_;
+  SequenceNumber startSequence_;
+  uint64_t sizeFileBytes_;
+
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+  TransactionLogIteratorImpl(const std::string& dir,
+                             const Options* options,
+                             const EnvOptions& soptions,
+                             const SequenceNumber seqNum,
+                             std::unique_ptr<VectorLogPtr> files,
+                             DBImpl const * const dbimpl);
+
+  virtual bool Valid();
+
+  virtual void Next();
+
+  virtual Status status();
+
+  virtual BatchResult GetBatch();
+
+ private:
+  const std::string& dir_;
+  const Options* options_;
+  const EnvOptions& soptions_;
+  SequenceNumber startingSequenceNumber_;
+  std::unique_ptr<VectorLogPtr> files_;
+  bool started_;
+  bool isValid_;  // not valid when it starts of.
+  Status currentStatus_;
+  size_t currentFileIndex_;
+  std::unique_ptr<WriteBatch> currentBatch_;
+  unique_ptr<log::Reader> currentLogReader_;
+  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
+  LogReporter reporter_;
+  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
+  SequenceNumber currentLastSeq_; // last sequence in the current batch
+  DBImpl const * const dbimpl_; // The db on whose log files this iterates
+
+  // Reads from transaction log only if the writebatch record has been written
+  bool RestrictedRead(Slice* record, std::string* scratch);
+  // Seeks to startingSequenceNumber reading from startFileIndex in files_.
+  // If strict is set,then must get a batch starting with startingSequenceNumber
+  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
+  // Implementation of Next. SeekToStartSequence calls it internally with
+  // internal=true to let it find next entry even if it has to jump gaps because
+  // the iterator may start off from the first available entry but promises to
+  // be continuous after that
+  void NextImpl(bool internal = false);
+  // Check if batch is expected, else return false
+  bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq);
+  // Update current batch if a continuous batch is found, else return false
+  void UpdateCurrentWriteBatch(const Slice& record);
+  Status OpenLogReader(const LogFile* file);
+};
+}  //  namespace rocksdb
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@ -0,0 +1,301 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9,
+
+  // these are new formats divergent from open source leveldb
+  kNewFile2             = 100  // store smallest & largest seqno
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile2);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+    PutVarint64(dst, f.smallest_seqno);
+    PutVarint64(dst, f.largest_seqno);
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
+  uint32_t v;
+  if (GetVarint32(input, &v) &&
+      (int)v < number_levels_) {
+    *level = v;
+    return true;
+  } else {
+    if ((int)v >= number_levels_) {
+      *msg = "db already has more levels than options.num_levels";
+    }
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = nullptr;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == nullptr && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level, &msg) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          if (!msg) {
+            msg = "compaction pointer";
+          }
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          if (!msg) {
+            msg = "deleted file";
+          }
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file entry";
+          }
+        }
+        break;
+
+      case kNewFile2:
+        if (GetLevel(&input, &level, &msg) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest) &&
+            GetVarint64(&input, &f.smallest_seqno) &&
+            GetVarint64(&input, &f.largest_seqno) ) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          if (!msg) {
+            msg = "new-file2 entry";
+          }
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == nullptr && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != nullptr) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" ");
+    r.append(compact_pointers_[i].second.DebugString(hex_key));
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" ");
+    r.append(f.smallest.DebugString(hex_key));
+    r.append(" .. ");
+    r.append(f.largest.DebugString(hex_key));
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}  // namespace rocksdb
--- a/db/version_edit.h
+++ b/db/version_edit.h
@ -0,0 +1,128 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace rocksdb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+  bool being_compacted;       // Is this file undergoing compaction?
+  SequenceNumber smallest_seqno;// The smallest seqno in this file
+  SequenceNumber largest_seqno; // The largest seqno in this file
+
+  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0),
+                   being_compacted(false) { }
+};
+
+class VersionEdit {
+ public:
+  explicit VersionEdit(int number_levels) :
+      number_levels_(number_levels) {
+    Clear();
+  }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest,
+               const SequenceNumber& smallest_seqno,
+               const SequenceNumber& largest_seqno) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    f.smallest_seqno = smallest_seqno;
+    f.largest_seqno = largest_seqno;
+    assert(smallest_seqno <= largest_seqno);
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  // Number of edits
+  int NumEntries() {
+    return new_files_.size() + deleted_files_.size();
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString(bool hex_key = false) const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  bool GetLevel(Slice* input, int* level, const char** msg);
+
+  int number_levels_;
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector< std::pair<int, FileMetaData> > new_files_;
+};
+
+}  // namespace rocksdb
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@ -0,0 +1,53 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed(7);
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit(7);
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+                 kBig + 500 + i,
+                 kBig + 600 + i);
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/version_set.cc
+++ b/db/version_set.cc
--- a/db/version_set.h
+++ b/db/version_set.h
@ -0,0 +1,654 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <deque>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "db/table_cache.h"
+
+namespace rocksdb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableCache;
+class Version;
+class VersionSet;
+
+// Return the smallest index i such that files[i]->largest >= key.
+// Return files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const std::vector<FileMetaData*>& files,
+                    const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key largest than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+//           in sorted order.
+extern bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key);
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, const EnvOptions& soptions,
+                    std::vector<Iterator*>* iters);
+
+  // Lookup the value for key.  If found, store it in *val and
+  // return OK.  Else return a non-OK status.  Fills *stats.
+  // Uses *operands to store merge_operator operations to apply later
+  // REQUIRES: lock is not held
+  struct GetStats {
+    FileMetaData* seek_file;
+    int seek_file_level;
+  };
+  void Get(const ReadOptions&, const LookupKey& key, std::string* val,
+           Status* status, std::deque<std::string>* operands, GetStats* stats,
+           const Options& db_option,
+           bool* value_found = nullptr);
+
+  // Adds "stats" into the current state.  Returns true if a new
+  // compaction may need to be triggered, false otherwise.
+  // REQUIRES: lock is held
+  bool UpdateStats(const GetStats& stats);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  void Unref();
+
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey* begin,         // nullptr means before all keys
+      const InternalKey* end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index = -1,              // index of overlap file
+      int* file_index = nullptr);          // return index of overlap file
+
+  void GetOverlappingInputsBinarySearch(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      int hint_index,             // index of overlap file
+      int* file_index);           // return index of overlap file
+
+  void ExtendOverlappingInputs(
+      int level,
+      const Slice& begin,         // nullptr means before all keys
+      const Slice& end,           // nullptr means after all keys
+      std::vector<FileMetaData*>* inputs,
+      unsigned int index);                 // start extending from this index
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key largest than all keys in the DB.
+  bool OverlapInLevel(int level,
+                      const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Returns true iff the first or last file in inputs contains
+  // an overlapping user key to the file "just outside" of it (i.e.
+  // just after the last file, or just before the first file)
+  // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+  bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+                             int level);
+
+
+  // Return the level at which we should place a new memtable compaction
+  // result that covers the range [smallest_user_key,largest_user_key].
+  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+                                 const Slice& largest_user_key);
+
+  int NumFiles(int level) const { return files_[level].size(); }
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString(bool hex = false) const;
+
+  // Returns the version nuber of this version
+  uint64_t GetVersionNumber() {
+    return version_number_;
+  }
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+  friend class DBImpl;
+
+  class LevelFileNumIterator;
+  Iterator* NewConcatenatingIterator(const ReadOptions&,
+                                     const EnvOptions& soptions,
+                                     int level) const;
+  bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
+                      const Slice& internal_prefix, Iterator* level_iter) const;
+
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  Version* prev_;               // Previous version in linked list
+  int refs_;                    // Number of live refs to this version
+
+  // List of files per level, files in each level are arranged
+  // in increasing order of keys
+  std::vector<FileMetaData*>* files_;
+
+  // A list for the same set of files that are stored in files_,
+  // but files in each level are now sorted based on file
+  // size. The file with the largest size is at the front.
+  // This vector stores the index of the file from files_.
+  std::vector< std::vector<int> > files_by_size_;
+
+  // An index into files_by_size_ that specifies the first
+  // file that is not yet compacted
+  std::vector<int> next_file_to_compact_by_size_;
+
+  // Only the first few entries of files_by_size_ are sorted.
+  // There is no need to sort all the files because it is likely
+  // that on a running system, we need to look at only the first
+  // few largest files because a new version is created every few
+  // seconds/minutes (because of concurrent compactions).
+  static const int number_of_files_to_sort_ = 50;
+
+  // Next file to compact based on seek stats.
+  FileMetaData* file_to_compact_;
+  int file_to_compact_level_;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by Finalize().
+  // The most critical level to be compacted is listed first
+  // These are used to pick the best compaction level
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
+  double max_compaction_score_; // max score in l1 to ln-1
+  int max_compaction_score_level_; // level on which max score occurs
+
+  // The offset in the manifest file where this version is stored.
+  uint64_t offset_manifest_file_;
+
+  // A version number that uniquely represents this version. This is
+  // used for debugging and logging purposes only.
+  uint64_t version_number_;
+
+  explicit Version(VersionSet* vset, uint64_t version_number = 0);
+
+  ~Version();
+
+  // re-initializes the index that is used to offset into files_by_size_
+  // to find the next compaction candidate file.
+  void ResetNextCompactionIndex(int level) {
+    next_file_to_compact_by_size_[level] = 0;
+  }
+
+  // No copying allowed
+  Version(const Version&);
+  void operator=(const Version&);
+};
+
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname,
+             const Options* options,
+             const EnvOptions& storage_options,
+             TableCache* table_cache,
+             const InternalKeyComparator*);
+  ~VersionSet();
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Will release *mu while actually writing to the file.
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
+      bool new_descriptor_log = false);
+
+  // Recover the last saved descriptor from persistent storage.
+  Status Recover();
+
+  // Try to reduce the number of levels. This call is valid when
+  // only one level from the new max level to the old
+  // max level containing files.
+  // For example, a db currently has 7 levels [0-6], and a call to
+  // to reduce to 5 [0-4] can only be executed when only one level
+  // among [4-6] contains files.
+  Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);
+
+  // Return the current version.
+  Version* current() const { return current_; }
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Arrange to reuse "file_number" unless a newer file number has
+  // already been allocated.
+  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  void ReuseFileNumber(uint64_t file_number) {
+    if (next_file_number_ == file_number + 1) {
+      next_file_number_ = file_number;
+    }
+  }
+
+  // Return the number of Table files at the specified level.
+  int NumLevelFiles(int level) const;
+
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const { return last_sequence_; }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_ = s;
+  }
+
+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Return the current log file number.
+  uint64_t LogNumber() const { return log_number_; }
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+  int NumberLevels() const { return num_levels_; }
+
+  // Pick level and inputs for a new compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  Compaction* PickCompaction();
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns nullptr if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  Compaction* CompactRange(
+      int level,
+      const InternalKey* begin,
+      const InternalKey* end);
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t MaxNextLevelOverlappingBytes();
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Returns true iff some level needs a compaction because it has
+  // exceeded its target size.
+  bool NeedsSizeCompaction() const {
+    // In universal compaction case, this check doesn't really
+    // check the compaction condition, but checks num of files threshold
+    // only. We are not going to miss any compaction opportunity
+    // but it's likely that more compactions are scheduled but
+    // ending up with nothing to do. We can improve it later.
+    // TODO: improve this function to be accurate for universal
+    //       compactions.
+    int num_levels_to_check =
+        (options_->compaction_style != kCompactionStyleUniversal) ?
+            NumberLevels() - 1 : 1;
+    for (int i = 0; i < num_levels_to_check; i++) {
+      if (current_->compaction_score_[i] >= 1) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const {
+    return ((current_->file_to_compact_ != nullptr) ||
+            NeedsSizeCompaction());
+  }
+
+  // Returns the maxmimum compaction score for levels 1 to max
+  double MaxCompactionScore() const {
+    return current_->max_compaction_score_;
+  }
+
+  // See field declaration
+  int MaxCompactionScoreLevel() const {
+    return current_->max_compaction_score_level_;
+  }
+
+  // Add all files listed in any live version to *live.
+  void AddLiveFiles(std::vector<uint64_t>* live_list);
+
+  // Add all files listed in the current version to *live.
+  void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level.  Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[100];
+  };
+  struct FileSummaryStorage {
+    char buffer[1000];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+
+  // printf contents (for debugging)
+  Status DumpManifest(Options& options, std::string& manifestFileName,
+                      bool verbose, bool hex = false);
+
+  // Return a human-readable short (single-line) summary of the data size
+  // of files per level.  Uses *scratch as backing store.
+  const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
+
+  // Return a human-readable short (single-line) summary of files
+  // in a specified level.  Uses *scratch as backing store.
+  const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+  // Return the size of the current manifest file
+  const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; }
+
+  // For the specfied level, pick a compaction.
+  // Returns nullptr if there is no compaction to be done.
+  // If level is 0 and there is already a compaction on that level, this
+  // function will return nullptr.
+  Compaction* PickCompactionBySize(int level, double score);
+
+  // Pick files to compact in Universal mode
+  Compaction* PickCompactionUniversal(int level, double score);
+
+  // Pick Universal compaction to limit read amplification
+  Compaction* PickCompactionUniversalReadAmp(int level, double score,
+                unsigned int ratio, unsigned int num_files);
+
+  // Pick Universal compaction to limit space amplification.
+  Compaction* PickCompactionUniversalSizeAmp(int level, double score);
+
+  // Free up the files that were participated in a compaction
+  void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // verify that the files that we started with for a compaction
+  // still exist in the current version and in the same original level.
+  // This ensures that a concurrent compaction did not erroneously
+  // pick the same files to compact.
+  bool VerifyCompactionFileConsistency(Compaction* c);
+
+  // used to sort files by size
+  typedef struct fsize {
+    int index;
+    FileMetaData* file;
+  } Fsize;
+
+  // Sort all files for this version based on their file size and
+  // record results in files_by_size_. The largest files are listed first.
+  void UpdateFilesBySize(Version *v);
+
+  // Get the max file size in a given level.
+  uint64_t MaxFileSizeForLevel(int level);
+
+  double MaxBytesForLevel(int level);
+
+  Status GetMetadataForFile(
+    uint64_t number, int *filelevel, FileMetaData *metadata);
+
+  void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata);
+
+  void GetObsoleteFiles(std::vector<FileMetaData*>* files);
+
+ private:
+  class Builder;
+  struct ManifestWriter;
+
+  friend class Compaction;
+  friend class Version;
+
+  void Init(int num_levels);
+
+  void Finalize(Version* v, std::vector<uint64_t>&);
+
+  void GetRange(const std::vector<FileMetaData*>& inputs,
+                InternalKey* smallest,
+                InternalKey* largest);
+
+  void GetRange2(const std::vector<FileMetaData*>& inputs1,
+                 const std::vector<FileMetaData*>& inputs2,
+                 InternalKey* smallest,
+                 InternalKey* largest);
+
+  void ExpandWhileOverlapping(Compaction* c);
+
+  void SetupOtherInputs(Compaction* c);
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  void AppendVersion(Version* v);
+
+  bool ManifestContains(const std::string& record) const;
+
+  uint64_t ExpandedCompactionByteSizeLimit(int level);
+
+  uint64_t MaxGrandParentOverlapBytes(int level);
+
+  Env* const env_;
+  const std::string dbname_;
+  const Options* const options_;
+  TableCache* const table_cache_;
+  const InternalKeyComparator icmp_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+  uint64_t last_sequence_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  int num_levels_;
+
+  // Opened lazily
+  unique_ptr<log::Writer> descriptor_log_;
+  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;        // == dummy_versions_.prev_
+
+  // Per-level key at which the next compaction at that level should start.
+  // Either an empty string, or a valid InternalKey.
+  std::string* compact_pointer_;
+
+  // Per-level target file size.
+  uint64_t* max_file_size_;
+
+  // Per-level max bytes
+  uint64_t* level_max_bytes_;
+
+  // record all the ongoing compactions for all levels
+  std::vector<std::set<Compaction*> > compactions_in_progress_;
+
+  // generates a increasing version number for every new version
+  uint64_t current_version_number_;
+
+  // Queue of writers to the manifest file
+  std::deque<ManifestWriter*> manifest_writers_;
+
+  // Store the manifest file size when it is checked.
+  // Save us the cost of checking file size twice in LogAndApply
+  uint64_t last_observed_manifest_size_;
+
+  std::vector<FileMetaData*> obsolete_files_;
+
+  // storage options for all reads and writes except compactions
+  const EnvOptions& storage_options_;
+
+  // storage options used for compactions. This is a copy of
+  // storage_options_ but with readaheads set to readahead_compactions_.
+  const EnvOptions storage_options_compactions_;
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+
+  // Return the total amount of data that is undergoing
+  // compactions per level
+  void SizeBeingCompacted(std::vector<uint64_t>&);
+
+  // Returns true if any one of the parent files are being compacted
+  bool ParentRangeInCompaction(const InternalKey* smallest,
+    const InternalKey* largest, int level, int* index);
+
+  // Returns true if any one of the specified files are being compacted
+  bool FilesInCompaction(std::vector<FileMetaData*>& files);
+
+  void LogAndApplyHelper(Builder*b, Version* v,
+                           VersionEdit* edit, port::Mutex* mu);
+};
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+  ~Compaction();
+
+  // Return the level that is being compacted.  Inputs from "level"
+  // will be merged.
+  int level() const { return level_; }
+
+  // Outputs will go to this level
+  int output_level() const { return out_level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Whether compression will be enabled for compaction outputs
+  bool enable_compression() const { return enable_compression_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Returns true iff we should stop building the current output
+  // before processing "internal_key".
+  bool ShouldStopBefore(const Slice& internal_key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+  void Summary(char* output, int len);
+
+  // Return the score that was used to pick this compaction run.
+  double score() const { return score_; }
+
+  // Is this compaction creating a file in the bottom most level?
+  bool BottomMostLevel() { return bottommost_level_; }
+
+  // Does this compaction include all sst files?
+  bool IsFullCompaction() { return is_full_compaction_; }
+
+ private:
+  friend class Version;
+  friend class VersionSet;
+
+  explicit Compaction(int level, int out_level, uint64_t target_file_size,
+    uint64_t max_grandparent_overlap_bytes, int number_levels,
+    bool seek_compaction = false, bool enable_compression = true);
+
+  int level_;
+  int out_level_; // levels to which output files are stored
+  uint64_t max_output_file_size_;
+  uint64_t maxGrandParentOverlapBytes_;
+  Version* input_version_;
+  VersionEdit* edit_;
+  int number_levels_;
+
+  bool seek_compaction_;
+  bool enable_compression_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+
+  // State used to check for number of of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  size_t grandparent_index_;  // Index in grandparent_starts_
+  bool seen_key_;             // Some output key has been seen
+  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
+                              // and grandparent files
+  int base_index_;   // index of the file in files_[level_]
+  int parent_index_; // index of some file with same range in files_[level_+1]
+  double score_;     // score that was used to pick this compaction.
+
+  // Is this compaction creating a file in the bottom most level?
+  bool bottommost_level_;
+  // Does this compaction include all sst files?
+  bool is_full_compaction_;
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  std::vector<size_t> level_ptrs_;
+
+  // mark (or clear) all files that are being compacted
+  void MarkFilesBeingCompacted(bool);
+
+  // Initialize whether compaction producing files at the bottommost level
+  void SetupBottomMostLevel(bool isManual);
+
+  // In case of compaction error, reset the nextIndex that is used
+  // to pick up the next file to be compacted from files_by_size_
+  void ResetNextCompactionIndex();
+};
+
+}  // namespace rocksdb
--- a/db/version_set_reduce_num_levels.cc
+++ b/db/version_set_reduce_num_levels.cc
@ -0,0 +1,80 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "util/logging.h"
+
+namespace rocksdb {
+
+Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
+
+  if(new_levels <= 1) {
+    return Status::InvalidArgument(
+        "Number of levels needs to be bigger than 1");
+  }
+
+  Version* current_version = current_;
+  int current_levels = NumberLevels();
+
+  if (current_levels <= new_levels) {
+    return Status::OK();
+  }
+
+  // Make sure there are file only on one level from
+  // (new_levels-1) to (current_levels-1)
+  int first_nonempty_level = -1;
+  int first_nonempty_level_filenum = 0;
+  for (int i = new_levels - 1; i < current_levels; i++) {
+    int file_num = NumLevelFiles(i);
+    if (file_num != 0) {
+      if (first_nonempty_level < 0) {
+        first_nonempty_level = i;
+        first_nonempty_level_filenum = file_num;
+      } else {
+        char msg[255];
+        sprintf(msg, "Found at least two levels containing files: "
+            "[%d:%d],[%d:%d].\n",
+            first_nonempty_level, first_nonempty_level_filenum, i, file_num);
+        return Status::InvalidArgument(msg);
+      }
+    }
+  }
+
+  Status st;
+  std::vector<FileMetaData*>*  old_files_list = current_version->files_;
+  std::vector<FileMetaData*>* new_files_list =
+      new std::vector<FileMetaData*>[new_levels];
+  for (int i = 0; i < new_levels - 1; i++) {
+    new_files_list[i] = old_files_list[i];
+  }
+
+  if (first_nonempty_level > 0) {
+    new_files_list[new_levels - 1] = old_files_list[first_nonempty_level];
+  }
+
+  delete[] current_version->files_;
+  current_version->files_ = new_files_list;
+
+  delete[] compact_pointer_;
+  delete[] max_file_size_;
+  delete[] level_max_bytes_;
+  num_levels_ = new_levels;
+  compact_pointer_ = new std::string[new_levels];
+  Init(new_levels);
+  VersionEdit ve(new_levels);
+  st = LogAndApply(&ve , mu, true);
+  return st;
+}
+
+}
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@ -0,0 +1,184 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class FindFileTest {
+ public:
+  std::vector<FileMetaData*> files_;
+  bool disjoint_sorted_files_;
+
+  FindFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindFileTest() {
+    for (unsigned int i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData;
+    f->number = files_.size() + 1;
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    files_.push_back(f);
+  }
+
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, files_, target.Encode());
+  }
+
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != nullptr ? smallest : "");
+    Slice l(largest != nullptr ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+                                 (smallest != nullptr ? &s : nullptr),
+                                 (largest != nullptr ? &l : nullptr));
+  }
+};
+
+TEST(FindFileTest, Empty) {
+  ASSERT_EQ(0, Find("foo"));
+  ASSERT_TRUE(! Overlaps("a", "z"));
+  ASSERT_TRUE(! Overlaps(nullptr, "z"));
+  ASSERT_TRUE(! Overlaps("a", nullptr));
+  ASSERT_TRUE(! Overlaps(nullptr, nullptr));
+}
+
+TEST(FindFileTest, Single) {
+  Add("p", "q");
+  ASSERT_EQ(0, Find("a"));
+  ASSERT_EQ(0, Find("p"));
+  ASSERT_EQ(0, Find("p1"));
+  ASSERT_EQ(0, Find("q"));
+  ASSERT_EQ(1, Find("q1"));
+  ASSERT_EQ(1, Find("z"));
+
+  ASSERT_TRUE(! Overlaps("a", "b"));
+  ASSERT_TRUE(! Overlaps("z1", "z2"));
+  ASSERT_TRUE(Overlaps("a", "p"));
+  ASSERT_TRUE(Overlaps("a", "q"));
+  ASSERT_TRUE(Overlaps("a", "z"));
+  ASSERT_TRUE(Overlaps("p", "p1"));
+  ASSERT_TRUE(Overlaps("p", "q"));
+  ASSERT_TRUE(Overlaps("p", "z"));
+  ASSERT_TRUE(Overlaps("p1", "p2"));
+  ASSERT_TRUE(Overlaps("p1", "z"));
+  ASSERT_TRUE(Overlaps("q", "q"));
+  ASSERT_TRUE(Overlaps("q", "q1"));
+
+  ASSERT_TRUE(! Overlaps(nullptr, "j"));
+  ASSERT_TRUE(! Overlaps("r", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "p"));
+  ASSERT_TRUE(Overlaps(nullptr, "p1"));
+  ASSERT_TRUE(Overlaps("q", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+
+TEST(FindFileTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_EQ(0, Find("100"));
+  ASSERT_EQ(0, Find("150"));
+  ASSERT_EQ(0, Find("151"));
+  ASSERT_EQ(0, Find("199"));
+  ASSERT_EQ(0, Find("200"));
+  ASSERT_EQ(1, Find("201"));
+  ASSERT_EQ(1, Find("249"));
+  ASSERT_EQ(1, Find("250"));
+  ASSERT_EQ(2, Find("251"));
+  ASSERT_EQ(2, Find("299"));
+  ASSERT_EQ(2, Find("300"));
+  ASSERT_EQ(2, Find("349"));
+  ASSERT_EQ(2, Find("350"));
+  ASSERT_EQ(3, Find("351"));
+  ASSERT_EQ(3, Find("400"));
+  ASSERT_EQ(3, Find("450"));
+  ASSERT_EQ(4, Find("451"));
+
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("251", "299"));
+  ASSERT_TRUE(! Overlaps("451", "500"));
+  ASSERT_TRUE(! Overlaps("351", "399"));
+
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST(FindFileTest, MultipleNullBoundaries) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_TRUE(! Overlaps(nullptr, "149"));
+  ASSERT_TRUE(! Overlaps("451", nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, nullptr));
+  ASSERT_TRUE(Overlaps(nullptr, "150"));
+  ASSERT_TRUE(Overlaps(nullptr, "199"));
+  ASSERT_TRUE(Overlaps(nullptr, "200"));
+  ASSERT_TRUE(Overlaps(nullptr, "201"));
+  ASSERT_TRUE(Overlaps(nullptr, "400"));
+  ASSERT_TRUE(Overlaps(nullptr, "800"));
+  ASSERT_TRUE(Overlaps("100", nullptr));
+  ASSERT_TRUE(Overlaps("200", nullptr));
+  ASSERT_TRUE(Overlaps("449", nullptr));
+  ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST(FindFileTest, OverlapSequenceChecks) {
+  Add("200", "200", 5000, 3000);
+  ASSERT_TRUE(! Overlaps("199", "199"));
+  ASSERT_TRUE(! Overlaps("201", "300"));
+  ASSERT_TRUE(Overlaps("200", "200"));
+  ASSERT_TRUE(Overlaps("190", "200"));
+  ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST(FindFileTest, OverlappingFiles) {
+  Add("150", "600");
+  Add("400", "500");
+  disjoint_sorted_files_ = false;
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("601", "700"));
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+  ASSERT_TRUE(Overlaps("450", "700"));
+  ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@ -0,0 +1,247 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring
+//    kTypeMerge varstring varstring
+//    kTypeDeletion varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "rocksdb/write_batch.h"
+
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "db/dbformat.h"
+#include "db/db_impl.h"
+#include "db/memtable.h"
+#include "db/snapshot.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+#include <stdexcept>
+
+namespace rocksdb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+WriteBatch::WriteBatch() {
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) {
+  throw std::runtime_error("Handler::Merge not implemented!");
+}
+
+void WriteBatch::Handler::LogData(const Slice& blob) {
+  // If the user has not specified something to do with blobs, then we ignore
+  // them.
+}
+
+bool WriteBatch::Handler::Continue() {
+  return true;
+}
+
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(kHeader);
+}
+
+int WriteBatch::Count() const {
+  return WriteBatchInternal::Count(this);
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+  Slice input(rep_);
+  if (input.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+
+  input.remove_prefix(kHeader);
+  Slice key, value, blob;
+  int found = 0;
+  while (!input.empty() && handler->Continue()) {
+    char tag = input[0];
+    input.remove_prefix(1);
+    switch (tag) {
+      case kTypeValue:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          handler->Put(key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+        break;
+      case kTypeDeletion:
+        if (GetLengthPrefixedSlice(&input, &key)) {
+          handler->Delete(key);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+        break;
+      case kTypeMerge:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          handler->Merge(key, value);
+          found++;
+        } else {
+          return Status::Corruption("bad WriteBatch Merge");
+        }
+        break;
+      case kTypeLogData:
+        if (GetLengthPrefixedSlice(&input, &blob)) {
+          handler->LogData(blob);
+        } else {
+          return Status::Corruption("bad WriteBatch Blob");
+        }
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  if (found != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  } else {
+    return Status::OK();
+  }
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatch::Put(const SliceParts& key, const SliceParts& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSliceParts(&rep_, key);
+  PutLengthPrefixedSliceParts(&rep_, value);
+}
+
+void WriteBatch::Delete(const Slice& key) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+void WriteBatch::Merge(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeMerge));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatch::PutLogData(const Slice& blob) {
+  rep_.push_back(static_cast<char>(kTypeLogData));
+  PutLengthPrefixedSlice(&rep_, blob);
+}
+
+namespace {
+class MemTableInserter : public WriteBatch::Handler {
+ public:
+  SequenceNumber sequence_;
+  MemTable* mem_;
+  const Options* options_;
+  DBImpl* db_;
+  const bool filter_deletes_;
+
+  MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts,
+                   DB* db, const bool filter_deletes)
+    : sequence_(sequence),
+      mem_(mem),
+      options_(opts),
+      db_(reinterpret_cast<DBImpl*>(db)),
+      filter_deletes_(filter_deletes) {
+    assert(mem_);
+    if (filter_deletes_) {
+      assert(options_);
+      assert(db_);
+    }
+  }
+
+  virtual void Put(const Slice& key, const Slice& value) {
+    if (options_->inplace_update_support
+        && mem_->Update(sequence_, kTypeValue, key, value)) {
+      RecordTick(options_->statistics, NUMBER_KEYS_UPDATED);
+    } else {
+      mem_->Add(sequence_, kTypeValue, key, value);
+    }
+    sequence_++;
+  }
+  virtual void Merge(const Slice& key, const Slice& value) {
+    mem_->Add(sequence_, kTypeMerge, key, value);
+    sequence_++;
+  }
+  virtual void Delete(const Slice& key) {
+    if (filter_deletes_) {
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions ropts;
+      ropts.snapshot = &read_from_snapshot;
+      std::string value;
+      if (!db_->KeyMayExist(ropts, key, &value)) {
+        RecordTick(options_->statistics, NUMBER_FILTERED_DELETES);
+        return;
+      }
+    }
+    mem_->Add(sequence_, kTypeDeletion, key, Slice());
+    sequence_++;
+  }
+};
+}  // namespace
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem,
+                                      const Options* opts, DB* db,
+                                      const bool filter_deletes) {
+  MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db,
+                            filter_deletes);
+  return b->Iterate(&inserter);
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= kHeader);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  SetCount(dst, Count(dst) + Count(src));
+  assert(src->rep_.size() >= kHeader);
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+}  // namespace rocksdb
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@ -0,0 +1,57 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class MemTable;
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the seqeunce number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the seqeunce number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  // Inserts batch entries into memtable
+  // Drops deletes in batch if filter_del is set to true and
+  // db->KeyMayExist returns false
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable,
+                           const Options* opts, DB* db = nullptr,
+                           const bool filter_del = false);
+
+  static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+}  // namespace rocksdb
--- a/db/write_batch_test.cc
+++ b/db/write_batch_test.cc
@ -0,0 +1,262 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/db.h"
+
+#include <memory>
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  MemTable* mem = new MemTable(cmp, factory);
+  mem->Ref();
+  std::string state;
+  Options options;
+  Status s = WriteBatchInternal::InsertInto(b, mem, &options);
+  int count = 0;
+  Iterator* iter = mem->NewIterator();
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    memset((void *)&ikey, 0, sizeof(ikey));
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeLogData:
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  mem->Unref();
+  return state;
+}
+
+class WriteBatchTest { };
+
+TEST(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ(0, batch.Count());
+}
+
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  batch.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  WriteBatchInternal::SetContents(&batch,
+                                  Slice(contents.data(),contents.size()-1));
+  ASSERT_EQ("Put(foo, bar)@200"
+            "Corruption: bad WriteBatch Delete",
+            PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Append) {
+  WriteBatch b1, b2;
+  WriteBatchInternal::SetSequence(&b1, 200);
+  WriteBatchInternal::SetSequence(&b2, 300);
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("",
+            PrintContents(&b1));
+  ASSERT_EQ(0, b1.Count());
+  b2.Put("a", "va");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200",
+            PrintContents(&b1));
+  ASSERT_EQ(1, b1.Count());
+  b2.Clear();
+  b2.Put("b", "vb");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@201",
+            PrintContents(&b1));
+  ASSERT_EQ(2, b1.Count());
+  b2.Delete("foo");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@202"
+            "Put(b, vb)@201"
+            "Delete(foo)@203",
+            PrintContents(&b1));
+  ASSERT_EQ(4, b1.Count());
+}
+
+namespace {
+  struct TestHandler : public WriteBatch::Handler {
+    std::string seen;
+    virtual void Put(const Slice& key, const Slice& value) {
+      seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+    }
+    virtual void LogData(const Slice& blob) {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual void Delete(const Slice& key) {
+      seen += "Delete(" + key.ToString() + ")";
+    }
+  };
+}
+
+TEST(WriteBatchTest, Blob) {
+  WriteBatch batch;
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.Put(Slice("k2"), Slice("v2"));
+  batch.Put(Slice("k3"), Slice("v3"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k2"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  ASSERT_EQ(5, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@4"
+            "Put(k1, v1)@0"
+            "Delete(k2)@3"
+            "Put(k2, v2)@1"
+            "Put(k3, v3)@2",
+            PrintContents(&batch));
+
+  TestHandler handler;
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "Put(k2, v2)"
+            "Put(k3, v3)"
+            "LogData(blob1)"
+            "Delete(k2)"
+            "LogData(blob2)"
+            "Merge(foo, bar)",
+            handler.seen);
+}
+
+TEST(WriteBatchTest, Continue) {
+  WriteBatch batch;
+
+  struct Handler : public TestHandler {
+    int num_seen = 0;
+    virtual void Put(const Slice& key, const Slice& value) {
+      ++num_seen;
+      TestHandler::Put(key, value);
+    }
+    virtual void Merge(const Slice& key, const Slice& value) {
+      ++num_seen;
+      TestHandler::Merge(key, value);
+    }
+    virtual void LogData(const Slice& blob) {
+      ++num_seen;
+      TestHandler::LogData(blob);
+    }
+    virtual void Delete(const Slice& key) {
+      ++num_seen;
+      TestHandler::Delete(key);
+    }
+    virtual bool Continue() override {
+      return num_seen < 3;
+    }
+  } handler;
+
+  batch.Put(Slice("k1"), Slice("v1"));
+  batch.PutLogData(Slice("blob1"));
+  batch.Delete(Slice("k1"));
+  batch.PutLogData(Slice("blob2"));
+  batch.Merge(Slice("foo"), Slice("bar"));
+  batch.Iterate(&handler);
+  ASSERT_EQ(
+            "Put(k1, v1)"
+            "LogData(blob1)"
+            "Delete(k1)",
+            handler.seen);
+}
+
+TEST(WriteBatchTest, PutGatherSlices) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+
+  {
+    // Try a write where the key is one slice but the value is two
+    Slice key_slice("baz");
+    Slice value_slices[2] = { Slice("header"), Slice("payload") };
+    batch.Put(SliceParts(&key_slice, 1),
+              SliceParts(value_slices, 2));
+  }
+
+  {
+    // One where the key is composite but the value is a single slice
+    Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") };
+    Slice value_slice("value");
+    batch.Put(SliceParts(key_slices, 3),
+              SliceParts(&value_slice, 1));
+  }
+
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ("Put(baz, headerpayload)@101"
+            "Put(foo, bar)@100"
+            "Put(keypart2part3, value)@102",
+            PrintContents(&batch));
+  ASSERT_EQ(3, batch.Count());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/doc/doc.css
+++ b/doc/doc.css
@ -0,0 +1,89 @@
+body {
+  margin-left: 0.5in;
+  margin-right: 0.5in;
+  background: white;
+  color: black;
+}
+
+h1 {
+  margin-left: -0.2in;
+  font-size: 14pt;
+}
+h2 {
+  margin-left: -0in;
+  font-size: 12pt;
+}
+h3 {
+  margin-left: -0in;
+}
+h4 {
+  margin-left: -0in;
+}
+hr {
+  margin-left: -0in;
+}
+
+/* Definition lists: definition term bold */
+dt {
+  font-weight: bold;
+}
+
+address {
+  text-align: center;
+}
+code,samp,var {
+  color: blue;
+}
+kbd {
+  color: #600000;
+}
+div.note p {
+  float: right;
+  width: 3in;
+  margin-right: 0%;
+  padding: 1px;
+  border: 2px solid #6060a0;
+  background-color: #fffff0;
+}
+
+ul {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+ol {
+  margin-top: -0em;
+  margin-bottom: -0em;
+}
+
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+p {
+  margin: 1em 0 1em 0;
+  padding: 0 0 0 0;
+}
+
+pre {
+  line-height: 1.3em;
+  padding: 0.4em 0 0.8em 0;
+  margin:  0 0 0 0;
+  border:  0 0 0 0;
+  color: blue;
+}
+
+.datatable {
+  margin-left: auto;
+  margin-right: auto;
+  margin-top: 2em;
+  margin-bottom: 2em;
+  border: 1px solid;
+}
+
+.datatable td,th {
+  padding: 0 0.5em 0 0.5em;
+  text-align: right;
+}
--- a/doc/index.html
+++ b/doc/index.html
@ -0,0 +1,831 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>RocksDB</title>
+</head>
+
+<body>
+<h1>RocksDB</h1>
+<address>The Facebook Database Engineering Team</address>
+<address>Build on earlier work on leveldb by Sanjay Ghemawat
+               (sanjay@google.com) and Jeff Dean (jeff@google.com)</address>
+<p>
+The <code>rocksdb</code> library provides a persistent key value store.  Keys and
+values are arbitrary byte arrays.  The keys are ordered within the key
+value store according to a user-specified comparator function.
+
+<p>
+<h1>Opening A Database</h1>
+<p>
+A <code>rocksdb</code> database has a name which corresponds to a file system
+directory.  All of the contents of database are stored in this
+directory.  The following example shows how to open a database,
+creating it if necessary:
+<p>
+<pre>
+  #include &lt;assert&gt;
+  #include "rocksdb/db.h"
+
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+If you want to raise an error if the database already exists, add
+the following line before the <code>rocksdb::DB::Open</code> call:
+<pre>
+  options.error_if_exists = true;
+</pre>
+<h1>Status</h1>
+<p>
+You may have noticed the <code>rocksdb::Status</code> type above.  Values of this
+type are returned by most functions in <code>rocksdb</code> that may encounter an
+error.  You can check if such a result is ok, and also print an
+associated error message:
+<p>
+<pre>
+   rocksdb::Status s = ...;
+   if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
+</pre>
+<h1>Closing A Database</h1>
+<p>
+When you are done with a database, just delete the database object.
+Example:
+<p>
+<pre>
+  ... open the db as described above ...
+  ... do something with db ...
+  delete db;
+</pre>
+<h1>Reads And Writes</h1>
+<p>
+The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
+modify/query the database.  For example, the following code
+moves the value stored under key1 to key2.
+<pre>
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) s = db-&gt;Put(rocksdb::WriteOptions(), key2, value);
+  if (s.ok()) s = db-&gt;Delete(rocksdb::WriteOptions(), key1);
+</pre>
+
+<h1>Atomic Updates</h1>
+<p>
+Note that if the process dies after the Put of key2 but before the
+delete of key1, the same value may be left stored under multiple keys.
+Such problems can be avoided by using the <code>WriteBatch</code> class to
+atomically apply a set of updates:
+<p>
+<pre>
+  #include "leveldb/write_batch.h"
+  ...
+  std::string value;
+  rocksdb::Status s = db-&gt;Get(rocksdb::ReadOptions(), key1, &amp;value);
+  if (s.ok()) {
+    rocksdb::WriteBatch batch;
+    batch.Delete(key1);
+    batch.Put(key2, value);
+    s = db-&gt;Write(rocksdb::WriteOptions(), &amp;batch);
+  }
+</pre>
+The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
+and these edits within the batch are applied in order.  Note that we
+called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
+we do not end up erroneously dropping the value entirely.
+<p>
+Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
+speed up bulk updates by placing lots of individual mutations into the
+same batch.
+
+<h1>Synchronous Writes</h1>
+By default, each write to <code>leveldb</code> is asynchronous: it
+returns after pushing the write from the process into the operating
+system.  The transfer from operating system memory to the underlying
+persistent storage happens asynchronously.  The <code>sync</code> flag
+can be turned on for a particular write to make the write operation
+not return until the data being written has been pushed all the way to
+persistent storage.  (On Posix systems, this is implemented by calling
+either <code>fsync(...)</code> or <code>fdatasync(...)</code> or
+<code>msync(..., MS_SYNC)</code> before the write operation returns.)
+<pre>
+  rocksdb::WriteOptions write_options;
+  write_options.sync = true;
+  db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a thousand times as fast as
+synchronous writes.  The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost.  Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can often be used safely.  For example, when
+loading a large amount of data into the database you can handle lost
+updates by restarting the bulk load after a crash.  A hybrid scheme is
+also possible where every Nth write is synchronous, and in the event
+of a crash, the bulk load is restarted just after the last synchronous
+write finished by the previous run.  (The synchronous write can update
+a marker that describes where to restart on a crash.)
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write (i.e.,
+<code>write_options.sync</code> is set to true).  The extra cost of
+the synchronous write will be amortized across all of the writes in
+the batch.
+
+<p>
+We also provide a way to completely disable Write Ahead Log for a
+particular write. If you set write_option.disableWAL to true, the
+write will not go to the log at all and may be lost in an event of
+process crash.
+
+<p>
+When opening a DB, you can disable syncing of data files by setting
+Options::disableDataSync to true. This can be useful when doing
+bulk-loading or big idempotent operations. Once the operation is
+finished, you can manually call sync() to flush all dirty buffers
+to stable storage.
+
+<p>
+RocksDB by default uses faster fdatasync() to sync files. If you want
+to use fsync(), you can set Options::use_fsync to true. You should set
+this to true on filesystems like ext3 that can lose files after a
+reboot.
+
+<p>
+<h1>Concurrency</h1>
+<p>
+A database may only be opened by one process at a time.
+The <code>rocksdb</code> implementation acquires a lock from the
+operating system to prevent misuse.  Within a single process, the
+same <code>rocksdb::DB</code> object may be safely shared by multiple
+concurrent threads.  I.e., different threads may write into or fetch
+iterators or call <code>Get</code> on the same database without any
+external synchronization (the leveldb implementation will
+automatically do the required synchronization).  However other objects
+(like Iterator and WriteBatch) may require external synchronization.
+If two threads share such an object, they must protect access to it
+using their own locking protocol.  More details are available in
+the public header files.
+
+<p>
+<h1>Merge operators</h1>
+<p>
+Merge operators provide efficient support for read-modify-write operation.
+More on the interface and implementation can be found on:
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator">
+    Merge Operator</a>
+<p>
+<a href="https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation">
+    Merge Operator Implementation</a>
+
+<p>
+<h1>Iteration</h1>
+<p>
+The following example demonstrates how to print all key,value pairs
+in a database.
+<p>
+<pre>
+  rocksdb::Iterator* it = db-&gt;NewIterator(rocksdb::ReadOptions());
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": "  &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
+  }
+  assert(it-&gt;status().ok());  // Check for any errors found during the scan
+  delete it;
+</pre>
+The following variation shows how to process just the keys in the
+range <code>[start,limit)</code>:
+<p>
+<pre>
+  for (it-&gt;Seek(start);
+       it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
+       it-&gt;Next()) {
+    ...
+  }
+</pre>
+You can also process entries in reverse order.  (Caveat: reverse
+iteration may be somewhat slower than forward iteration.)
+<p>
+<pre>
+  for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
+    ...
+  }
+</pre>
+<h1>Snapshots</h1>
+<p>
+Snapshots provide consistent read-only views over the entire state of
+the key-value store.  <code>ReadOptions::snapshot</code> may be non-NULL to indicate
+that a read should operate on a particular version of the DB state.
+If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
+implicit snapshot of the current state.
+<p>
+Snapshots are created by the DB::GetSnapshot() method:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.snapshot = db-&gt;GetSnapshot();
+  ... apply some updates to db ...
+  rocksdb::Iterator* iter = db-&gt;NewIterator(options);
+  ... read using iter to view the state when the snapshot was created ...
+  delete iter;
+  db-&gt;ReleaseSnapshot(options.snapshot);
+</pre>
+Note that when a snapshot is no longer needed, it should be released
+using the DB::ReleaseSnapshot interface.  This allows the
+implementation to get rid of state that was being maintained just to
+support reading as of that snapshot.
+<h1>Slice</h1>
+<p>
+The return value of the <code>it->key()</code> and <code>it->value()</code> calls above
+are instances of the <code>rocksdb::Slice</code> type.  <code>Slice</code> is a simple
+structure that contains a length and a pointer to an external byte
+array.  Returning a <code>Slice</code> is a cheaper alternative to returning a
+<code>std::string</code> since we do not need to copy potentially large keys and
+values.  In addition, <code>rocksdb</code> methods do not return null-terminated
+C-style strings since <code>rocksdb</code> keys and values are allowed to
+contain '\0' bytes.
+<p>
+C++ strings and null-terminated C-style strings can be easily converted
+to a Slice:
+<p>
+<pre>
+   rocksdb::Slice s1 = "hello";
+
+   std::string str("world");
+   rocksdb::Slice s2 = str;
+</pre>
+A Slice can be easily converted back to a C++ string:
+<pre>
+   std::string str = s1.ToString();
+   assert(str == std::string("hello"));
+</pre>
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array into which the Slice points remains live while
+the Slice is in use.  For example, the following is buggy:
+<p>
+<pre>
+   rocksdb::Slice slice;
+   if (...) {
+     std::string str = ...;
+     slice = str;
+   }
+   Use(slice);
+</pre>
+When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
+backing storage for <code>slice</code> will disappear.
+<p>
+<h1>Comparators</h1>
+<p>
+The preceding examples used the default ordering function for key,
+which orders bytes lexicographically.  You can however supply a custom
+comparator when opening a database.  For example, suppose each
+database key consists of two numbers and we should sort by the first
+number, breaking ties by the second number.  First, define a proper
+subclass of <code>rocksdb::Comparator</code> that expresses these rules:
+<p>
+<pre>
+  class TwoPartComparator : public rocksdb::Comparator {
+   public:
+    // Three-way comparison function:
+    //   if a &lt; b: negative result
+    //   if a &gt; b: positive result
+    //   else: zero result
+    int Compare(const rocksdb::Slice&amp; a, const rocksdb::Slice&amp; b) const {
+      int a1, a2, b1, b2;
+      ParseKey(a, &amp;a1, &amp;a2);
+      ParseKey(b, &amp;b1, &amp;b2);
+      if (a1 &lt; b1) return -1;
+      if (a1 &gt; b1) return +1;
+      if (a2 &lt; b2) return -1;
+      if (a2 &gt; b2) return +1;
+      return 0;
+    }
+
+    // Ignore the following methods for now:
+    const char* Name() const { return "TwoPartComparator"; }
+    void FindShortestSeparator(std::string*, const rocksdb::Slice&amp;) const { }
+    void FindShortSuccessor(std::string*) const { }
+  };
+</pre>
+Now create a database using this custom comparator:
+<p>
+<pre>
+  TwoPartComparator cmp;
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.comparator = &amp;cmp;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  ...
+</pre>
+<h2>Backwards compatibility</h2>
+<p>
+The result of the comparator's <code>Name</code> method is attached to the
+database when it is created, and is checked on every subsequent
+database open.  If the name changes, the <code>rocksdb::DB::Open</code> call will
+fail.  Therefore, change the name if and only if the new key format
+and comparison function are incompatible with existing databases, and
+it is ok to discard the contents of all existing databases.
+<p>
+You can however still gradually evolve your key format over time with
+a little bit of pre-planning.  For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by <code>TwoPartComparator</code>),
+(a) keep the same comparator name (b) increment the version number
+for new keys (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+
+
+<p>
+<h1>MemTable and Table factories</h1>
+<p>
+By default, we keep the data in memory in skiplist memtable and the data
+on disk in a table format described here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    RocksDB Table Format</a>.
+<p>
+Since one of the goals of RocksDB is to have
+different parts of the system easily pluggable, we support different
+implementations of both memtable and table format. You can supply
+your own memtable factory by setting <code>Options::memtable_factory</code>
+and your own table factory by setting <code>Options::table_factory</code>.
+For available memtable factories, please refer to
+<code>rocksdb/memtablerep.h</code> and for table factores to
+<code>rocksdb/table.h</code>. These features are both in active development
+and please be wary of any API changes that might break your application
+going forward.
+<p>
+You can also read more about memtables here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#memtables">
+Memtables wiki
+</a>
+
+<p>
+<h1>Performance</h1>
+<p>
+Performance can be tuned by changing the default values of the
+types defined in <code>include/rocksdb/options.h</code>.
+
+<p>
+<h2>Block size</h2>
+<p>
+<code>rocksdb</code> groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage.  The
+default block size is approximately 4096 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size.  Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement.  There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes.  Also note that compression will be more effective
+with larger block sizes. To change block size parameter, use
+<code>Options::block_size</code>.
+<p>
+<h2>Write buffer</h2>
+<p>
+<code>Options::write_buffer_size</code> specifies the amount of data
+to build up in memory before converting to a sorted on-disk file.
+Larger values increase performance, especially during bulk loads.
+Up to max_write_buffer_number write buffers may be held in memory
+at the same time,
+so you may wish to adjust this parameter to control memory usage.
+Also, a larger write buffer will result in a longer recovery time
+the next time the database is opened.
+Related option is
+<code>Options::max_write_buffer_number</code>, which is maximum number
+of write buffers that are built up in memory. The default is 2, so that
+when 1 write buffer is being flushed to storage, new writes can continue
+to the other write buffer.
+<code>Options::min_write_buffer_number_to_merge</code> is the minimum number
+of write buffers that will be merged together before writing to storage.
+If set to 1, then all write buffers are fushed to L0 as individual files and
+this increases read amplification because a get request has to check in all
+of these files. Also, an in-memory merge may result in writing lesser
+data to storage if there are duplicate records in each of these
+individual write buffers.  Default: 1
+<p>
+<h2>Compression</h2>
+<p>
+Each block is individually compressed before being written to
+persistent storage.  Compression is on by default since the default
+compression method is very fast, and is automatically disabled for
+uncompressible data.  In rare cases, applications may want to disable
+compression entirely, but should only do so if benchmarks show a
+performance improvement:
+<p>
+<pre>
+  rocksdb::Options options;
+  options.compression = rocksdb::kNoCompression;
+  ... rocksdb::DB::Open(options, name, ...) ....
+</pre>
+<h2>Cache</h2>
+<p>
+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks.  If
+<code>options.block_cache</code> is non-NULL, it is used to cache frequently
+used uncompressed block contents. If <code>options.block_cache_compressed</code>
+is non-NULL, it is used to cache frequently used compressed blocks. Compressed
+cache is an alternative to OS cache, which also caches compressed blocks. If
+compressed cache is used, the OS cache will be disabled automatically by setting
+<code>options.allow_os_buffer</code> to false.
+<p>
+<pre>
+  #include "rocksdb/cache.h"
+
+  rocksdb::Options options;
+  options.block_cache = rocksdb::NewLRUCache(100 * 1048576);  // 100MB uncompressed cache
+  options.block_cache_compressed = rocksdb::NewLRUCache(100 * 1048576);  // 100MB compressed cache
+  rocksdb::DB* db;
+  rocksdb::DB::Open(options, name, &db);
+  ... use the db ...
+  delete db
+  delete options.block_cache;
+  delete options.block_cache_compressed;
+</pre>
+<p>
+When performing a bulk read, the application may wish to disable
+caching so that the data processed by the bulk read does not end up
+displacing most of the cached contents.  A per-iterator option can be
+used to achieve this:
+<p>
+<pre>
+  rocksdb::ReadOptions options;
+  options.fill_cache = false;
+  rocksdb::Iterator* it = db-&gt;NewIterator(options);
+  for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+    ...
+  }
+</pre>
+<p>
+You can also disable block cache by setting <code>options.no_block_cache</code>
+to true.
+<h2>Key Layout</h2>
+<p>
+Note that the unit of disk transfer and caching is a block.  Adjacent
+keys (according to the database sort order) will usually be placed in
+the same block.  Therefore the application can improve its performance
+by placing keys that are accessed together near each other and placing
+infrequently used keys in a separate region of the key space.
+<p>
+For example, suppose we are implementing a simple file system on top
+of <code>rocksdb</code>.  The types of entries we might wish to store are:
+<p>
+<pre>
+   filename -&gt; permission-bits, length, list of file_block_ids
+   file_block_id -&gt; data
+</pre>
+We might want to prefix <code>filename</code> keys with one letter (say '/') and the
+<code>file_block_id</code> keys with a different letter (say '0') so that scans
+over just the metadata do not force us to fetch and cache bulky file
+contents.
+<p>
+<h2>Filters</h2>
+<p>
+Because of the way <code>rocksdb</code> data is organized on disk,
+a single <code>Get()</code> call may involve multiple reads from disk.
+The optional <code>FilterPolicy</code> mechanism can be used to reduce
+the number of disk reads substantially.
+<pre>
+   rocksdb::Options options;
+   options.filter_policy = NewBloomFilter(10);
+   rocksdb::DB* db;
+   rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+   ... use the database ...
+   delete db;
+   delete options.filter_policy;
+</pre>
+The preceding code associates a
+<a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
+based filtering policy with the database.  Bloom filter based
+filtering relies on keeping some number of bits of data in memory per
+key (in this case 10 bits per key since that is the argument we passed
+to NewBloomFilter).  This filter will reduce the number of unnecessary
+disk reads needed for <code>Get()</code> calls by a factor of
+approximately a 100.  Increasing the bits per key will lead to a
+larger reduction at the cost of more memory usage.  We recommend that
+applications whose working set does not fit in memory and that do a
+lot of random reads set a filter policy.
+<p>
+If you are using a custom comparator, you should ensure that the filter
+policy you are using is compatible with your comparator.  For example,
+consider a comparator that ignores trailing spaces when comparing keys.
+<code>NewBloomFilter</code> must not be used with such a comparator.
+Instead, the application should provide a custom filter policy that
+also ignores trailing spaces.  For example:
+<pre>
+  class CustomFilterPolicy : public rocksdb::FilterPolicy {
+   private:
+    FilterPolicy* builtin_policy_;
+   public:
+    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
+    ~CustomFilterPolicy() { delete builtin_policy_; }
+
+    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
+
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      std::vector&lt;Slice&gt; trimmed(n);
+      for (int i = 0; i &lt; n; i++) {
+        trimmed[i] = RemoveTrailingSpaces(keys[i]);
+      }
+      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
+    }
+
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      // Use builtin bloom filter code after removing trailing spaces
+      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
+    }
+  };
+</pre>
+<p>
+Advanced applications may provide a filter policy that does not use
+a bloom filter but uses some other mechanism for summarizing a set
+of keys.  See <code>rocksdb/filter_policy.h</code> for detail.
+<p>
+<h1>Checksums</h1>
+<p>
+<code>rocksdb</code> associates checksums with all data it stores in the file system.
+There are two separate controls provided over how aggressively these
+checksums are verified:
+<p>
+<ul>
+<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
+  checksum verification of all data that is read from the file system on
+  behalf of a particular read.  By default, no such verification is
+  done.
+<p>
+<li> <code>Options::paranoid_checks</code> may be set to true before opening a
+  database to make the database implementation raise an error as soon as
+  it detects an internal corruption.  Depending on which portion of the
+  database has been corrupted, the error may be raised when the database
+  is opened, or later by another database operation.  By default,
+  paranoid checking is off so that the database can be used even if
+  parts of its persistent storage have been corrupted.
+<p>
+  If a database is corrupted (perhaps it cannot be opened when
+  paranoid checking is turned on), the <code>rocksdb::RepairDB</code> function
+  may be used to recover as much of the data as possible.
+<p>
+</ul>
+
+<p>
+<h1>Compaction</h1>
+<p>
+You can read more on Compactions here:
+<a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide#multi-threaded-compactions">
+    Multi-threaded compactions
+</a>
+<p>
+Here we give overview of the options that impact behavior of Compactions:
+<ul>
+<p>
+<li><code>Options::compaction_style</code> - RocksDB currently supports two
+compaction algorithms - Universal  style and Level style. This option switches
+between the two.  Can be kCompactionStyleUniversal or kCompactionStyleLevel.
+If this is kCompactionStyleUniversal, then you can configure universal style
+parameters with <code>Options::compaction_options_universal</code>.
+<p>
+<li><code>Options::disable_auto_compactions</code> - Disable automatic compactions.
+Manual compactions can still be issued on this database.
+<p>
+<li><code>Options::compaction_filter</code> - Allows an application to modify/delete
+a key-value during background compaction. The client must provide
+compaction_filter_factory if it requires a new compaction filter to be used
+for different compaction processes. Client should specify only one of filter
+or factory.
+<p>
+<li><code>Options::compaction_filter_factory</code> - a factory that provides
+compaction filter objects which allow an application to modify/delete a
+key-value during background compaction.
+</ul>
+<p>
+Other options impacting performance of compactions and when they get triggered
+are: 
+<ul>
+<p>
+<li> <code>Options::access_hint_on_compaction_start</code> - Specify the file access 
+pattern once a compaction is started. It will be applied to all input files of a compaction. Default: NORMAL
+<p>
+<li> <code>Options::level0_file_num_compaction_trigger</code> -  Number of files to trigger level-0 compaction. 
+A negative value means that level-0 compaction will not be triggered by number of files at all.
+<p>
+<li> <code>Options::max_mem_compaction_level</code> -  Maximum level to which a new compacted memtable is pushed if it
+does not create overlap.  We try to push to level 2 to avoid the relatively expensive level 0=>1 compactions and to avoid some
+expensive manifest file operations.  We do not push all the way to the largest level since that can generate a lot of wasted disk
+space if the same key space is being repeatedly overwritten.
+<p>
+<li> <code>Options::target_file_size_base</code> and <code>Options::target_file_size_multiplier</code> - 
+Target file size for compaction.  target_file_size_base is per-file size for level-1.
+Target file size for level L can be calculated by target_file_size_base * (target_file_size_multiplier ^ (L-1))
+For example, if target_file_size_base is 2MB and target_file_size_multiplier is 10, then each file on level-1 will
+be 2MB, and each file on level 2 will be 20MB, and each file on level-3 will be 200MB. Default target_file_size_base is 2MB
+and default target_file_size_multiplier is 1.
+<p>
+<li> <code>Options::expanded_compaction_factor</code> -  Maximum number of bytes in all compacted files.  We avoid expanding
+the lower level file set of a compaction if it would make the total compaction cover more than
+(expanded_compaction_factor * targetFileSizeLevel()) many bytes.
+<p>
+<li> <code>Options::source_compaction_factor</code> -    Maximum number of bytes in all source files to be compacted in a
+single compaction run. We avoid picking too many files in the source level so that we do not exceed the total source bytes
+for compaction to exceed (source_compaction_factor * targetFileSizeLevel()) many bytes.
+Default:1, i.e. pick maxfilesize amount of data as the source of a compaction.
+<p>
+<li> <code>Options::max_grandparent_overlap_factor</code> -   Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+stop building a single file in a level->level+1 compaction.
+<p>
+<li> <code>Options::disable_seek_compaction</code> -  Disable compaction triggered by seek.
+With bloomfilter and fast storage, a miss on one level is very cheap if the file handle is cached in table cache
+(which is true if max_open_files is large).
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs, submitted to
+the default LOW priority thread pool
+</ul>
+
+<p>
+You can learn more about all of those options in <code>rocksdb/options.h</code>
+
+<h2> Universal style compaction specific settings</h2>
+<p>
+If you're using Universal style compaction, there is an object <code>CompactionOptionsUniversal</code>
+that hold all the different options for that compaction. The exact definition is in
+<code>rocksdb/universal_compaction.h</code> and you can set it in <code>Options::compaction_options_universal</code>.
+Here we give short overview of options in <code>CompactionOptionsUniversal</code>:
+<ul>
+<p>
+<li> <code>CompactionOptionsUniversal::size_ratio</code> - Percentage flexibilty while comparing file size. If the candidate file(s)
+   size is 1% smaller than the next file's size, then include next file into
+   this candidate set.  Default: 1
+<p>
+<li> <code>CompactionOptionsUniversal::min_merge_width</code> - The minimum number of files in a single compaction run. Default: 2
+<p>
+<li> <code>CompactionOptionsUniversal::max_merge_width</code> - The maximum number of files in a single compaction run. Default: UINT_MAX
+<p>
+<li> <code>CompactionOptionsUniversal::max_size_amplification_percent</code> - The size amplification is defined as the amount (in percentage) of
+additional storage needed to store a single byte of data in the database.  For example, a size amplification of 2% means that a database that
+contains 100 bytes of user-data may occupy upto 102 bytes of physical storage. By this definition, a fully compacted database has
+a size amplification of 0%. Rocksdb uses the following heuristic to calculate size amplification: it assumes that all files excluding
+the earliest file contribute to the size amplification.  Default: 200, which means that a 100 byte database could require upto
+300 bytes of storage.
+<p>
+<li> <code>CompactionOptionsUniversal::compression_size_percent</code> - If this option is set to be -1 (the default value), all the output files
+will follow compression type specified.  If this option is not negative, we will try to make sure compressed
+size is just above this value. In normal cases, at least this percentage
+of data will be compressed.
+When we are compacting to a new file, here is the criteria whether
+it needs to be compressed: assuming here are the list of files sorted
+by generation time: [ A1...An B1...Bm C1...Ct ],
+where A1 is the newest and Ct is the oldest, and we are going to compact
+B1...Bm, we calculate the total size of all the files as total_size, as
+well as  the total size of C1...Ct as total_C, the compaction output file
+will be compressed iff total_C / total_size < this percentage
+<p>
+<li> <code>CompactionOptionsUniversal::stop_style</code> - The algorithm used to stop picking files into a single compaction run.
+Can be kCompactionStopStyleSimilarSize (pick files of similar size) or kCompactionStopStyleTotalSize (total size of picked files > next file).
+Default: kCompactionStopStyleTotalSize
+</ul>
+
+<h1>Thread pools</h1>
+<p>
+A thread pool is associated with Env environment object. The client has to create a thread pool by setting the number of background
+threads using method <code>Env::SetBackgroundThreads()</code> defined in <code>rocksdb/env.h</code>.
+We use the thread pool for compactions and memtable flushes.
+Since memtable flushes are in critical code path (stalling memtable flush can stall writes, increasing p99), we suggest 
+having two thread pools - with priorities HIGH and LOW. Memtable flushes can be set up to be scheduled on HIGH thread pool.
+There are two options available for configuration of background compactions and flushes:
+<ul>
+<p>
+<li> <code>Options::max_background_compactions</code> - Maximum number of concurrent background jobs,
+submitted to the default LOW priority thread pool
+<p>
+<li> <code>Options::max_background_flushes</code> - Maximum number of concurrent background memtable flush jobs, submitted to
+the HIGH priority thread pool.  By default, all background jobs (major compaction and memtable flush) go
+to the LOW priority pool. If this option is set to a positive number, memtable flush jobs will be submitted to the HIGH priority pool.
+It is important when the same Env is shared by multiple db instances.  Without a separate pool, long running major compaction jobs could
+potentially block memtable flush jobs of other db instances, leading to unnecessary Put stalls.
+</ul>
+<p>
+<pre>
+  #include "rocksdb/env.h"
+  #include "rocksdb/db.h"
+
+  auto env = rocksdb::Env::Default();
+  env->SetBackgroundThreads(2, rocksdb::Env::LOW);
+  env->SetBackgroundThreads(1, rocksdb::Env::HIGH);
+  rocksdb::DB* db;
+  rocksdb::Options options;
+  options.env = env;
+  options.max_background_compactions = 2;
+  options.max_background_flushes = 1;
+  rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &amp;db);
+  assert(status.ok());
+  ...
+</pre>
+<h1>Approximate Sizes</h1>
+<p>
+The <code>GetApproximateSizes</code> method can used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+<p>
+<pre>
+   rocksdb::Range ranges[2];
+   ranges[0] = rocksdb::Range("a", "c");
+   ranges[1] = rocksdb::Range("x", "z");
+   uint64_t sizes[2];
+   rocksdb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
+</pre>
+The preceding call will set <code>sizes[0]</code> to the approximate number of
+bytes of file system space used by the key range <code>[a..c)</code> and
+<code>sizes[1]</code> to the approximate number of bytes used by the key range
+<code>[x..z)</code>.
+<p>
+<h1>Environment</h1>
+<p>
+All file operations (and other operating system calls) issued by the
+<code>rocksdb</code> implementation are routed through a <code>rocksdb::Env</code> object.
+Sophisticated clients may wish to provide their own <code>Env</code>
+implementation to get better control.  For example, an application may
+introduce artificial delays in the file IO paths to limit the impact
+of <code>rocksdb</code> on other activities in the system.
+<p>
+<pre>
+  class SlowEnv : public rocksdb::Env {
+    .. implementation of the Env interface ...
+  };
+
+  SlowEnv env;
+  rocksdb::Options options;
+  options.env = &amp;env;
+  Status s = rocksdb::DB::Open(options, ...);
+</pre>
+<h1>Porting</h1>
+<p>
+<code>rocksdb</code> may be ported to a new platform by providing platform
+specific implementations of the types/methods/functions exported by
+<code>rocksdb/port/port.h</code>.  See <code>rocksdb/port/port_example.h</code> for more
+details.
+<p>
+In addition, the new platform may need a new default <code>rocksdb::Env</code>
+implementation.  See <code>rocksdb/util/env_posix.h</code> for an example.
+
+<h1>Statistics</h1>
+<p>
+To be able to efficiently tune your application, it is always helpful if you
+have access to usage statistics. You can collect those statistics by setting
+<code>Options::table_stats_collectors</code> or
+<code>Options::statistics</code>. For more information, refer to
+<code>rocksdb/table_stats.h</code> and <code>rocksdb/statistics.h</code>.
+These should not add significant overhead to your application and we
+recommend exporting them to other monitoring tools.
+
+<h1>Purging WAL files</h1>
+<p>
+By default, old write-ahead logs are deleted automatically when they fall out
+of scope and application doesn't need them anymore. There are options that
+enable the user to archive the logs and then delete them lazily, either in
+TTL fashion or based on size limit.
+
+The options are <code>Options::WAL_ttl_seconds</code> and
+<code>Options::WAL_size_limit_MB</code>. Here is how they can be used:
+<ul>
+<li>
+<p>
+If both set to 0, logs will be deleted asap and will never get into the archive.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is 0 and WAL_size_limit_MB is not 0, WAL
+files will be checked every 10 min and if total size is greater then
+<code>WAL_size_limit_MB</code>, they will be deleted starting with the
+earliest until size_limit is met. All empty files will be deleted.
+<li>
+<p>
+If <code>WAL_ttl_seconds</code> is not 0 and WAL_size_limit_MB is 0, then
+WAL files will be checked every <code>WAL_ttl_seconds / 2</code> and those
+that are older than WAL_ttl_seconds will be deleted.
+<li>
+<p>
+If both are not 0, WAL files will be checked every 10 min and both
+checks will be performed with ttl being first.
+</ul>
+
+<h1>Other Information</h1>
+<p>
+Details about the <code>rocksdb</code> implementation may be found in
+the following documents:
+<ul>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide">
+  RocksDB Architecture Guide</a>
+<li> <a href="https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format">
+    Format of an immutable Table file</a>
+<li> <a href="log_format.txt">Format of a log file</a>
+</ul>
+
+</body>
+</html>
--- a/doc/log_format.txt
+++ b/doc/log_format.txt
@ -0,0 +1,75 @@
+The log file contents are a sequence of 32KB blocks.  The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+   block := record* trailer?
+   record :=
+	checksum: uint32	// crc32c of type and data[]
+	length: uint16
+	type: uint8		// One of FULL, FIRST, MIDDLE, LAST
+	data: uint8[length]
+
+A record never starts within the last six bytes of a block (since it
+won't fit).  Any leftover bytes here form the trailer, which must
+consist entirely of zero bytes and must be skipped by readers.  
+
+Aside: if exactly seven bytes are left in the current block, and a new
+non-zero length record is added, the writer must emit a FIRST record
+(which contains zero bytes of user data) to fill up the trailing seven
+bytes of the block and then emit all of the user data in subsequent
+blocks.
+
+More types may be added in the future.  Some Readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MID is the type of all
+interior fragments of a user record.
+
+Example: consider a sequence of user records:
+   A: length 1000
+   B: length 97270
+   C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block.  This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan.  If there is a corruption, skip to the next
+block.  As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records.  This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression.  Again, this could be fixed by adding new record types.
--- a/doc/rockslogo.jpg
+++ b/doc/rockslogo.jpg
--- a/doc/rockslogo.png
+++ b/doc/rockslogo.png
--- a/hdfs/README
+++ b/hdfs/README
@ -0,0 +1,26 @@
+This directory contains the hdfs extensions needed to make rocksdb store
+files in HDFS.
+
+The hdfs.h file is copied from the Apache Hadoop 1.0 source code. 
+It defines the libhdfs library
+(http://hadoop.apache.org/common/docs/r0.20.2/libhdfs.html) to access 
+data in HDFS.  The libhdfs.a is copied from the Apache Hadoop 1.0 build. 
+It implements the API defined in hdfs.h. If your hadoop cluster is running
+a different hadoop release, then install these two files manually from your
+hadoop distribution and then recompile rocksdb.
+
+The env_hdfs.h file defines the rocksdb objects that are needed to talk to an
+underlying filesystem. 
+
+If you want to compile rocksdb with hdfs support, please set the following
+enviroment variables appropriately:
+   USE_HDFS=1
+   JAVA_HOME=/usr/local/jdk-6u22-64
+   LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/jdk-6u22-64/jre/lib/amd64/server:/usr/local/jdk-6u22-64/jre/lib/amd64/:./snappy/libs
+   make clean all db_bench
+
+To run dbbench,
+  set CLASSPATH to include your hadoop distribution
+  db_bench --hdfs="hdfs://hbaseudbperf001.snc1.facebook.com:9000"
+
+
--- a/hdfs/env_hdfs.h
+++ b/hdfs/env_hdfs.h
@ -0,0 +1,302 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#pragma once
+#include <algorithm>
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <iostream>
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+
+#ifdef USE_HDFS
+#include "hdfs/hdfs.h"
+
+namespace rocksdb {
+
+static const std::string kProto = "hdfs://";
+static const std::string pathsep = "/";
+
+// Thrown during execution when there is an issue with the supplied
+// arguments.
+class HdfsUsageException : public std::exception { };
+
+// A simple exception that indicates something went wrong that is not
+// recoverable.  The intention is for the message to be printed (with
+// nothing else) and the process terminate.
+class HdfsFatalException : public std::exception {
+public:
+  explicit HdfsFatalException(const std::string& s) : what_(s) { }
+  virtual ~HdfsFatalException() throw() { }
+  virtual const char* what() const throw() {
+    return what_.c_str();
+  }
+private:
+  const std::string what_;
+};
+
+//
+// The HDFS environment for rocksdb. This class overrides all the
+// file/dir access methods and delegates the thread-mgmt methods to the
+// default posix environment.
+//
+class HdfsEnv : public Env {
+
+ public:
+  HdfsEnv(const std::string& fsname) : fsname_(fsname) {
+    posixEnv = Env::Default();
+    fileSys_ = connectToPath(fsname_);
+  }
+
+  virtual ~HdfsEnv() {
+    fprintf(stderr, "Destroying HdfsEnv::Default()\n");
+    hdfsDisconnect(fileSys_);
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   SequentialFile** result);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     RandomAccessFile** result);
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result);
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options);
+
+  virtual bool FileExists(const std::string& fname);
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result);
+
+  virtual Status DeleteFile(const std::string& fname);
+
+  virtual Status CreateDir(const std::string& name);
+
+  virtual Status CreateDirIfMissing(const std::string& name);
+
+  virtual Status DeleteDir(const std::string& name);
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime);
+
+  virtual Status RenameFile(const std::string& src, const std::string& target);
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock);
+
+  virtual Status UnlockFile(FileLock* lock);
+
+  virtual Status NewLogger(const std::string& fname, Logger** result);
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {
+    posixEnv->Schedule(function, arg, pri);
+  }
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {
+    posixEnv->StartThread(function, arg);
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    return posixEnv->GetTestDirectory(path);
+  }
+
+  virtual uint64_t NowMicros() {
+    return posixEnv->NowMicros();
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    posixEnv->SleepForMicroseconds(micros);
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) {
+    return posixEnv->GetHostName(name, len);
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {
+    return posixEnv->GetCurrentTime(unix_time);
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return posixEnv->GetAbsolutePath(db_path, output_path);
+  }
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
+    posixEnv->SetBackgroundThreads(number, pri);
+  }
+
+  virtual std::string TimeToString(uint64_t number) {
+    return posixEnv->TimeToString(number);
+  }
+
+  static uint64_t gettid() {
+    assert(sizeof(pthread_t) <= sizeof(uint64_t));
+    return (uint64_t)pthread_self();
+  }
+
+ private:
+  std::string fsname_;  // string of the form "hdfs://hostname:port/"
+  hdfsFS fileSys_;      //  a single FileSystem object for all files
+  Env*  posixEnv;       // This object is derived from Env, but not from
+                        // posixEnv. We have posixnv as an encapsulated
+                        // object here so that we can use posix timers,
+                        // posix threads, etc.
+
+  /**
+   * If the URI is specified of the form hdfs://server:port/path,
+   * then connect to the specified cluster
+   * else connect to default.
+   */
+  hdfsFS connectToPath(const std::string& uri) {
+    if (uri.empty()) {
+      return NULL;
+    }
+    if (uri.find(kProto) != 0) {
+      // uri doesn't start with hdfs:// -> use default:0, which is special
+      // to libhdfs.
+      return hdfsConnectNewInstance("default", 0);
+    }
+    const std::string hostport = uri.substr(kProto.length());
+
+    std::vector <std::string> parts;
+    split(hostport, ':', parts);
+    if (parts.size() != 2) {
+      throw HdfsFatalException("Bad uri for hdfs " + uri);
+    }
+    // parts[0] = hosts, parts[1] = port/xxx/yyy
+    std::string host(parts[0]);
+    std::string remaining(parts[1]);
+
+    int rem = remaining.find(pathsep);
+    std::string portStr = (rem == 0 ? remaining :
+                           remaining.substr(0, rem));
+
+    tPort port;
+    port = atoi(portStr.c_str());
+    if (port == 0) {
+      throw HdfsFatalException("Bad host-port for hdfs " + uri);
+    }
+    hdfsFS fs = hdfsConnectNewInstance(host.c_str(), port);
+    return fs;
+  }
+
+  void split(const std::string &s, char delim,
+             std::vector<std::string> &elems) {
+    elems.clear();
+    size_t prev = 0;
+    size_t pos = s.find(delim);
+    while (pos != std::string::npos) {
+      elems.push_back(s.substr(prev, pos));
+      prev = pos + 1;
+      pos = s.find(delim, prev);
+    }
+    elems.push_back(s.substr(prev, s.size()));
+  }
+};
+
+}  // namespace rocksdb
+
+#else // USE_HDFS
+
+
+namespace rocksdb {
+
+static const Status notsup;
+
+class HdfsEnv : public Env {
+
+ public:
+  HdfsEnv(const std::string& fsname) {
+    fprintf(stderr, "You have not build rocksdb with HDFS support\n");
+    fprintf(stderr, "Please see hdfs/README for details\n");
+    throw new std::exception();
+  }
+
+  virtual ~HdfsEnv() {
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options);
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) {
+    return notsup;
+  }
+
+  virtual bool FileExists(const std::string& fname){return false;}
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* result){return notsup;}
+
+  virtual Status DeleteFile(const std::string& fname){return notsup;}
+
+  virtual Status CreateDir(const std::string& name){return notsup;}
+
+  virtual Status CreateDirIfMissing(const std::string& name){return notsup;}
+
+  virtual Status DeleteDir(const std::string& name){return notsup;}
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* size){return notsup;}
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return notsup;
+  }
+
+  virtual Status RenameFile(const std::string& src, const std::string& target){return notsup;}
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;}
+
+  virtual Status UnlockFile(FileLock* lock){return notsup;}
+
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result){return notsup;}
+
+  virtual void Schedule(void (*function)(void* arg), void* arg,
+                        Priority pri = LOW) {}
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) {}
+
+  virtual Status GetTestDirectory(std::string* path) {return notsup;}
+
+  virtual uint64_t NowMicros() {return 0;}
+
+  virtual void SleepForMicroseconds(int micros) {}
+
+  virtual Status GetHostName(char* name, uint64_t len) {return notsup;}
+
+  virtual Status GetCurrentTime(int64_t* unix_time) {return notsup;}
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* outputpath) {return notsup;}
+
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {}
+
+  virtual std::string TimeToString(uint64_t number) { return "";}
+};
+}
+
+#endif // USE_HDFS
--- a/hdfs/hdfs.h
+++ b/hdfs/hdfs.h
@ -0,0 +1,477 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef LIBHDFS_HDFS_H
+#define LIBHDFS_HDFS_H
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include <jni.h>
+
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#endif
+
+#ifndef O_WRONLY 
+#define O_WRONLY 2
+#endif
+
+#ifndef EINTERNAL
+#define EINTERNAL 255 
+#endif
+
+
+/** All APIs set errno to meaningful values */
+#ifdef __cplusplus
+extern  "C" {
+#endif
+
+    /**
+     * Some utility decls used in libhdfs.
+     */
+
+    typedef int32_t   tSize; /// size of data for read/write io ops 
+    typedef time_t    tTime; /// time type in seconds
+    typedef int64_t   tOffset;/// offset within the file
+    typedef uint16_t  tPort; /// port
+    typedef enum tObjectKind {
+        kObjectKindFile = 'F',
+        kObjectKindDirectory = 'D',
+    } tObjectKind;
+
+
+    /**
+     * The C reflection of org.apache.org.hadoop.FileSystem .
+     */
+    typedef void* hdfsFS;
+
+    
+    /**
+     * The C equivalent of org.apache.org.hadoop.FSData(Input|Output)Stream .
+     */
+    enum hdfsStreamType
+    {
+        UNINITIALIZED = 0,
+        INPUT = 1,
+        OUTPUT = 2,
+    };
+
+    
+    /**
+     * The 'file-handle' to a file in hdfs.
+     */
+    struct hdfsFile_internal {
+        void* file;
+        enum hdfsStreamType type;
+    };
+    typedef struct hdfsFile_internal* hdfsFile;
+      
+
+    /** 
+     * hdfsConnectAsUser - Connect to a hdfs file system as a specific user
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @param user the user name (this is hadoop domain user). Or NULL is equivelant to hhdfsConnect(host, port)
+     * @param groups the groups (these are hadoop domain groups)
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+
+
+    /** 
+     * hdfsConnect - Connect to a hdfs file system.
+     * Connect to the hdfs.
+     * @param host A string containing either a host name, or an ip address
+     * of the namenode of a hdfs cluster. 'host' should be passed as NULL if
+     * you want to connect to local filesystem. 'host' should be passed as
+     * 'default' (and port as 0) to used the 'configured' filesystem
+     * (core-site/core-default.xml).
+     * @param port The port on which the server is listening.
+     * @return Returns a handle to the filesystem or NULL on error.
+     */
+     hdfsFS hdfsConnect(const char* host, tPort port);
+
+
+    /**
+     * This are the same as hdfsConnectAsUser except that every invocation returns a new FileSystem handle.
+     * Applications should call a hdfsDisconnect for every call to hdfsConnectAsUserNewInstance.
+     */
+     hdfsFS hdfsConnectAsUserNewInstance(const char* host, tPort port, const char *user , const char *groups[], int groups_size );
+     hdfsFS hdfsConnectNewInstance(const char* host, tPort port);
+     hdfsFS hdfsConnectPath(const char* uri);
+
+    /** 
+     * hdfsDisconnect - Disconnect from the hdfs file system.
+     * Disconnect from hdfs.
+     * @param fs The configured filesystem handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsDisconnect(hdfsFS fs);
+        
+
+    /** 
+     * hdfsOpenFile - Open a hdfs file in given mode.
+     * @param fs The configured filesystem handle.
+     * @param path The full path to the file.
+     * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNCAT), 
+     * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno equal ENOTSUP.
+     * @param bufferSize Size of buffer for read/write - pass 0 if you want
+     * to use the default configured values.
+     * @param replication Block replication - pass 0 if you want to use
+     * the default configured values.
+     * @param blocksize Size of block - pass 0 if you want to use the
+     * default configured values.
+     * @return Returns the handle to the open file or NULL on error.
+     */
+    hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+                          int bufferSize, short replication, tSize blocksize);
+
+
+    /** 
+     * hdfsCloseFile - Close an open file. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsExists - Checks if a given path exsits on the filesystem 
+     * @param fs The configured filesystem handle.
+     * @param path The path to look for
+     * @return Returns 0 on exists, 1 on non-exists, -1/-2 on error.  
+     */
+    int hdfsExists(hdfsFS fs, const char *path);
+
+
+    /** 
+     * hdfsSeek - Seek to given offset in file. 
+     * This works only for files opened in read-only mode. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param desiredPos Offset into the file to seek into.
+     * @return Returns 0 on success, -1 on error.  
+     */
+    int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos); 
+
+
+    /** 
+     * hdfsTell - Get the current offset in the file, in bytes.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Current offset, -1 on error.
+     */
+    tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+
+
+    /** 
+     * hdfsRead - Read data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less
+     * than than length;-1 on error.
+     */
+    tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
+
+
+    /** 
+     * hdfsPread - Positional read of data from an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param position Position from which to read
+     * @param buffer The buffer to copy read bytes into.
+     * @param length The length of the buffer.
+     * @return Returns the number of bytes actually read, possibly less than
+     * than length;-1 on error.
+     */
+    tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
+                    void* buffer, tSize length);
+
+
+    /** 
+     * hdfsWrite - Write data into an open file.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @param buffer The data.
+     * @param length The no. of bytes to write. 
+     * @return Returns the number of bytes written, -1 on error.
+     */
+    tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
+                    tSize length);
+
+
+    /** 
+     * hdfsWrite - Flush the data. 
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsFlush(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsSync - Sync the data to persistent store.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns 0 on success, -1 on error.
+     */
+    int hdfsSync(hdfsFS fs, hdfsFile file);
+
+    /**
+     * hdfsGetNumReplicasInPipeline - get number of remaining replicas in 
+     * pipeline
+     * @param fs The configured filesystem handle
+     * @param file the file handle
+     * @return returns the # of datanodes in the write pipeline; -1 on error
+     */
+   int hdfsGetNumCurrentReplicas(hdfsFS, hdfsFile file);
+
+    /**
+     * hdfsAvailable - Number of bytes that can be read from this
+     * input stream without blocking.
+     * @param fs The configured filesystem handle.
+     * @param file The file handle.
+     * @return Returns available bytes; -1 on error. 
+     */
+    int hdfsAvailable(hdfsFS fs, hdfsFile file);
+
+
+    /**
+     * hdfsCopy - Copy file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsMove - Move file from one filesystem to another.
+     * @param srcFS The handle to source filesystem.
+     * @param src The path of source file. 
+     * @param dstFS The handle to destination filesystem.
+     * @param dst The path of destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+    /**
+     * hdfsDelete - Delete file. 
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsDelete(hdfsFS fs, const char* path);
+
+
+    /**
+     * hdfsRename - Rename file. 
+     * @param fs The configured filesystem handle.
+     * @param oldPath The path of the source file. 
+     * @param newPath The path of the destination file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath);
+
+
+    /** 
+     * hdfsGetWorkingDirectory - Get the current working directory for
+     * the given filesystem.
+     * @param fs The configured filesystem handle.
+     * @param buffer The user-buffer to copy path of cwd into. 
+     * @param bufferSize The length of user-buffer.
+     * @return Returns buffer, NULL on error.
+     */
+    char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize);
+
+
+    /** 
+     * hdfsSetWorkingDirectory - Set the working directory. All relative
+     * paths will be resolved relative to it.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the new 'cwd'. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetWorkingDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsCreateDirectory - Make the given file and all non-existent
+     * parents into directories.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsCreateDirectory(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsSetReplication - Set the replication of the specified
+     * file to the supplied value
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns 0 on success, -1 on error. 
+     */
+    int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication);
+
+
+    /** 
+     * hdfsFileInfo - Information about a file/directory.
+     */
+    typedef struct  {
+        tObjectKind mKind;   /* file or directory */
+        char *mName;         /* the name of the file */
+        tTime mLastMod;      /* the last modification time for the file in seconds */
+        tOffset mSize;       /* the size of the file in bytes */
+        short mReplication;    /* the count of replicas */
+        tOffset mBlockSize;  /* the block size for the file */
+        char *mOwner;        /* the owner of the file */
+        char *mGroup;        /* the group associated with the file */
+        short mPermissions;  /* the permissions associated with the file */
+        tTime mLastAccess;    /* the last access time for the file in seconds */
+    } hdfsFileInfo;
+
+
+    /** 
+     * hdfsListDirectory - Get list of files/directories for a given
+     * directory-path. hdfsFreeFileInfo should be called to deallocate memory if
+     * the function returns non-NULL value.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the directory. 
+     * @param numEntries Set to the number of files/directories in path.
+     * @return Returns a dynamically-allocated array of hdfsFileInfo
+     * objects; NULL if empty or on error.
+     * on error, numEntries will be -1.
+     */
+    hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path,
+                                    int *numEntries);
+
+
+    /** 
+     * hdfsGetPathInfo - Get information about a path as a (dynamically
+     * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be
+     * called when the pointer is no longer needed.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @return Returns a dynamically-allocated hdfsFileInfo object;
+     * NULL on error.
+     */
+    hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
+
+
+    /** 
+     * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields) 
+     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+     * objects.
+     * @param numEntries The size of the array.
+     */
+    void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+
+
+    /** 
+     * hdfsGetHosts - Get hostnames where a particular block (determined by
+     * pos & blocksize) of a file is stored. The last element in the array
+     * is NULL. Due to replication, a single block could be present on
+     * multiple hosts.
+     * @param fs The configured filesystem handle.
+     * @param path The path of the file. 
+     * @param start The start of the block.
+     * @param length The length of the block.
+     * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+     * NULL on error.
+     */
+    char*** hdfsGetHosts(hdfsFS fs, const char* path, 
+            tOffset start, tOffset length);
+
+
+    /** 
+     * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
+     * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+     * objects.
+     * @param numEntries The size of the array.
+     */
+    void hdfsFreeHosts(char ***blockHosts);
+
+
+    /** 
+     * hdfsGetDefaultBlockSize - Get the optimum blocksize.
+     * @param fs The configured filesystem handle.
+     * @return Returns the blocksize; -1 on error. 
+     */
+    tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetCapacity - Return the raw capacity of the filesystem.  
+     * @param fs The configured filesystem handle.
+     * @return Returns the raw-capacity; -1 on error. 
+     */
+    tOffset hdfsGetCapacity(hdfsFS fs);
+
+
+    /** 
+     * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+     * @param fs The configured filesystem handle.
+     * @return Returns the total-size; -1 on error. 
+     */
+    tOffset hdfsGetUsed(hdfsFS fs);
+
+    /** 
+     * hdfsChown 
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param owner this is a string in Hadoop land. Set to null or "" if only setting group
+     * @param group  this is a string in Hadoop land. Set to null or "" if only setting user
+     * @return 0 on success else -1
+     */
+    int hdfsChown(hdfsFS fs, const char* path, const char *owner, const char *group);
+
+    /** 
+     * hdfsChmod
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mode the bitmask to set it to
+     * @return 0 on success else -1
+     */
+      int hdfsChmod(hdfsFS fs, const char* path, short mode);
+
+    /** 
+     * hdfsUtime
+     * @param fs The configured filesystem handle.
+     * @param path the path to the file or directory
+     * @param mtime new modification time or 0 for only set access time in seconds
+     * @param atime new access time or 0 for only set modification time in seconds
+     * @return 0 on success else -1
+     */
+    int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
+    
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIBHDFS_HDFS_H*/
+
+/**
+ * vim: ts=4: sw=4: et
+ */
--- a/hdfs/libhdfs.a
+++ b/hdfs/libhdfs.a
--- a/helpers/memenv/memenv.cc
+++ b/helpers/memenv/memenv.cc
@ -0,0 +1,386 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "helpers/memenv/memenv.h"
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "port/port.h"
+#include "util/mutexlock.h"
+#include <map>
+#include <string.h>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+namespace {
+
+class FileState {
+ public:
+  // FileStates are reference counted. The initial reference count is zero
+  // and the caller must call Ref() at least once.
+  FileState() : refs_(0), size_(0) {}
+
+  // Increase the reference count.
+  void Ref() {
+    MutexLock lock(&refs_mutex_);
+    ++refs_;
+  }
+
+  // Decrease the reference count. Delete if this is the last reference.
+  void Unref() {
+    bool do_delete = false;
+
+    {
+      MutexLock lock(&refs_mutex_);
+      --refs_;
+      assert(refs_ >= 0);
+      if (refs_ <= 0) {
+        do_delete = true;
+      }
+    }
+
+    if (do_delete) {
+      delete this;
+    }
+  }
+
+  uint64_t Size() const { return size_; }
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const {
+    if (offset > size_) {
+      return Status::IOError("Offset greater than file size.");
+    }
+    const uint64_t available = size_ - offset;
+    if (n > available) {
+      n = available;
+    }
+    if (n == 0) {
+      *result = Slice();
+      return Status::OK();
+    }
+
+    size_t block = offset / kBlockSize;
+    size_t block_offset = offset % kBlockSize;
+
+    if (n <= kBlockSize - block_offset) {
+      // The requested bytes are all in the first block.
+      *result = Slice(blocks_[block] + block_offset, n);
+      return Status::OK();
+    }
+
+    size_t bytes_to_copy = n;
+    char* dst = scratch;
+
+    while (bytes_to_copy > 0) {
+      size_t avail = kBlockSize - block_offset;
+      if (avail > bytes_to_copy) {
+        avail = bytes_to_copy;
+      }
+      memcpy(dst, blocks_[block] + block_offset, avail);
+
+      bytes_to_copy -= avail;
+      dst += avail;
+      block++;
+      block_offset = 0;
+    }
+
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+  Status Append(const Slice& data) {
+    const char* src = data.data();
+    size_t src_len = data.size();
+
+    while (src_len > 0) {
+      size_t avail;
+      size_t offset = size_ % kBlockSize;
+
+      if (offset != 0) {
+        // There is some room in the last block.
+        avail = kBlockSize - offset;
+      } else {
+        // No room in the last block; push new one.
+        blocks_.push_back(new char[kBlockSize]);
+        avail = kBlockSize;
+      }
+
+      if (avail > src_len) {
+        avail = src_len;
+      }
+      memcpy(blocks_.back() + offset, src, avail);
+      src_len -= avail;
+      src += avail;
+      size_ += avail;
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // Private since only Unref() should be used to delete it.
+  ~FileState() {
+    for (std::vector<char*>::iterator i = blocks_.begin(); i != blocks_.end();
+         ++i) {
+      delete [] *i;
+    }
+  }
+
+  // No copying allowed.
+  FileState(const FileState&);
+  void operator=(const FileState&);
+
+  port::Mutex refs_mutex_;
+  int refs_;  // Protected by refs_mutex_;
+
+  // The following fields are not protected by any mutex. They are only mutable
+  // while the file is being written, and concurrent access is not allowed
+  // to writable files.
+  std::vector<char*> blocks_;
+  uint64_t size_;
+
+  enum { kBlockSize = 8 * 1024 };
+};
+
+class SequentialFileImpl : public SequentialFile {
+ public:
+  explicit SequentialFileImpl(FileState* file) : file_(file), pos_(0) {
+    file_->Ref();
+  }
+
+  ~SequentialFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    Status s = file_->Read(pos_, n, result, scratch);
+    if (s.ok()) {
+      pos_ += result->size();
+    }
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) {
+    if (pos_ > file_->Size()) {
+      return Status::IOError("pos_ > file_->Size()");
+    }
+    const size_t available = file_->Size() - pos_;
+    if (n > available) {
+      n = available;
+    }
+    pos_ += n;
+    return Status::OK();
+  }
+
+ private:
+  FileState* file_;
+  size_t pos_;
+};
+
+class RandomAccessFileImpl : public RandomAccessFile {
+ public:
+  explicit RandomAccessFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~RandomAccessFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    return file_->Read(offset, n, result, scratch);
+  }
+
+ private:
+  FileState* file_;
+};
+
+class WritableFileImpl : public WritableFile {
+ public:
+  WritableFileImpl(FileState* file) : file_(file) {
+    file_->Ref();
+  }
+
+  ~WritableFileImpl() {
+    file_->Unref();
+  }
+
+  virtual Status Append(const Slice& data) {
+    return file_->Append(data);
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  FileState* file_;
+};
+
+class InMemoryEnv : public EnvWrapper {
+ public:
+  explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
+
+  virtual ~InMemoryEnv() {
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      i->second->Unref();
+    }
+  }
+
+  // Partial implementation of the Env interface.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new SequentialFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      *result = NULL;
+      return Status::IOError(fname, "File not found");
+    }
+
+    result->reset(new RandomAccessFileImpl(file_map_[fname]));
+    return Status::OK();
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& soptions) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) != file_map_.end()) {
+      DeleteFileInternal(fname);
+    }
+
+    FileState* file = new FileState();
+    file->Ref();
+    file_map_[fname] = file;
+
+    result->reset(new WritableFileImpl(file));
+    return Status::OK();
+  }
+
+  virtual bool FileExists(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    return file_map_.find(fname) != file_map_.end();
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) {
+    MutexLock lock(&mutex_);
+    result->clear();
+
+    for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i){
+      const std::string& filename = i->first;
+
+      if (filename.size() >= dir.size() + 1 && filename[dir.size()] == '/' &&
+          Slice(filename).starts_with(Slice(dir))) {
+        result->push_back(filename.substr(dir.size() + 1));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  void DeleteFileInternal(const std::string& fname) {
+    if (file_map_.find(fname) == file_map_.end()) {
+      return;
+    }
+
+    file_map_[fname]->Unref();
+    file_map_.erase(fname);
+  }
+
+  virtual Status DeleteFile(const std::string& fname) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    DeleteFileInternal(fname);
+    return Status::OK();
+  }
+
+  virtual Status CreateDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status DeleteDir(const std::string& dirname) {
+    return Status::OK();
+  }
+
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(fname) == file_map_.end()) {
+      return Status::IOError(fname, "File not found");
+    }
+
+    *file_size = file_map_[fname]->Size();
+    return Status::OK();
+  }
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* time) {
+    return Status::NotSupported("getFileMTime", "Not supported in MemEnv");
+  }
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) {
+    MutexLock lock(&mutex_);
+    if (file_map_.find(src) == file_map_.end()) {
+      return Status::IOError(src, "File not found");
+    }
+
+    DeleteFileInternal(target);
+    file_map_[target] = file_map_[src];
+    file_map_.erase(src);
+    return Status::OK();
+  }
+
+  virtual Status LockFile(const std::string& fname, FileLock** lock) {
+    *lock = new FileLock;
+    return Status::OK();
+  }
+
+  virtual Status UnlockFile(FileLock* lock) {
+    delete lock;
+    return Status::OK();
+  }
+
+  virtual Status GetTestDirectory(std::string* path) {
+    *path = "/test";
+    return Status::OK();
+  }
+
+ private:
+  // Map from filenames to FileState objects, representing a simple file system.
+  typedef std::map<std::string, FileState*> FileSystem;
+  port::Mutex mutex_;
+  FileSystem file_map_;  // Protected by mutex_.
+};
+
+}  // namespace
+
+Env* NewMemEnv(Env* base_env) {
+  return new InMemoryEnv(base_env);
+}
+
+}  // namespace rocksdb
--- a/helpers/memenv/memenv.h
+++ b/helpers/memenv/memenv.h
@ -0,0 +1,19 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+#define STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
+namespace rocksdb {
+
+class Env;
+
+// Returns a new environment that stores its data in memory and delegates
+// all non-file-storage tasks to base_env. The caller must delete the result
+// when it is no longer needed.
+// *base_env must remain live while the result is in use.
+Env* NewMemEnv(Env* base_env);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_HELPERS_MEMENV_MEMENV_H_
--- a/helpers/memenv/memenv_test.cc
+++ b/helpers/memenv/memenv_test.cc
@ -0,0 +1,233 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "helpers/memenv/memenv.h"
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "util/testharness.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rocksdb {
+
+class MemEnvTest {
+ public:
+  Env* env_;
+  const EnvOptions soptions_;
+
+  MemEnvTest()
+      : env_(NewMemEnv(Env::Default())) {
+  }
+  ~MemEnvTest() {
+    delete env_;
+  }
+};
+
+TEST(MemEnvTest, Basics) {
+  uint64_t file_size;
+  unique_ptr<WritableFile> writable_file;
+  std::vector<std::string> children;
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  // Check that the directory is empty.
+  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+
+  // Create a file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  writable_file.reset();
+
+  // Check that the file exists.
+  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(0U, file_size);
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(1U, children.size());
+  ASSERT_EQ("f", children[0]);
+
+  // Write to the file.
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("abc"));
+  writable_file.reset();
+
+  // Check for expected size.
+  ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that renaming works.
+  ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
+  ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/f"));
+  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
+  ASSERT_EQ(3U, file_size);
+
+  // Check that opening non-existent file fails.
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file,
+                                       soptions_).ok());
+  ASSERT_TRUE(!seq_file);
+  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
+                                         soptions_).ok());
+  ASSERT_TRUE(!rand_file);
+
+  // Check that deleting works.
+  ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
+  ASSERT_OK(env_->DeleteFile("/dir/g"));
+  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_OK(env_->GetChildren("/dir", &children));
+  ASSERT_EQ(0U, children.size());
+  ASSERT_OK(env_->DeleteDir("/dir"));
+}
+
+TEST(MemEnvTest, ReadWrite) {
+  unique_ptr<WritableFile> writable_file;
+  unique_ptr<SequentialFile> seq_file;
+  unique_ptr<RandomAccessFile> rand_file;
+  Slice result;
+  char scratch[100];
+
+  ASSERT_OK(env_->CreateDir("/dir"));
+
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("hello "));
+  ASSERT_OK(writable_file->Append("world"));
+  writable_file.reset();
+
+  // Read sequentially.
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(seq_file->Skip(1));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF.
+  ASSERT_EQ(0U, result.size());
+  ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file.
+  ASSERT_OK(seq_file->Read(1000, &result, scratch));
+  ASSERT_EQ(0U, result.size());
+
+  // Random reads.
+  ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_));
+  ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world".
+  ASSERT_EQ(0, result.compare("world"));
+  ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello".
+  ASSERT_EQ(0, result.compare("hello"));
+  ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d".
+  ASSERT_EQ(0, result.compare("d"));
+
+  // Too high offset.
+  ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok());
+}
+
+TEST(MemEnvTest, Locks) {
+  FileLock* lock;
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(env_->LockFile("some file", &lock));
+  ASSERT_OK(env_->UnlockFile(lock));
+}
+
+TEST(MemEnvTest, Misc) {
+  std::string test_dir;
+  ASSERT_OK(env_->GetTestDirectory(&test_dir));
+  ASSERT_TRUE(!test_dir.empty());
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_));
+
+  // These are no-ops, but we test they return success.
+  ASSERT_OK(writable_file->Sync());
+  ASSERT_OK(writable_file->Flush());
+  ASSERT_OK(writable_file->Close());
+  writable_file.reset();
+}
+
+TEST(MemEnvTest, LargeWrite) {
+  const size_t kWriteSize = 300 * 1024;
+  char* scratch = new char[kWriteSize * 2];
+
+  std::string write_data;
+  for (size_t i = 0; i < kWriteSize; ++i) {
+    write_data.append(1, static_cast<char>(i));
+  }
+
+  unique_ptr<WritableFile> writable_file;
+  ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_));
+  ASSERT_OK(writable_file->Append("foo"));
+  ASSERT_OK(writable_file->Append(write_data));
+  writable_file.reset();
+
+  unique_ptr<SequentialFile> seq_file;
+  Slice result;
+  ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_));
+  ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo".
+  ASSERT_EQ(0, result.compare("foo"));
+
+  size_t read = 0;
+  std::string read_data;
+  while (read < kWriteSize) {
+    ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch));
+    read_data.append(result.data(), result.size());
+    read += result.size();
+  }
+  ASSERT_TRUE(write_data == read_data);
+  delete [] scratch;
+}
+
+TEST(MemEnvTest, DBTest) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = env_;
+  DB* db;
+
+  const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+  const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+  ASSERT_OK(DB::Open(options, "/dir/db", &db));
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  Iterator* iterator = db->NewIterator(ReadOptions());
+  iterator->SeekToFirst();
+  for (size_t i = 0; i < 3; ++i) {
+    ASSERT_TRUE(iterator->Valid());
+    ASSERT_TRUE(keys[i] == iterator->key());
+    ASSERT_TRUE(vals[i] == iterator->value());
+    iterator->Next();
+  }
+  ASSERT_TRUE(!iterator->Valid());
+  delete iterator;
+
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
+  ASSERT_OK(dbi->TEST_FlushMemTable());
+
+  for (size_t i = 0; i < 3; ++i) {
+    std::string res;
+    ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+    ASSERT_TRUE(res == vals[i]);
+  }
+
+  delete db;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  return rocksdb::test::RunAllTests();
+}
--- a/include/rocksdb/arena.h
+++ b/include/rocksdb/arena.h
@ -0,0 +1,41 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Arena class defines memory allocation methods. It's used by memtable and
+// skiplist.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+#define STORAGE_ROCKSDB_INCLUDE_ARENA_H_
+
+#include <limits>
+#include <memory>
+
+namespace rocksdb {
+
+class Arena {
+ public:
+  Arena() {};
+  virtual ~Arena() {};
+
+  // Return a pointer to a newly allocated memory block of "bytes" bytes.
+  virtual char* Allocate(size_t bytes) = 0;
+
+  // Allocate memory with the normal alignment guarantees provided by malloc.
+  virtual char* AllocateAligned(size_t bytes) = 0;
+
+  // Returns an estimate of the total memory used by arena.
+  virtual const size_t ApproximateMemoryUsage() = 0;
+
+  // Returns the total number of bytes in all blocks allocated so far.
+  virtual const size_t MemoryAllocatedBytes() = 0;
+
+ private:
+  // No copying allowed
+  Arena(const Arena&);
+  void operator=(const Arena&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ARENA_H_
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@ -0,0 +1,281 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+  Use of this source code is governed by a BSD-style license that can be
+  found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+  C bindings for leveldb.  May be useful as a stable ABI that can be
+  used by programs that keep leveldb in a shared library, or for
+  a JNI api.
+
+  Does not support:
+  . getters for the option types
+  . custom comparators that implement key shortening
+  . capturing post-write-snapshot
+  . custom iter, db, env, cache implementations using just the C bindings
+
+  Some conventions:
+
+  (1) We expose just opaque struct pointers and functions to clients.
+  This allows us to change internal representations without having to
+  recompile clients.
+
+  (2) For simplicity, there is no equivalent to the Slice type.  Instead,
+  the caller has to pass the pointer and length as separate
+  arguments.
+
+  (3) Errors are represented by a null-terminated c string.  NULL
+  means no error.  All operations that can raise an error are passed
+  a "char** errptr" as the last argument.  One of the following must
+  be true on entry:
+     *errptr == NULL
+     *errptr points to a malloc()ed null-terminated error message
+  On success, a leveldb routine leaves *errptr unchanged.
+  On failure, leveldb frees the old value of *errptr and
+  set *errptr to a malloc()ed error message.
+
+  (4) Bools have the type unsigned char (0 == false; rest == true)
+
+  (5) All of the pointer arguments must be non-NULL.
+*/
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
+#define STORAGE_ROCKSDB_INCLUDE_C_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Exported types */
+
+typedef struct leveldb_t               leveldb_t;
+typedef struct leveldb_cache_t         leveldb_cache_t;
+typedef struct leveldb_comparator_t    leveldb_comparator_t;
+typedef struct leveldb_env_t           leveldb_env_t;
+typedef struct leveldb_filelock_t      leveldb_filelock_t;
+typedef struct leveldb_filterpolicy_t  leveldb_filterpolicy_t;
+typedef struct leveldb_iterator_t      leveldb_iterator_t;
+typedef struct leveldb_logger_t        leveldb_logger_t;
+typedef struct leveldb_options_t       leveldb_options_t;
+typedef struct leveldb_randomfile_t    leveldb_randomfile_t;
+typedef struct leveldb_readoptions_t   leveldb_readoptions_t;
+typedef struct leveldb_seqfile_t       leveldb_seqfile_t;
+typedef struct leveldb_snapshot_t      leveldb_snapshot_t;
+typedef struct leveldb_writablefile_t  leveldb_writablefile_t;
+typedef struct leveldb_writebatch_t    leveldb_writebatch_t;
+typedef struct leveldb_writeoptions_t  leveldb_writeoptions_t;
+
+/* DB operations */
+
+extern leveldb_t* leveldb_open(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void leveldb_close(leveldb_t* db);
+
+extern void leveldb_put(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr);
+
+extern void leveldb_delete(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr);
+
+extern void leveldb_write(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    leveldb_writebatch_t* batch,
+    char** errptr);
+
+/* Returns NULL if not found.  A malloc()ed array otherwise.
+   Stores the length of the array in *vallen. */
+extern char* leveldb_get(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr);
+
+extern leveldb_iterator_t* leveldb_create_iterator(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options);
+
+extern const leveldb_snapshot_t* leveldb_create_snapshot(
+    leveldb_t* db);
+
+extern void leveldb_release_snapshot(
+    leveldb_t* db,
+    const leveldb_snapshot_t* snapshot);
+
+/* Returns NULL if property name is unknown.
+   Else returns a pointer to a malloc()-ed null-terminated value. */
+extern char* leveldb_property_value(
+    leveldb_t* db,
+    const char* propname);
+
+extern void leveldb_approximate_sizes(
+    leveldb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes);
+
+extern void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+/* Management operations */
+
+extern void leveldb_destroy_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+extern void leveldb_repair_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr);
+
+/* Iterator */
+
+extern void leveldb_iter_destroy(leveldb_iterator_t*);
+extern unsigned char leveldb_iter_valid(const leveldb_iterator_t*);
+extern void leveldb_iter_seek_to_first(leveldb_iterator_t*);
+extern void leveldb_iter_seek_to_last(leveldb_iterator_t*);
+extern void leveldb_iter_seek(leveldb_iterator_t*, const char* k, size_t klen);
+extern void leveldb_iter_next(leveldb_iterator_t*);
+extern void leveldb_iter_prev(leveldb_iterator_t*);
+extern const char* leveldb_iter_key(const leveldb_iterator_t*, size_t* klen);
+extern const char* leveldb_iter_value(const leveldb_iterator_t*, size_t* vlen);
+extern void leveldb_iter_get_error(const leveldb_iterator_t*, char** errptr);
+
+/* Write batch */
+
+extern leveldb_writebatch_t* leveldb_writebatch_create();
+extern void leveldb_writebatch_destroy(leveldb_writebatch_t*);
+extern void leveldb_writebatch_clear(leveldb_writebatch_t*);
+extern void leveldb_writebatch_put(
+    leveldb_writebatch_t*,
+    const char* key, size_t klen,
+    const char* val, size_t vlen);
+extern void leveldb_writebatch_delete(
+    leveldb_writebatch_t*,
+    const char* key, size_t klen);
+extern void leveldb_writebatch_iterate(
+    leveldb_writebatch_t*,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen));
+
+/* Options */
+
+extern leveldb_options_t* leveldb_options_create();
+extern void leveldb_options_destroy(leveldb_options_t*);
+extern void leveldb_options_set_comparator(
+    leveldb_options_t*,
+    leveldb_comparator_t*);
+extern void leveldb_options_set_compression_per_level(
+  leveldb_options_t* opt,
+  int* level_values,
+  size_t num_levels);
+extern void leveldb_options_set_filter_policy(
+    leveldb_options_t*,
+    leveldb_filterpolicy_t*);
+extern void leveldb_options_set_create_if_missing(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_error_if_exists(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_paranoid_checks(
+    leveldb_options_t*, unsigned char);
+extern void leveldb_options_set_env(leveldb_options_t*, leveldb_env_t*);
+extern void leveldb_options_set_info_log(leveldb_options_t*, leveldb_logger_t*);
+extern void leveldb_options_set_write_buffer_size(leveldb_options_t*, size_t);
+extern void leveldb_options_set_max_open_files(leveldb_options_t*, int);
+extern void leveldb_options_set_cache(leveldb_options_t*, leveldb_cache_t*);
+extern void leveldb_options_set_block_size(leveldb_options_t*, size_t);
+extern void leveldb_options_set_block_restart_interval(leveldb_options_t*, int);
+extern void leveldb_options_set_compression_options(
+    leveldb_options_t* opt, int w_bits, int level, int strategy);
+
+enum {
+  leveldb_no_compression = 0,
+  leveldb_snappy_compression = 1
+};
+extern void leveldb_options_set_compression(leveldb_options_t*, int);
+
+/* Comparator */
+
+extern leveldb_comparator_t* leveldb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*));
+extern void leveldb_comparator_destroy(leveldb_comparator_t*);
+
+/* Filter policy */
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*));
+extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
+
+extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
+    int bits_per_key);
+
+/* Read options */
+
+extern leveldb_readoptions_t* leveldb_readoptions_create();
+extern void leveldb_readoptions_destroy(leveldb_readoptions_t*);
+extern void leveldb_readoptions_set_verify_checksums(
+    leveldb_readoptions_t*,
+    unsigned char);
+extern void leveldb_readoptions_set_fill_cache(
+    leveldb_readoptions_t*, unsigned char);
+extern void leveldb_readoptions_set_snapshot(
+    leveldb_readoptions_t*,
+    const leveldb_snapshot_t*);
+
+/* Write options */
+
+extern leveldb_writeoptions_t* leveldb_writeoptions_create();
+extern void leveldb_writeoptions_destroy(leveldb_writeoptions_t*);
+extern void leveldb_writeoptions_set_sync(
+    leveldb_writeoptions_t*, unsigned char);
+
+/* Cache */
+
+extern leveldb_cache_t* leveldb_cache_create_lru(size_t capacity);
+extern void leveldb_cache_destroy(leveldb_cache_t* cache);
+
+/* Env */
+
+extern leveldb_env_t* leveldb_create_default_env();
+extern void leveldb_env_destroy(leveldb_env_t*);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@ -0,0 +1,118 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values.  It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads.  It may automatically evict entries to make room
+// for new entries.  Values have a specified charge against the cache
+// capacity.  For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided.  Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+#define STORAGE_ROCKSDB_INCLUDE_CACHE_H_
+
+#include <memory>
+#include <stdint.h>
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+using std::shared_ptr;
+
+class Cache;
+
+// Create a new cache with a fixed size capacity. The cache is sharded
+// to 2^numShardBits shards, by hash of the key. The total capacity
+// is divided and evenly assigned to each shard. Inside each shard,
+// the eviction is done in two passes: first try to free spaces by
+// evicting entries that are among the most least used removeScanCountLimit
+// entries and do not have reference other than by the cache itself, in
+// the least-used order. If not enough space is freed, further free the
+// entries in least used order.
+//
+// The functions without parameter numShardBits and/or removeScanCountLimit
+// use default values. removeScanCountLimit's default value is 0, which
+// means a strict LRU order inside each shard.
+extern shared_ptr<Cache> NewLRUCache(size_t capacity);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits);
+extern shared_ptr<Cache> NewLRUCache(size_t capacity, int numShardBits,
+                                     int removeScanCountLimit);
+
+class Cache {
+ public:
+  Cache() { }
+
+  // Destroys all existing entries by calling the "deleter"
+  // function that was passed to the constructor.
+  virtual ~Cache();
+
+  // Opaque handle to an entry stored in the cache.
+  struct Handle { };
+
+  // Insert a mapping from key->value into the cache and assign it
+  // the specified charge against the total cache capacity.
+  //
+  // Returns a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  //
+  // When the inserted entry is no longer needed, the key and
+  // value will be passed to "deleter".
+  virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+                         void (*deleter)(const Slice& key, void* value)) = 0;
+
+  // If the cache has no mapping for "key", returns nullptr.
+  //
+  // Else return a handle that corresponds to the mapping.  The caller
+  // must call this->Release(handle) when the returned mapping is no
+  // longer needed.
+  virtual Handle* Lookup(const Slice& key) = 0;
+
+  // Release a mapping returned by a previous Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void Release(Handle* handle) = 0;
+
+  // Return the value encapsulated in a handle returned by a
+  // successful Lookup().
+  // REQUIRES: handle must not have been released yet.
+  // REQUIRES: handle must have been returned by a method on *this.
+  virtual void* Value(Handle* handle) = 0;
+
+  // If the cache contains entry for key, erase it.  Note that the
+  // underlying entry will be kept around until all existing handles
+  // to it have been released.
+  virtual void Erase(const Slice& key) = 0;
+
+  // Return a new numeric id.  May be used by multiple clients who are
+  // sharing the same cache to partition the key space.  Typically the
+  // client will allocate a new id at startup and prepend the id to
+  // its cache keys.
+  virtual uint64_t NewId() = 0;
+
+  // returns the maximum configured capacity of the cache
+  virtual size_t GetCapacity() = 0;
+
+ private:
+  void LRU_Remove(Handle* e);
+  void LRU_Append(Handle* e);
+  void Unref(Handle* e);
+
+  struct Rep;
+  Rep* rep_;
+
+  // No copying allowed
+  Cache(const Cache&);
+  void operator=(const Cache&);
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_UTIL_CACHE_H_
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@ -0,0 +1,79 @@
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// CompactionFilter allows an application to modify/delete a key-value at
+// the time of compaction.
+
+class CompactionFilter {
+ public:
+
+  // Context information of a compaction run
+  struct Context {
+    // Does this compaction run include all data files
+    bool is_full_compaction;
+  };
+
+  virtual ~CompactionFilter() {}
+
+  // The compaction process invokes this
+  // method for kv that is being compacted. A return value
+  // of false indicates that the kv should be preserved in the
+  // output of this compaction run and a return value of true
+  // indicates that this key-value should be removed from the
+  // output of the compaction.  The application can inspect
+  // the existing value of the key and make decision based on it.
+  //
+  // When the value is to be preserved, the application has the option
+  // to modify the existing_value and pass it back through new_value.
+  // value_changed needs to be set to true in this case.
+  virtual bool Filter(int level,
+                      const Slice& key,
+                      const Slice& existing_value,
+                      std::string* new_value,
+                      bool* value_changed) const = 0;
+
+  // Returns a name that identifies this compaction filter.
+  // The name will be printed to LOG file on start up for diagnosis.
+  virtual const char* Name() const = 0;
+};
+
+// Each compaction will create a new CompactionFilter allowing the
+// application to know about different campactions
+class CompactionFilterFactory {
+ public:
+  virtual ~CompactionFilterFactory() { };
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+    const CompactionFilter::Context& context) = 0;
+
+  // Returns a name that identifies this compaction filter factory.
+  virtual const char* Name() const = 0;
+};
+
+// Default implementaion of CompactionFilterFactory which does not
+// return any filter
+class DefaultCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter>
+  CreateCompactionFilter(const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(nullptr);
+  }
+
+  virtual const char* Name() const override {
+    return "DefaultCompactionFilterFactory";
+  }
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
--- a/include/rocksdb/comparator.h
+++ b/include/rocksdb/comparator.h
@ -0,0 +1,63 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.  A Comparator implementation
+// must be thread-safe since rocksdb may invoke its methods concurrently
+// from multiple threads.
+class Comparator {
+ public:
+  virtual ~Comparator();
+
+  // Three-way comparison.  Returns value:
+  //   < 0 iff "a" < "b",
+  //   == 0 iff "a" == "b",
+  //   > 0 iff "a" > "b"
+  virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+  // The name of the comparator.  Used to check for comparator
+  // mismatches (i.e., a DB created with one comparator is
+  // accessed using a different comparator.
+  //
+  // The client of this package should switch to a new name whenever
+  // the comparator implementation changes in a way that will cause
+  // the relative ordering of any two keys to change.
+  //
+  // Names starting with "rocksdb." are reserved and should not be used
+  // by any clients of this package.
+  virtual const char* Name() const = 0;
+
+  // Advanced functions: these are used to reduce the space requirements
+  // for internal data structures like index blocks.
+
+  // If *start < limit, changes *start to a short string in [start,limit).
+  // Simple comparator implementations may return with *start unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const = 0;
+
+  // Changes *key to a short string >= *key.
+  // Simple comparator implementations may return with *key unchanged,
+  // i.e., an implementation of this method that does nothing is correct.
+  virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering.  The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@ -0,0 +1,303 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_
+#define STORAGE_ROCKSDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <memory>
+#include <vector>
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/transaction_log.h"
+
+namespace rocksdb {
+
+using std::unique_ptr;
+
+// Update Makefile if you change these
+static const int kMajorVersion = 2;
+static const int kMinorVersion = 0;
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+struct FlushOptions;
+class WriteBatch;
+
+// Metadata associated with each SST file.
+struct LiveFileMetaData {
+  std::string name;        // Name of the file
+  int level;               // Level at which this file resides.
+  size_t size;             // File size in bytes.
+  std::string smallestkey; // Smallest user defined key in the file.
+  std::string largestkey;  // Largest user defined key in the file.
+  SequenceNumber smallest_seqno; // smallest seqno in file
+  SequenceNumber largest_seqno;  // largest seqno in file
+};
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+class Snapshot {
+ protected:
+  virtual ~Snapshot();
+};
+
+// A range of keys
+struct Range {
+  Slice start;          // Included in the range
+  Slice limit;          // Not included in the range
+
+  Range() { }
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+// A DB is safe for concurrent access from multiple threads without
+// any external synchronization.
+class DB {
+ public:
+  // Open the database with the specified "name".
+  // Stores a pointer to a heap-allocated database in *dbptr and returns
+  // OK on success.
+  // Stores nullptr in *dbptr and returns a non-OK status on error.
+  // Caller should delete *dbptr when it is no longer needed.
+  static Status Open(const Options& options,
+                     const std::string& name,
+                     DB** dbptr);
+
+  // Open the database for read only. All DB interfaces
+  // that modify data, like put/delete, will return error.
+  // If the db is opened in read only mode, then no compactions
+  // will happen.
+  static Status OpenForReadOnly(const Options& options,
+      const std::string& name, DB** dbptr,
+      bool error_if_log_file_exist = false);
+
+  DB() { }
+  virtual ~DB();
+
+  // Set the database entry for "key" to "value".
+  // Returns OK on success, and a non-OK status on error.
+  // Note: consider setting options.sync = true.
+  virtual Status Put(const WriteOptions& options,
+                     const Slice& key,
+                     const Slice& value) = 0;
+
+  // Remove the database entry (if any) for "key".  Returns OK on
+  // success, and a non-OK status on error.  It is not an error if "key"
+  // did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+  // Merge the database entry for "key" with "value".  Returns OK on success,
+  // and a non-OK status on error. The semantics of this operation is
+  // determined by the user provided merge_operator when opening DB.
+  // Note: consider setting options.sync = true.
+  virtual Status Merge(const WriteOptions& options,
+                       const Slice& key,
+                       const Slice& value) = 0;
+
+  // Apply the specified updates to the database.
+  // Returns OK on success, non-OK on failure.
+  // Note: consider setting options.sync = true.
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+  // If the database contains an entry for "key" store the
+  // corresponding value in *value and return OK.
+  //
+  // If there is no entry for "key" leave *value unchanged and return
+  // a status for which Status::IsNotFound() returns true.
+  //
+  // May return some other Status on an error.
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value) = 0;
+
+  // If keys[i] does not exist in the database, then the i'th returned
+  // status will be one for which Status::IsNotFound() is true, and
+  // (*values)[i] will be set to some arbitrary value (often ""). Otherwise,
+  // the i'th returned status will have Status::ok() true, and (*values)[i]
+  // will store the value associated with keys[i].
+  //
+  // (*values) will always be resized to be the same size as (keys).
+  // Similarly, the number of returned statuses will be the number of keys.
+  // Note: keys will not be "de-duplicated". Duplicate keys will return
+  // duplicate values in order.
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) = 0;
+
+  // If the key definitely does not exist in the database, then this method
+  // returns false, else true. If the caller wants to obtain value when the key
+  // is found in memory, a bool for 'value_found' must be passed. 'value_found'
+  // will be true on return if value has been set properly.
+  // This check is potentially lighter-weight than invoking DB::Get(). One way
+  // to make this lighter weight is to avoid doing any IOs.
+  // Default implementation here returns true and sets 'value_found' to false
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true;
+  }
+
+  // Return a heap-allocated iterator over the contents of the database.
+  // The result of NewIterator() is initially invalid (caller must
+  // call one of the Seek methods on the iterator before using it).
+  //
+  // Caller should delete the iterator when it is no longer needed.
+  // The returned iterator should be deleted before this db is deleted.
+  virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+  // Return a handle to the current DB state.  Iterators created with
+  // this handle will all observe a stable snapshot of the current DB
+  // state.  The caller must call ReleaseSnapshot(result) when the
+  // snapshot is no longer needed.
+  virtual const Snapshot* GetSnapshot() = 0;
+
+  // Release a previously acquired snapshot.  The caller must not
+  // use "snapshot" after this call.
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+  // DB implementations can export properties about their state
+  // via this method.  If "property" is a valid property understood by this
+  // DB implementation, fills "*value" with its current value and returns
+  // true.  Otherwise returns false.
+  //
+  //
+  // Valid property names include:
+  //
+  //  "rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+  //     where <N> is an ASCII representation of a level number (e.g. "0").
+  //  "rocksdb.stats" - returns a multi-line string that describes statistics
+  //     about the internal operation of the DB.
+  //  "rocksdb.sstables" - returns a multi-line string that describes all
+  //     of the sstables that make up the db contents.
+  virtual bool GetProperty(const Slice& property, std::string* value) = 0;
+
+  // For each i in [0,n-1], store in "sizes[i]", the approximate
+  // file system space used by keys in "[range[i].start .. range[i].limit)".
+  //
+  // Note that the returned sizes measure file system space usage, so
+  // if the user data compresses by a factor of ten, the returned
+  // sizes will be one-tenth the size of the corresponding user data size.
+  //
+  // The results may not include the sizes of recently written data.
+  virtual void GetApproximateSizes(const Range* range, int n,
+                                   uint64_t* sizes) = 0;
+
+  // Compact the underlying storage for the key range [*begin,*end].
+  // In particular, deleted and overwritten versions are discarded,
+  // and the data is rearranged to reduce the cost of operations
+  // needed to access the data.  This operation should typically only
+  // be invoked by users who understand the underlying implementation.
+  //
+  // begin==nullptr is treated as a key before all keys in the database.
+  // end==nullptr is treated as a key after all keys in the database.
+  // Therefore the following call will compact the entire database:
+  //    db->CompactRange(nullptr, nullptr);
+  // Note that after the entire database is compacted, all data are pushed
+  // down to the last level containing any data. If the total data size
+  // after compaction is reduced, that level might not be appropriate for
+  // hosting all the files. In this case, client could set reduce_level
+  // to true, to move the files back to the minimum level capable of holding
+  // the data set or a given level (specified by non-negative target_level).
+  virtual void CompactRange(const Slice* begin, const Slice* end,
+                            bool reduce_level = false,
+                            int target_level = -1) = 0;
+
+  // Number of levels used for this DB.
+  virtual int NumberLevels() = 0;
+
+  // Maximum level to which a new compacted memtable is pushed if it
+  // does not create overlap.
+  virtual int MaxMemCompactionLevel() = 0;
+
+  // Number of files in level-0 that would stop writes.
+  virtual int Level0StopWriteTrigger() = 0;
+
+  // Flush all mem-table data.
+  virtual Status Flush(const FlushOptions& options) = 0;
+
+  // Prevent file deletions. Compactions will continue to occur,
+  // but no obsolete files will be deleted. Calling this multiple
+  // times have the same effect as calling it once.
+  virtual Status DisableFileDeletions() = 0;
+
+  // Allow compactions to delete obselete files.
+  virtual Status EnableFileDeletions() = 0;
+
+  // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup
+
+  // THIS METHOD IS DEPRECATED. Use the GetTableMetaData to get more
+  // detailed information on the live files.
+  // Retrieve the list of all files in the database. The files are
+  // relative to the dbname and are not absolute paths. The valid size of the
+  // manifest file is returned in manifest_file_size. The manifest file is an
+  // ever growing file, but only the portion specified by manifest_file_size is
+  // valid for this snapshot.
+  // Setting flush_memtable to true does Flush before recording the live files.
+  // Setting flush_memtable to false is useful when we don't want to wait for
+  // flush which may have to wait for compaction to complete taking an
+  // indeterminate time. But this will have to use GetSortedWalFiles after
+  // GetLiveFiles to compensate for memtables missed in this snapshot due to the
+  // absence of Flush, by WAL files to recover the database consistently later
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) = 0;
+
+  // Retrieve the sorted list of all wal files with earliest file first
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0;
+
+  // The sequence number of the most recent transaction.
+  virtual SequenceNumber GetLatestSequenceNumber() const = 0;
+
+  // Sets iter to an iterator that is positioned at a write-batch containing
+  // seq_number. If the sequence number is non existent, it returns an iterator
+  // at the first available seq_no after the requested seq_no
+  // Returns Status::Ok if iterator is valid
+  // Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to
+  // use this api, else the WAL files will get
+  // cleared aggressively and the iterator might keep getting invalid before
+  // an update is read.
+  virtual Status GetUpdatesSince(SequenceNumber seq_number,
+                                 unique_ptr<TransactionLogIterator>* iter) = 0;
+
+  // Delete the file name from the db directory and update the internal state to
+  // reflect that. Supports deletion of sst and log files only. 'name' must be
+  // path relative to the db directory. eg. 000001.sst, /archive/000003.log
+  virtual Status DeleteFile(std::string name) = 0;
+
+  // Returns a list of all table files with their level, start key
+  // and end key
+  virtual void GetLiveFilesMetaData(
+    std::vector<LiveFileMetaData> *metadata) {
+  }
+
+ private:
+  // No copying allowed
+  DB(const DB&);
+  void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_DB_H_
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@ -0,0 +1,646 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the rocksdb implementation to access
+// operating system functionality like the filesystem etc.  Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine gain control; e.g., to rate limit file system operations.
+//
+// All Env implementations are safe for concurrent access from
+// multiple threads without any external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
+#define STORAGE_ROCKSDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <memory>
+#include <vector>
+#include <stdint.h>
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class FileLock;
+class Logger;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+class RandomRWFile;
+struct Options;
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+
+// Options while opening a file to read/write
+struct EnvOptions {
+
+  // construct with default Options
+  EnvOptions();
+
+  // construct from Options
+  explicit EnvOptions(const Options& options);
+
+  // If true, then allow caching of data in environment buffers
+  bool use_os_buffer;
+
+   // If true, then use mmap to read data
+  bool use_mmap_reads;
+
+   // If true, then use mmap to write data
+  bool use_mmap_writes;
+
+  // If true, set the FD_CLOEXEC on open fd.
+  bool set_fd_cloexec;
+
+  // Allows OS to incrementally sync files to disk while they are being
+  // written, in the background. Issue one request for every bytes_per_sync
+  // written. 0 turns it off.
+  // Default: 0
+  uint64_t bytes_per_sync;
+
+};
+
+class Env {
+ public:
+  Env() { }
+  virtual ~Env();
+
+  // Return a default environment suitable for the current operating
+  // system.  Sophisticated users may wish to provide their own Env
+  // implementation instead of relying on this default environment.
+  //
+  // The result of Default() belongs to rocksdb and must never be deleted.
+  static Env* Default();
+
+  // Create a brand new sequentially-readable file with the specified name.
+  // On success, stores a pointer to the new file in *result and returns OK.
+  // On failure stores nullptr in *result and returns non-OK.  If the file does
+  // not exist, returns a non-OK status.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options)
+                                   = 0;
+
+  // Create a brand new random access read-only file with the
+  // specified name.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.  If the file does not exist, returns a non-OK
+  // status.
+  //
+  // The returned file may be concurrently accessed by multiple threads.
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options)
+                                     = 0;
+
+  // Create an object that writes to a new file with the specified
+  // name.  Deletes any existing file with the same name and creates a
+  // new file.  On success, stores a pointer to the new file in
+  // *result and returns OK.  On failure stores nullptr in *result and
+  // returns non-OK.
+  //
+  // The returned file will only be accessed by one thread at a time.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Create an object that both reads and writes to a file on
+  // specified offsets (random access). If file already exists,
+  // does not overwrite it. On success, stores a pointer to the
+  // new file in *result and returns OK. On failure stores nullptr
+  // in *result and returns non-OK.
+  virtual Status NewRandomRWFile(const std::string& fname,
+                                 unique_ptr<RandomRWFile>* result,
+                                 const EnvOptions& options) = 0;
+
+  // Returns true iff the named file exists.
+  virtual bool FileExists(const std::string& fname) = 0;
+
+  // Store in *result the names of the children of the specified directory.
+  // The names are relative to "dir".
+  // Original contents of *results are dropped.
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) = 0;
+
+  // Delete the named file.
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Create the specified directory. Returns error if directory exists.
+  virtual Status CreateDir(const std::string& dirname) = 0;
+
+  // Creates directory if missing. Return Ok if it exists, or successful in
+  // Creating.
+  virtual Status CreateDirIfMissing(const std::string& dirname) = 0;
+
+  // Delete the specified directory.
+  virtual Status DeleteDir(const std::string& dirname) = 0;
+
+  // Store the size of fname in *file_size.
+  virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+  // Store the last modification time of fname in *file_mtime.
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) = 0;
+  // Rename file src to target.
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) = 0;
+
+  // Lock the specified file.  Used to prevent concurrent access to
+  // the same db by multiple processes.  On failure, stores nullptr in
+  // *lock and returns non-OK.
+  //
+  // On success, stores a pointer to the object that represents the
+  // acquired lock in *lock and returns OK.  The caller should call
+  // UnlockFile(*lock) to release the lock.  If the process exits,
+  // the lock will be automatically released.
+  //
+  // If somebody else already holds the lock, finishes immediately
+  // with a failure.  I.e., this call does not wait for existing locks
+  // to go away.
+  //
+  // May create the named file if it does not already exist.
+  virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+  // Release the lock acquired by a previous successful call to LockFile.
+  // REQUIRES: lock was returned by a successful LockFile() call
+  // REQUIRES: lock has not already been unlocked.
+  virtual Status UnlockFile(FileLock* lock) = 0;
+
+  enum Priority { LOW, HIGH, TOTAL };
+
+  // Arrange to run "(*function)(arg)" once in a background thread, in
+  // the thread pool specified by pri. By default, jobs go to the 'LOW'
+  // priority thread pool.
+
+  // "function" may run in an unspecified thread.  Multiple functions
+  // added to the same Env may run concurrently in different threads.
+  // I.e., the caller may not assume that background work items are
+  // serialized.
+  virtual void Schedule(
+      void (*function)(void* arg),
+      void* arg,
+      Priority pri = LOW) = 0;
+
+  // Start a new thread, invoking "function(arg)" within the new thread.
+  // When "function(arg)" returns, the thread will be destroyed.
+  virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+  // *path is set to a temporary directory that can be used for testing. It may
+  // or many not have just been created. The directory may or may not differ
+  // between runs of the same process, but subsequent calls will return the
+  // same directory.
+  virtual Status GetTestDirectory(std::string* path) = 0;
+
+  // Create and return a log file for storing informational messages.
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) = 0;
+
+  // Returns the number of micro-seconds since some fixed point in time. Only
+  // useful for computing deltas of time.
+  virtual uint64_t NowMicros() = 0;
+
+  // Returns the number of nano-seconds since some fixed point in time. Only
+  // useful for computing deltas of time in one run.
+  // Default implementation simply relies on NowMicros
+  virtual uint64_t NowNanos() {
+    return NowMicros() * 1000;
+  }
+
+  // Sleep/delay the thread for the perscribed number of micro-seconds.
+  virtual void SleepForMicroseconds(int micros) = 0;
+
+  // Get the current host name.
+  virtual Status GetHostName(char* name, uint64_t len) = 0;
+
+  // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC).
+  virtual Status GetCurrentTime(int64_t* unix_time) = 0;
+
+  // Get full directory name for this db.
+  virtual Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) = 0;
+
+  // The number of background worker threads of a specific thread pool
+  // for this environment. 'LOW' is the default pool.
+  // default number: 1
+  virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0;
+
+  // Converts seconds-since-Jan-01-1970 to a printable string
+  virtual std::string TimeToString(uint64_t time) = 0;
+
+  // Generates a unique id that can be used to identify a db
+  virtual std::string GenerateUniqueId();
+
+ private:
+  // No copying allowed
+  Env(const Env&);
+  void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+  SequentialFile() { }
+  virtual ~SequentialFile();
+
+  // Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
+  // written by this routine.  Sets "*result" to the data that was
+  // read (including if fewer than "n" bytes were successfully read).
+  // May set "*result" to point at data in "scratch[0..n-1]", so
+  // "scratch[0..n-1]" must be live when "*result" is used.
+  // If an error was encountered, returns a non-OK status.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+
+  // Skip "n" bytes from the file. This is guaranteed to be no
+  // slower that reading the same data, but may be faster.
+  //
+  // If end of file is reached, skipping will stop at the end of the
+  // file, and Skip will return OK.
+  //
+  // REQUIRES: External synchronization
+  virtual Status Skip(uint64_t n) = 0;
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+  RandomAccessFile() { }
+  virtual ~RandomAccessFile();
+
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+
+  // Tries to get an unique ID for this file that will be the same each time
+  // the file is opened (and will stay the same while the file is open).
+  // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
+  // ID can be created this function returns the length of the ID and places it
+  // in "id"; otherwise, this function returns 0, in which case "id"
+  // may not have been modified.
+  //
+  // This function guarantees, for IDs from a given environment, two unique ids
+  // cannot be made equal to eachother by adding arbitrary bytes to one of
+  // them. That is, no unique ID is the prefix of another.
+  //
+  // This function guarantees that the returned ID will not be interpretable as
+  // a single varint.
+  //
+  // Note: these IDs are only valid for the duration of the process.
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+              // compatibility.
+  };
+
+
+  enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
+
+  virtual void Hint(AccessPattern pattern) {}
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+};
+
+// A file abstraction for sequential writing.  The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+  WritableFile() : last_preallocated_block_(0), preallocation_block_size_ (0) {
+  }
+  virtual ~WritableFile();
+
+  virtual Status Append(const Slice& data) = 0;
+  virtual Status Close() = 0;
+  virtual Status Flush() = 0;
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Get the size of valid data in the file.
+   */
+  virtual uint64_t GetFileSize() {
+    return 0;
+  }
+
+  /*
+   * Get and set the default pre-allocation block size for writes to
+   * this file.  If non-zero, then Allocate will be used to extend the
+   * underlying storage of a file (generally via fallocate) if the Env
+   * instance supports it.
+   */
+  void SetPreallocationBlockSize(size_t size) {
+    preallocation_block_size_ = size;
+  }
+
+  virtual void GetPreallocationStatus(size_t* block_size,
+                                      size_t* last_allocated_block) {
+    *last_allocated_block = last_preallocated_block_;
+    *block_size = preallocation_block_size_;
+  }
+
+  // For documentation, refer to RandomAccessFile::GetUniqueId()
+  virtual size_t GetUniqueId(char* id, size_t max_size) const {
+    return 0; // Default implementation to prevent issues with backwards
+  }
+
+  // Remove any kind of caching of data from the offset to offset+length
+  // of this file. If the length is 0, then it refers to the end of file.
+  // If the system is not caching the file contents, then this is a noop.
+  // This call has no effect on dirty pages in the cache.
+  virtual Status InvalidateCache(size_t offset, size_t length) {
+    return Status::NotSupported("InvalidateCache not supported.");
+  }
+
+ protected:
+  // PrepareWrite performs any necessary preparation for a write
+  // before the write actually occurs.  This allows for pre-allocation
+  // of space on devices where it can result in less file
+  // fragmentation and/or less waste from over-zealous filesystem
+  // pre-allocation.
+  void PrepareWrite(size_t offset, size_t len) {
+    if (preallocation_block_size_ == 0) {
+      return;
+    }
+    // If this write would cross one or more preallocation blocks,
+    // determine what the last preallocation block necesessary to
+    // cover this write would be and Allocate to that point.
+    const auto block_size = preallocation_block_size_;
+    size_t new_last_preallocated_block =
+      (offset + len + block_size - 1) / block_size;
+    if (new_last_preallocated_block > last_preallocated_block_) {
+      size_t num_spanned_blocks =
+        new_last_preallocated_block - last_preallocated_block_;
+      Allocate(block_size * last_preallocated_block_,
+               block_size * num_spanned_blocks);
+      last_preallocated_block_ = new_last_preallocated_block;
+    }
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) {
+    return Status::OK();
+  }
+
+ private:
+  size_t last_preallocated_block_;
+  size_t preallocation_block_size_;
+  // No copying allowed
+  WritableFile(const WritableFile&);
+  void operator=(const WritableFile&);
+};
+
+// A file abstraction for random reading and writing.
+class RandomRWFile {
+ public:
+  RandomRWFile() {}
+  virtual ~RandomRWFile() {}
+
+  // Write data from Slice data to file starting from offset
+  // Returns IOError on failure, but does not guarantee
+  // atomicity of a write.  Returns OK status on success.
+  //
+  // Safe for concurrent use.
+  virtual Status Write(uint64_t offset, const Slice& data) = 0;
+  // Read up to "n" bytes from the file starting at "offset".
+  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
+  // to the data that was read (including if fewer than "n" bytes were
+  // successfully read).  May set "*result" to point at data in
+  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
+  // "*result" is used.  If an error was encountered, returns a non-OK
+  // status.
+  //
+  // Safe for concurrent use by multiple threads.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const = 0;
+  virtual Status Close() = 0; // closes the file
+  virtual Status Sync() = 0; // sync data
+
+  /*
+   * Sync data and/or metadata as well.
+   * By default, sync only data.
+   * Override this method for environments where we need to sync
+   * metadata as well.
+   */
+  virtual Status Fsync() {
+    return Sync();
+  }
+
+  /*
+   * Pre-allocate space for a file.
+   */
+  virtual Status Allocate(off_t offset, off_t len) {
+    return Status::OK();
+  }
+
+ private:
+  // No copying allowed
+  RandomRWFile(const RandomRWFile&);
+  void operator=(const RandomRWFile&);
+};
+
+// An interface for writing log messages.
+class Logger {
+ public:
+  enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 };
+  Logger() { }
+  virtual ~Logger();
+
+  // Write an entry to the log file with the specified format.
+  virtual void Logv(const char* format, va_list ap) = 0;
+  virtual size_t GetLogFileSize() const {
+    return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE;
+  }
+  // Flush to the OS buffers
+  virtual void Flush() {}
+
+ private:
+  // No copying allowed
+  Logger(const Logger&);
+  void operator=(const Logger&);
+};
+
+
+// Identifies a locked file.
+class FileLock {
+ public:
+  FileLock() { }
+  virtual ~FileLock();
+ private:
+  // No copying allowed
+  FileLock(const FileLock&);
+  void operator=(const FileLock&);
+};
+
+
+extern void LogFlush(const shared_ptr<Logger>& info_log);
+
+// Log the specified data to *info_log if info_log is non-nullptr.
+extern void Log(const shared_ptr<Logger>& info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+extern void LogFlush(Logger *info_log);
+
+extern void Log(Logger* info_log, const char* format, ...)
+#   if defined(__GNUC__) || defined(__clang__)
+    __attribute__((__format__ (__printf__, 2, 3)))
+#   endif
+    ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+                                const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+                               std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+  // Initialize an EnvWrapper that delegates all calls to *t
+  explicit EnvWrapper(Env* t) : target_(t) { }
+  virtual ~EnvWrapper();
+
+  // Return the target to which this Env forwards all calls
+  Env* target() const { return target_; }
+
+  // The following text is boilerplate that forwards all methods to target()
+  Status NewSequentialFile(const std::string& f,
+                           unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) {
+    return target_->NewSequentialFile(f, r, options);
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) {
+    return target_->NewRandomAccessFile(f, r, options);
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewWritableFile(f, r, options);
+  }
+  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
+                         const EnvOptions& options) {
+    return target_->NewRandomRWFile(f, r, options);
+  }
+  bool FileExists(const std::string& f) { return target_->FileExists(f); }
+  Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+    return target_->GetChildren(dir, r);
+  }
+  Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+  Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+  Status CreateDirIfMissing(const std::string& d) {
+    return target_->CreateDirIfMissing(d);
+  }
+  Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+  Status GetFileSize(const std::string& f, uint64_t* s) {
+    return target_->GetFileSize(f, s);
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) {
+    return target_->GetFileModificationTime(fname, file_mtime);
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) {
+    return target_->RenameFile(s, t);
+  }
+  Status LockFile(const std::string& f, FileLock** l) {
+    return target_->LockFile(f, l);
+  }
+  Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+  void Schedule(void (*f)(void*), void* a, Priority pri) {
+    return target_->Schedule(f, a, pri);
+  }
+  void StartThread(void (*f)(void*), void* a) {
+    return target_->StartThread(f, a);
+  }
+  virtual Status GetTestDirectory(std::string* path) {
+    return target_->GetTestDirectory(path);
+  }
+  virtual Status NewLogger(const std::string& fname,
+                           shared_ptr<Logger>* result) {
+    return target_->NewLogger(fname, result);
+  }
+  uint64_t NowMicros() {
+    return target_->NowMicros();
+  }
+  void SleepForMicroseconds(int micros) {
+    target_->SleepForMicroseconds(micros);
+  }
+  Status GetHostName(char* name, uint64_t len) {
+    return target_->GetHostName(name, len);
+  }
+  Status GetCurrentTime(int64_t* unix_time) {
+    return target_->GetCurrentTime(unix_time);
+  }
+  Status GetAbsolutePath(const std::string& db_path,
+      std::string* output_path) {
+    return target_->GetAbsolutePath(db_path, output_path);
+  }
+  void SetBackgroundThreads(int num, Priority pri) {
+    return target_->SetBackgroundThreads(num, pri);
+  }
+  std::string TimeToString(uint64_t time) {
+    return target_->TimeToString(time);
+  }
+
+ private:
+  Env* target_;
+};
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ENV_H_
--- a/include/rocksdb/filter_policy.h
+++ b/include/rocksdb/filter_policy.h
@ -0,0 +1,70 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A database can be configured with a custom FilterPolicy object.
+// This object is responsible for creating a small filter from a set
+// of keys.  These filters are stored in rocksdb and are consulted
+// automatically by rocksdb to decide whether or not to read some
+// information from disk. In many cases, a filter can cut down the
+// number of disk seeks form a handful to a single disk seek per
+// DB::Get() call.
+//
+// Most people will want to use the builtin bloom filter support (see
+// NewBloomFilterPolicy() below).
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+
+class FilterPolicy {
+ public:
+  virtual ~FilterPolicy();
+
+  // Return the name of this policy.  Note that if the filter encoding
+  // changes in an incompatible way, the name returned by this method
+  // must be changed.  Otherwise, old incompatible filters may be
+  // passed to methods of this type.
+  virtual const char* Name() const = 0;
+
+  // keys[0,n-1] contains a list of keys (potentially with duplicates)
+  // that are ordered according to the user supplied comparator.
+  // Append a filter that summarizes keys[0,n-1] to *dst.
+  //
+  // Warning: do not change the initial contents of *dst.  Instead,
+  // append the newly constructed filter to *dst.
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
+      const = 0;
+
+  // "filter" contains the data appended by a preceding call to
+  // CreateFilter() on this class.  This method must return true if
+  // the key was in the list of keys passed to CreateFilter().
+  // This method may return true or false if the key was not on the
+  // list, but it should aim to return false with a high probability.
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
+};
+
+// Return a new filter policy that uses a bloom filter with approximately
+// the specified number of bits per key.  A good value for bits_per_key
+// is 10, which yields a filter with ~ 1% false positive rate.
+//
+// Callers must delete the result after any database that is using the
+// result has been closed.
+//
+// Note: if you are using a custom comparator that ignores some parts
+// of the keys being compared, you must not use NewBloomFilterPolicy()
+// and must provide your own FilterPolicy that also ignores the
+// corresponding parts of the keys.  For example, if the comparator
+// ignores trailing spaces, it would be incorrect to use a
+// FilterPolicy (like NewBloomFilterPolicy) that does not ignore
+// trailing spaces in keys.
+extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
+
+}
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
--- a/include/rocksdb/flush_block_policy.h
+++ b/include/rocksdb/flush_block_policy.h
@ -0,0 +1,64 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+class Slice;
+class BlockBuilder;
+
+// FlushBlockPolicy provides a configurable way to determine when to flush a
+// block in the block based tables,
+class FlushBlockPolicy {
+ public:
+  // Keep track of the key/value sequences and return the boolean value to
+  // determine if table builder should flush current data block.
+  virtual bool Update(const Slice& key,
+                      const Slice& value) = 0;
+
+  virtual ~FlushBlockPolicy() { }
+};
+
+class FlushBlockPolicyFactory {
+ public:
+  // Return the name of the flush block policy.
+  virtual const char* Name() const = 0;
+
+  // Return a new block flush policy that flushes data blocks by data size.
+  // FlushBlockPolicy may need to access the metadata of the data block
+  // builder to determine when to flush the blocks.
+  //
+  // Callers must delete the result after any database that is using the
+  // result has been closed.
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const = 0;
+
+  virtual ~FlushBlockPolicyFactory() { }
+};
+
+class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  FlushBlockBySizePolicyFactory(const uint64_t block_size,
+                                const uint64_t block_size_deviation) :
+      block_size_(block_size),
+      block_size_deviation_(block_size_deviation) {
+  }
+
+  virtual const char* Name() const override {
+    return "FlushBlockBySizePolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBuilder& data_block_builder) const override;
+
+ private:
+  const uint64_t block_size_;
+  const uint64_t block_size_deviation_;
+};
+
+}  // rocksdb
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@ -0,0 +1,102 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface.  Multiple implementations
+// are provided by this library.  In particular, iterators are provided
+// to access the contents of a Table or a DB.
+//
+// Multiple threads can invoke const methods on an Iterator without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Iterator must use
+// external synchronization.
+
+#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator {
+ public:
+  Iterator();
+  virtual ~Iterator();
+
+  // An iterator is either positioned at a key/value pair, or
+  // not valid.  This method returns true iff the iterator is valid.
+  virtual bool Valid() const = 0;
+
+  // Position at the first key in the source.  The iterator is Valid()
+  // after this call iff the source is not empty.
+  virtual void SeekToFirst() = 0;
+
+  // Position at the last key in the source.  The iterator is
+  // Valid() after this call iff the source is not empty.
+  virtual void SeekToLast() = 0;
+
+  // Position at the first key in the source that at or past target
+  // The iterator is Valid() after this call iff the source contains
+  // an entry that comes at or past target.
+  virtual void Seek(const Slice& target) = 0;
+
+  // Moves to the next entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the last entry in the source.
+  // REQUIRES: Valid()
+  virtual void Next() = 0;
+
+  // Moves to the previous entry in the source.  After this call, Valid() is
+  // true iff the iterator was not positioned at the first entry in source.
+  // REQUIRES: Valid()
+  virtual void Prev() = 0;
+
+  // Return the key for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: Valid()
+  virtual Slice key() const = 0;
+
+  // Return the value for the current entry.  The underlying storage for
+  // the returned slice is valid only until the next modification of
+  // the iterator.
+  // REQUIRES: !AtEnd() && !AtStart()
+  virtual Slice value() const = 0;
+
+  // If an error has occurred, return it.  Else return an ok status.
+  // If non-blocking IO is requested and this operation cannot be
+  // satisfied without doing some IO, then this returns Status::Incomplete().
+  virtual Status status() const = 0;
+
+  // Clients are allowed to register function/arg1/arg2 triples that
+  // will be invoked when this iterator is destroyed.
+  //
+  // Note that unlike all of the preceding methods, this method is
+  // not abstract and therefore clients should not override it.
+  typedef void (*CleanupFunction)(void* arg1, void* arg2);
+  void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+  struct Cleanup {
+    CleanupFunction function;
+    void* arg1;
+    void* arg2;
+    Cleanup* next;
+  };
+  Cleanup cleanup_;
+
+  // No copying allowed
+  Iterator(const Iterator&);
+  void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_
--- a/include/rocksdb/ldb_tool.h
+++ b/include/rocksdb/ldb_tool.h
@ -0,0 +1,15 @@
+// Copyright 2008-present Facebook. All Rights Reserved.
+#ifndef STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#define STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+class LDBTool {
+ public:
+  void Run(int argc, char** argv, Options = Options());
+};
+
+} // namespace rocksdb
+
+#endif // STORAGE_ROCKSDB_INCLUDE_LDB_TOOL_H
--- a/Show More
+++ b/Show More