Added bloom filter support.

In particular, we add a new FilterPolicy class. An instance of this class can be supplied in Options when opening a database. If supplied, the instance is used to generate summaries of keys (e.g., a bloom filter) which are placed in sstables. These summaries are consulted by DB::Get() so we can avoid reading sstable blocks that are guaranteed to not contain the key we are looking for. This change provides one implementation of FilterPolicy based on bloom filters. Other changes: - Updated version number to 1.4. - Some build tweaks. - C binding for CompactRange. - A few more benchmarks: deleteseq, deleterandom, readmissing, seekrandom. - Minor .gitignore update.
2012-04-17 08:36:46 -07:00 · 2012-04-17 08:36:46 -07:00 · 85584d497e
commit 85584d497e
parent bc1ee4d25e
38 changed files with 2070 additions and 570 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,8 @@
 build_config.mk
 *.a
 *.o
 *.dylib*
 *.so
 *.so.*
 *_test
 db_bench
--- a/14
+++ b/14
@ -17,8 +17,8 @@ OPT ?= -O2 -DNDEBUG       # (A) Production use (optimized mode)
 #-----------------------------------------------
 # detect what platform we're building on
-$(shell sh ./build_detect_platform)
+$(shell ./build_detect_platform build_config.mk)
-# this file is generated by build_detect_platform to set build flags and sources
+# this file is generated by the previous line to set build flags and sources
 include build_config.mk
 CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
@ -34,6 +34,7 @@ TESTHARNESS = ./util/testharness.o $(TESTUTIL)
 TESTS = \
 	arena_test \
 	bloom_test \
 	c_test \
 	cache_test \
 	coding_test \
@ -43,6 +44,7 @@ TESTS = \
 	dbformat_test \
 	env_test \
 	filename_test \
 	filter_block_test \
 	log_test \
 	memenv_test \
 	skiplist_test \
@ -63,7 +65,7 @@ default: all
 ifneq ($(PLATFORM_SHARED_EXT),)
 # Update db.h if you change these.
 SHARED_MAJOR = 1
-SHARED_MINOR = 3
+SHARED_MINOR = 4
 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT)
 SHARED2 = $(SHARED1).$(SHARED_MAJOR)
 SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
@ -101,6 +103,9 @@ db_bench_tree_db: doc/bench/db_bench_tree_db.o $(LIBOBJECTS) $(TESTUTIL)
 arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
 bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
 c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
@ -128,6 +133,9 @@ env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
 filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
 filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
 log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@ $(LDFLAGS)
--- a/38
+++ b/38
@ -1,9 +1,9 @@
 #!/bin/sh
 #
-# Detects OS we're compiling on and generates build_config.mk,
+# Detects OS we're compiling on and outputs a file specified by the first
-# which in turn gets read while processing Makefile.
+# argument, which in turn gets read while processing Makefile.
 #
-# build_config.mk will set the following variables:
+# The output will set the following variables:
 #   PLATFORM_LDFLAGS            Linker flags
 #   PLATFORM_SHARED_EXT         Extension for shared libraries
 #   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
@ -13,11 +13,15 @@
 #       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
 #       -DLEVELDB_PLATFORM_NOATOMIC if it is not
-SCRIPT_DIR=`dirname $0`
+OUTPUT=$1
 if test -z "$OUTPUT"; then
  echo "usage: $0 <output-filename>"
  exit 1
 fi
-# Delete existing build_config.mk
+# Delete existing output, if it exists
-rm -f build_config.mk
+rm -f $OUTPUT
-touch build_config.mk
+touch $OUTPUT
 if test -z "$CXX"; then
    CXX=g++
@ -96,7 +100,7 @@ esac
 # except for the test and benchmark files. By default, find will output a list
 # of all files matching either rule, so we need to append -print to make the
 # prune take effect.
-DIRS="$SCRIPT_DIR/util $SCRIPT_DIR/db $SCRIPT_DIR/table"
+DIRS="util db table"
 set -f # temporarily disable globbing so that our patterns aren't expanded
 PRUNE_TEST="-name *test*.cc -prune"
 PRUNE_BENCH="-name *_bench.cc -prune"
@ -105,8 +109,8 @@ set +f # re-enable globbing
 # The sources consist of the portable files, plus the platform-specific port
 # file.
-echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> build_config.mk
+echo "SOURCES=$PORTABLE_FILES $PORT_FILE" >> $OUTPUT
-echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> build_config.mk
+echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT
 if [ "$PLATFORM" = "OS_ANDROID_CROSSCOMPILE" ]; then
    # Cross-compiling; do not try any compilation tests.
@ -147,10 +151,10 @@ fi
 PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
 PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
-echo "PLATFORM=$PLATFORM" >> build_config.mk
+echo "PLATFORM=$PLATFORM" >> $OUTPUT
-echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> build_config.mk
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> $OUTPUT
-echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> build_config.mk
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> $OUTPUT
-echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> build_config.mk
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> $OUTPUT
-echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> build_config.mk
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT
-echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> build_config.mk
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT
-echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> build_config.mk
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT
--- a/db/c.cc
+++ b/db/c.cc
@ -10,6 +10,7 @@
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
 #include "leveldb/env.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/iterator.h"
 #include "leveldb/options.h"
 #include "leveldb/status.h"
@ -21,8 +22,10 @@ using leveldb::CompressionType;
 using leveldb::DB;
 using leveldb::Env;
 using leveldb::FileLock;
 using leveldb::FilterPolicy;
 using leveldb::Iterator;
 using leveldb::Logger;
 using leveldb::NewBloomFilterPolicy;
 using leveldb::NewLRUCache;
 using leveldb::Options;
 using leveldb::RandomAccessFile;
@ -78,6 +81,47 @@ struct leveldb_comparator_t : public Comparator {
  virtual void FindShortSuccessor(std::string* key) const { }
 };
 struct leveldb_filterpolicy_t : public FilterPolicy {
  void* state_;
  void (*destructor_)(void*);
  const char* (*name_)(void*);
  char* (*create_)(
      void*,
      const char* const* key_array, const size_t* key_length_array,
      int num_keys,
      size_t* filter_length);
  unsigned char (*key_match_)(
      void*,
      const char* key, size_t length,
      const char* filter, size_t filter_length);
  virtual ~leveldb_filterpolicy_t() {
    (*destructor_)(state_);
  }
  virtual const char* Name() const {
    return (*name_)(state_);
  }
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    std::vector<const char*> key_pointers(n);
    std::vector<size_t> key_sizes(n);
    for (int i = 0; i < n; i++) {
      key_pointers[i] = keys[i].data();
      key_sizes[i] = keys[i].size();
    }
    size_t len;
    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
    dst->append(filter, len);
    free(filter);
  }
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
    return (*key_match_)(state_, key.data(), key.size(),
                         filter.data(), filter.size());
  }
 };
 struct leveldb_env_t {
  Env* rep;
  bool is_default;
@ -218,6 +262,17 @@ void leveldb_approximate_sizes(
  delete[] ranges;
 }
 void leveldb_compact_range(
    leveldb_t* db,
    const char* start_key, size_t start_key_len,
    const char* limit_key, size_t limit_key_len) {
  Slice a, b;
  db->rep->CompactRange(
      // Pass NULL Slice if corresponding "const char*" is NULL
      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
 }
 void leveldb_destroy_db(
    const leveldb_options_t* options,
    const char* name,
@ -340,6 +395,12 @@ void leveldb_options_set_comparator(
  opt->rep.comparator = cmp;
 }
 void leveldb_options_set_filter_policy(
    leveldb_options_t* opt,
    leveldb_filterpolicy_t* policy) {
  opt->rep.filter_policy = policy;
 }
 void leveldb_options_set_create_if_missing(
    leveldb_options_t* opt, unsigned char v) {
  opt->rep.create_if_missing = v;
@ -407,6 +468,55 @@ void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
  delete cmp;
 }
 leveldb_filterpolicy_t* leveldb_filterpolicy_create(
    void* state,
    void (*destructor)(void*),
    char* (*create_filter)(
        void*,
        const char* const* key_array, const size_t* key_length_array,
        int num_keys,
        size_t* filter_length),
    unsigned char (*key_may_match)(
        void*,
        const char* key, size_t length,
        const char* filter, size_t filter_length),
    const char* (*name)(void*)) {
  leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
  result->state_ = state;
  result->destructor_ = destructor;
  result->create_ = create_filter;
  result->key_match_ = key_may_match;
  result->name_ = name;
  return result;
 }
 void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
  delete filter;
 }
 leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
  // Make a leveldb_filterpolicy_t, but override all of its methods so
  // they delegate to a NewBloomFilterPolicy() instead of user
  // supplied C functions.
  struct Wrapper : public leveldb_filterpolicy_t {
    const FilterPolicy* rep_;
    ~Wrapper() { delete rep_; }
    const char* Name() const { return rep_->Name(); }
    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
      return rep_->CreateFilter(keys, n, dst);
    }
    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
      return rep_->KeyMayMatch(key, filter);
    }
    static void DoNothing(void*) { }
  };
  Wrapper* wrapper = new Wrapper;
  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
  wrapper->state_ = NULL;
  wrapper->destructor_ = &Wrapper::DoNothing;
  return wrapper;
 }
 leveldb_readoptions_t* leveldb_readoptions_create() {
  return new leveldb_readoptions_t;
 }
--- a/db/c_test.c
+++ b/db/c_test.c
@ -122,6 +122,31 @@ static const char* CmpName(void* arg) {
  return "foo";
 }
 // Custom filter policy
 static unsigned char fake_filter_result = 1;
 static void FilterDestroy(void* arg) { }
 static const char* FilterName(void* arg) {
  return "TestFilter";
 }
 static char* FilterCreate(
    void* arg,
    const char* const* key_array, const size_t* key_length_array,
    int num_keys,
    size_t* filter_length) {
  *filter_length = 4;
  char* result = malloc(4);
  memcpy(result, "fake", 4);
  return result;
 }
 unsigned char FilterKeyMatch(
    void* arg,
    const char* key, size_t length,
    const char* filter, size_t filter_length) {
  CheckCondition(filter_length == 4);
  CheckCondition(memcmp(filter, "fake", 4) == 0);
  return fake_filter_result;
 }
 int main(int argc, char** argv) {
  leveldb_t* db;
  leveldb_comparator_t* cmp;
@ -131,6 +156,7 @@ int main(int argc, char** argv) {
  leveldb_readoptions_t* roptions;
  leveldb_writeoptions_t* woptions;
  char* err = NULL;
  int run = -1;
  snprintf(dbname, sizeof(dbname), "/tmp/leveldb_c_test-%d",
           ((int) geteuid()));
@ -180,6 +206,14 @@ int main(int argc, char** argv) {
  CheckNoError(err);
  CheckGet(db, roptions, "foo", "hello");
  StartPhase("compactall");
  leveldb_compact_range(db, NULL, 0, NULL, 0);
  CheckGet(db, roptions, "foo", "hello");
  StartPhase("compactrange");
  leveldb_compact_range(db, "a", 1, "z", 1);
  CheckGet(db, roptions, "foo", "hello");
  StartPhase("writebatch");
  {
    leveldb_writebatch_t* wb = leveldb_writebatch_create();
@ -279,6 +313,49 @@ int main(int argc, char** argv) {
    CheckGet(db, roptions, "foo", NULL);
    CheckGet(db, roptions, "bar", NULL);
    CheckGet(db, roptions, "box", "c");
    leveldb_options_set_create_if_missing(options, 1);
    leveldb_options_set_error_if_exists(options, 1);
  }
  StartPhase("filter");
  for (run = 0; run < 2; run++) {
    // First run uses custom filter, second run uses bloom filter
    CheckNoError(err);
    leveldb_filterpolicy_t* policy;
    if (run == 0) {
      policy = leveldb_filterpolicy_create(
          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
    } else {
      policy = leveldb_filterpolicy_create_bloom(10);
    }
    // Create new database
    leveldb_close(db);
    leveldb_destroy_db(options, dbname, &err);
    leveldb_options_set_filter_policy(options, policy);
    db = leveldb_open(options, dbname, &err);
    CheckNoError(err);
    leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
    CheckNoError(err);
    leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
    CheckNoError(err);
    leveldb_compact_range(db, NULL, 0, NULL, 0);
    fake_filter_result = 1;
    CheckGet(db, roptions, "foo", "foovalue");
    CheckGet(db, roptions, "bar", "barvalue");
    if (phase == 0) {
      // Must not find value when custom filter returns false
      fake_filter_result = 0;
      CheckGet(db, roptions, "foo", NULL);
      CheckGet(db, roptions, "bar", NULL);
      fake_filter_result = 1;
      CheckGet(db, roptions, "foo", "foovalue");
      CheckGet(db, roptions, "bar", "barvalue");
    }
    leveldb_options_set_filter_policy(options, NULL);
    leveldb_filterpolicy_destroy(policy);
  }
  StartPhase("cleanup");
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@ -25,15 +25,20 @@
 //      overwrite     -- overwrite N values in random key order in async mode
 //      fillsync      -- write N/100 values in random key order in sync mode
 //      fill100K      -- write N/1000 100K values in random order in async mode
 //      deleteseq     -- delete N keys in sequential order
 //      deleterandom  -- delete N keys in random order
 //      readseq       -- read N times sequentially
 //      readreverse   -- read N times in reverse order
 //      readrandom    -- read N times in random order
 //      readmissing   -- read N missing keys in random order
 //      readhot       -- read N times in random order from 1% section of DB
 //      seekrandom    -- N random seeks
 //      crc32c        -- repeated crc32c of 4K of data
 //      acquireload   -- load N*1000 times
 //   Meta operations:
 //      compact     -- Compact the entire DB
 //      stats       -- Print DB stats
 //      sstables    -- Print sstable info
 //      heapprofile -- Dump a heap profile (if supported by this port)
 static const char* FLAGS_benchmarks =
    "fillseq,"
@ -85,6 +90,10 @@ static int FLAGS_cache_size = -1;
 // Maximum number of files to keep open at the same time (use default if == 0)
 static int FLAGS_open_files = 0;
 // Bloom filter bits per key.
 // Negative means use default settings.
 static int FLAGS_bloom_bits = -1;
 // If true, do not destroy the existing database.  If you set this
 // flag and also specify a benchmark that wants a fresh database, that
 // benchmark will fail.
@ -293,6 +302,7 @@ struct ThreadState {
 class Benchmark {
 private:
  Cache* cache_;
  const FilterPolicy* filter_policy_;
  DB* db_;
  int num_;
  int value_size_;
@ -378,6 +388,9 @@ class Benchmark {
 public:
  Benchmark()
  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
    filter_policy_(FLAGS_bloom_bits >= 0
                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
                   : NULL),
    db_(NULL),
    num_(FLAGS_num),
    value_size_(FLAGS_value_size),
@ -399,6 +412,7 @@ class Benchmark {
  ~Benchmark() {
    delete db_;
    delete cache_;
    delete filter_policy_;
  }
  void Run() {
@ -457,11 +471,19 @@ class Benchmark {
        method = &Benchmark::ReadReverse;
      } else if (name == Slice("readrandom")) {
        method = &Benchmark::ReadRandom;
      } else if (name == Slice("readmissing")) {
        method = &Benchmark::ReadMissing;
      } else if (name == Slice("seekrandom")) {
        method = &Benchmark::SeekRandom;
      } else if (name == Slice("readhot")) {
        method = &Benchmark::ReadHot;
      } else if (name == Slice("readrandomsmall")) {
        reads_ /= 1000;
        method = &Benchmark::ReadRandom;
      } else if (name == Slice("deleteseq")) {
        method = &Benchmark::DeleteSeq;
      } else if (name == Slice("deleterandom")) {
        method = &Benchmark::DeleteRandom;
      } else if (name == Slice("readwhilewriting")) {
        num_threads++;  // Add extra thread for writing
        method = &Benchmark::ReadWhileWriting;
@ -478,7 +500,9 @@ class Benchmark {
      } else if (name == Slice("heapprofile")) {
        HeapProfile();
      } else if (name == Slice("stats")) {
-        PrintStats();
+        PrintStats("leveldb.stats");
      } else if (name == Slice("sstables")) {
        PrintStats("leveldb.sstables");
      } else {
        if (name != Slice()) {  // No error message for empty name
          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
@ -669,6 +693,7 @@ class Benchmark {
    options.create_if_missing = !FLAGS_use_existing_db;
    options.block_cache = cache_;
    options.write_buffer_size = FLAGS_write_buffer_size;
    options.filter_policy = filter_policy_;
    Status s = DB::Open(options, FLAGS_db, &db_);
    if (!s.ok()) {
      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -743,10 +768,28 @@ class Benchmark {
  void ReadRandom(ThreadState* thread) {
    ReadOptions options;
    std::string value;
    int found = 0;
    for (int i = 0; i < reads_; i++) {
      char key[100];
      const int k = thread->rand.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d", k);
      if (db_->Get(options, key, &value).ok()) {
        found++;
      }
      thread->stats.FinishedSingleOp();
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
    thread->stats.AddMessage(msg);
  }
  void ReadMissing(ThreadState* thread) {
    ReadOptions options;
    std::string value;
    for (int i = 0; i < reads_; i++) {
      char key[100];
      const int k = thread->rand.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d.", k);
      db_->Get(options, key, &value);
      thread->stats.FinishedSingleOp();
    }
@ -765,6 +808,54 @@ class Benchmark {
    }
  }
  void SeekRandom(ThreadState* thread) {
    ReadOptions options;
    std::string value;
    int found = 0;
    for (int i = 0; i < reads_; i++) {
      Iterator* iter = db_->NewIterator(options);
      char key[100];
      const int k = thread->rand.Next() % FLAGS_num;
      snprintf(key, sizeof(key), "%016d", k);
      iter->Seek(key);
      if (iter->Valid() && iter->key() == key) found++;
      delete iter;
      thread->stats.FinishedSingleOp();
    }
    char msg[100];
    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
    thread->stats.AddMessage(msg);
  }
  void DoDelete(ThreadState* thread, bool seq) {
    RandomGenerator gen;
    WriteBatch batch;
    Status s;
    for (int i = 0; i < num_; i += entries_per_batch_) {
      batch.Clear();
      for (int j = 0; j < entries_per_batch_; j++) {
        const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
        char key[100];
        snprintf(key, sizeof(key), "%016d", k);
        batch.Delete(key);
        thread->stats.FinishedSingleOp();
      }
      s = db_->Write(write_options_, &batch);
      if (!s.ok()) {
        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
        exit(1);
      }
    }
  }
  void DeleteSeq(ThreadState* thread) {
    DoDelete(thread, true);
  }
  void DeleteRandom(ThreadState* thread) {
    DoDelete(thread, false);
  }
  void ReadWhileWriting(ThreadState* thread) {
    if (thread->tid > 0) {
      ReadRandom(thread);
@ -799,9 +890,9 @@ class Benchmark {
    db_->CompactRange(NULL, NULL);
  }
-  void PrintStats() {
+  void PrintStats(const char* key) {
    std::string stats;
-    if (!db_->GetProperty("leveldb.stats", &stats)) {
+    if (!db_->GetProperty(key, &stats)) {
      stats = "(failed)";
    }
    fprintf(stdout, "\n%s\n", stats.c_str());
@ -861,6 +952,8 @@ int main(int argc, char** argv) {
      FLAGS_write_buffer_size = n;
    } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
      FLAGS_cache_size = n;
    } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
      FLAGS_bloom_bits = n;
    } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
      FLAGS_open_files = n;
    } else if (strncmp(argv[i], "--db=", 5) == 0) {
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@ -87,9 +87,11 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
 }
 Options SanitizeOptions(const std::string& dbname,
                        const InternalKeyComparator* icmp,
                        const InternalFilterPolicy* ipolicy,
                        const Options& src) {
  Options result = src;
  result.comparator = icmp;
  result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
  ClipToRange(&result.max_open_files,            20,     50000);
  ClipToRange(&result.write_buffer_size,         64<<10, 1<<30);
  ClipToRange(&result.block_size,                1<<10,  4<<20);
@ -112,7 +114,9 @@ Options SanitizeOptions(const std::string& dbname,
 DBImpl::DBImpl(const Options& options, const std::string& dbname)
    : env_(options.env),
      internal_comparator_(options.comparator),
-      options_(SanitizeOptions(dbname, &internal_comparator_, options)),
+      internal_filter_policy_(options.filter_policy),
      options_(SanitizeOptions(
          dbname, &internal_comparator_, &internal_filter_policy_, options)),
      owns_info_log_(options_.info_log != options.info_log),
      owns_cache_(options_.block_cache != options.block_cache),
      dbname_(dbname),
--- a/db/db_impl.h
+++ b/db/db_impl.h
@ -105,6 +105,7 @@ class DBImpl : public DB {
  // Constant after construction
  Env* const env_;
  const InternalKeyComparator internal_comparator_;
  const InternalFilterPolicy internal_filter_policy_;
  const Options options_;  // options_.comparator == &internal_comparator_
  bool owns_info_log_;
  bool owns_cache_;
@ -185,6 +186,7 @@ class DBImpl : public DB {
 // it is not equal to src.info_log.
 extern Options SanitizeOptions(const std::string& db,
                               const InternalKeyComparator* icmp,
                               const InternalFilterPolicy* ipolicy,
                               const Options& src);
 }  // namespace leveldb
--- a/db/db_test.cc
+++ b/db/db_test.cc
@ -3,12 +3,15 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "leveldb/db.h"
 #include "leveldb/filter_policy.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
 #include "leveldb/cache.h"
 #include "leveldb/env.h"
 #include "leveldb/table.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/testharness.h"
@ -22,6 +25,28 @@ static std::string RandomString(Random* rnd, int len) {
  return r;
 }
 namespace {
 class AtomicCounter {
 private:
  port::Mutex mu_;
  int count_;
 public:
  AtomicCounter() : count_(0) { }
  void Increment() {
    MutexLock l(&mu_);
    count_++;
  }
  int Read() {
    MutexLock l(&mu_);
    return count_;
  }
  void Reset() {
    MutexLock l(&mu_);
    count_ = 0;
  }
 };
 }
 // Special Env used to delay background operations
 class SpecialEnv : public EnvWrapper {
 public:
@ -31,9 +56,13 @@ class SpecialEnv : public EnvWrapper {
  // Simulate no-space errors while this pointer is non-NULL.
  port::AtomicPointer no_space_;
  bool count_random_reads_;
  AtomicCounter random_read_counter_;
  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
    delay_sstable_sync_.Release_Store(NULL);
    no_space_.Release_Store(NULL);
    count_random_reads_ = false;
  }
  Status NewWritableFile(const std::string& f, WritableFile** r) {
@ -74,9 +103,44 @@ class SpecialEnv : public EnvWrapper {
    }
    return s;
  }
  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
    class CountingFile : public RandomAccessFile {
     private:
      RandomAccessFile* target_;
      AtomicCounter* counter_;
     public:
      CountingFile(RandomAccessFile* target, AtomicCounter* counter)
          : target_(target), counter_(counter) {
      }
      virtual ~CountingFile() { delete target_; }
      virtual Status Read(uint64_t offset, size_t n, Slice* result,
                          char* scratch) const {
        counter_->Increment();
        return target_->Read(offset, n, result, scratch);
      }
    };
    Status s = target()->NewRandomAccessFile(f, r);
    if (s.ok() && count_random_reads_) {
      *r = new CountingFile(*r, &random_read_counter_);
    }
    return s;
  }
 };
 class DBTest {
 private:
  const FilterPolicy* filter_policy_;
  // Sequence of option configurations to try
  enum OptionConfig {
    kDefault,
    kFilter,
    kEnd
  };
  int option_config_;
 public:
  std::string dbname_;
  SpecialEnv* env_;
@ -84,7 +148,9 @@ class DBTest {
  Options last_options_;
-  DBTest() : env_(new SpecialEnv(Env::Default())) {
+  DBTest() : option_config_(kDefault),
             env_(new SpecialEnv(Env::Default())) {
    filter_policy_ = NewBloomFilterPolicy(10);
    dbname_ = test::TmpDir() + "/db_test";
    DestroyDB(dbname_, Options());
    db_ = NULL;
@ -95,6 +161,32 @@ class DBTest {
    delete db_;
    DestroyDB(dbname_, Options());
    delete env_;
    delete filter_policy_;
  }
  // Switch to a fresh database with the next option configuration to
  // test.  Return false if there are no more configurations to test.
  bool ChangeOptions() {
    if (option_config_ == kEnd) {
      return false;
    } else {
      option_config_++;
      DestroyAndReopen();
      return true;
    }
  }
  // Return the current option configuration.
  Options CurrentOptions() {
    Options options;
    switch (option_config_) {
      case kFilter:
        options.filter_policy = filter_policy_;
        break;
      default:
        break;
    }
    return options;
  }
  DBImpl* dbfull() {
@ -105,6 +197,11 @@ class DBTest {
    ASSERT_OK(TryReopen(options));
  }
  void Close() {
    delete db_;
    db_ = NULL;
  }
  void DestroyAndReopen(Options* options = NULL) {
    delete db_;
    db_ = NULL;
@ -119,6 +216,7 @@ class DBTest {
    if (options != NULL) {
      opts = *options;
    } else {
      opts = CurrentOptions();
      opts.create_if_missing = true;
    }
    last_options_ = opts;
@ -189,8 +287,7 @@ class DBTest {
        if (!ParseInternalKey(iter->key(), &ikey)) {
          result += "CORRUPTED";
        } else {
-          if (last_options_.comparator->Compare(
+          if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
                  ikey.user_key, user_key) != 0) {
            break;
          }
          if (!first) {
@ -314,30 +411,37 @@ class DBTest {
 };
 TEST(DBTest, Empty) {
  do {
    ASSERT_TRUE(db_ != NULL);
    ASSERT_EQ("NOT_FOUND", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, ReadWrite) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_OK(Put("bar", "v2"));
    ASSERT_OK(Put("foo", "v3"));
    ASSERT_EQ("v3", Get("foo"));
    ASSERT_EQ("v2", Get("bar"));
  } while (ChangeOptions());
 }
 TEST(DBTest, PutDeleteGet) {
  do {
    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
    ASSERT_EQ("v1", Get("foo"));
    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
    ASSERT_EQ("v2", Get("foo"));
    ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
    ASSERT_EQ("NOT_FOUND", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, GetFromImmutableLayer) {
-  Options options;
+  do {
    Options options = CurrentOptions();
    options.env = env_;
    options.write_buffer_size = 100000;  // Small write buffer
    Reopen(&options);
@ -350,15 +454,19 @@ TEST(DBTest, GetFromImmutableLayer) {
    Put("k2", std::string(100000, 'y'));             // Trigger compaction
    ASSERT_EQ("v1", Get("foo"));
    env_->delay_sstable_sync_.Release_Store(NULL);   // Release sync calls
  } while (ChangeOptions());
 }
 TEST(DBTest, GetFromVersions) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    dbfull()->TEST_CompactMemTable();
    ASSERT_EQ("v1", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, GetSnapshot) {
  do {
    // Try with both a short key and a long key
    for (int i = 0; i < 2; i++) {
      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
@ -372,9 +480,11 @@ TEST(DBTest, GetSnapshot) {
      ASSERT_EQ("v1", Get(key, s1));
      db_->ReleaseSnapshot(s1);
    }
  } while (ChangeOptions());
 }
 TEST(DBTest, GetLevel0Ordering) {
  do {
    // Check that we process level-0 files in correct order.  The code
    // below generates two level-0 files where the earlier one comes
    // before the later one in the level-0 file list since the earlier
@ -385,9 +495,11 @@ TEST(DBTest, GetLevel0Ordering) {
    ASSERT_OK(Put("foo", "v2"));
    dbfull()->TEST_CompactMemTable();
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, GetOrderedByLevels) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    Compact("a", "z");
    ASSERT_EQ("v1", Get("foo"));
@ -395,9 +507,11 @@ TEST(DBTest, GetOrderedByLevels) {
    ASSERT_EQ("v2", Get("foo"));
    dbfull()->TEST_CompactMemTable();
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, GetPicksCorrectFile) {
  do {
    // Arrange to have multiple files in a non-level-0 level.
    ASSERT_OK(Put("a", "va"));
    Compact("a", "b");
@ -408,9 +522,11 @@ TEST(DBTest, GetPicksCorrectFile) {
    ASSERT_EQ("va", Get("a"));
    ASSERT_EQ("vf", Get("f"));
    ASSERT_EQ("vx", Get("x"));
  } while (ChangeOptions());
 }
 TEST(DBTest, GetEncountersEmptyLevel) {
  do {
    // Arrange for the following to happen:
    //   * sstable A in level 0
    //   * nothing in level 1
@ -443,6 +559,7 @@ TEST(DBTest, GetEncountersEmptyLevel) {
      read_count++;
      ASSERT_EQ("NOT_FOUND", Get("missing"));
    }
  } while (ChangeOptions());
 }
 TEST(DBTest, IterEmpty) {
@ -620,6 +737,7 @@ TEST(DBTest, IterSmallAndLargeMix) {
 }
 TEST(DBTest, IterMultiWithDelete) {
  do {
    ASSERT_OK(Put("a", "va"));
    ASSERT_OK(Put("b", "vb"));
    ASSERT_OK(Put("c", "vc"));
@ -632,9 +750,11 @@ TEST(DBTest, IterMultiWithDelete) {
    iter->Prev();
    ASSERT_EQ(IterStatus(iter), "a->va");
    delete iter;
  } while (ChangeOptions());
 }
 TEST(DBTest, Recover) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_OK(Put("baz", "v5"));
@ -652,9 +772,11 @@ TEST(DBTest, Recover) {
    ASSERT_EQ("v4", Get("foo"));
    ASSERT_EQ("v2", Get("bar"));
    ASSERT_EQ("v5", Get("baz"));
  } while (ChangeOptions());
 }
 TEST(DBTest, RecoveryWithEmptyLog) {
  do {
    ASSERT_OK(Put("foo", "v1"));
    ASSERT_OK(Put("foo", "v2"));
    Reopen();
@ -662,12 +784,14 @@ TEST(DBTest, RecoveryWithEmptyLog) {
    ASSERT_OK(Put("foo", "v3"));
    Reopen();
    ASSERT_EQ("v3", Get("foo"));
  } while (ChangeOptions());
 }
 // Check that writes done during a memtable compaction are recovered
 // if the database is shutdown during the memtable compaction.
 TEST(DBTest, RecoverDuringMemtableCompaction) {
-  Options options;
+  do {
    Options options = CurrentOptions();
    options.env = env_;
    options.write_buffer_size = 1000000;
    Reopen(&options);
@ -683,6 +807,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) {
    ASSERT_EQ("v2", Get("bar"));
    ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
    ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
  } while (ChangeOptions());
 }
 static std::string Key(int i) {
@ -692,7 +817,7 @@ static std::string Key(int i) {
 }
 TEST(DBTest, MinorCompactionsHappen) {
-  Options options;
+  Options options = CurrentOptions();
  options.write_buffer_size = 10000;
  Reopen(&options);
@ -718,7 +843,7 @@ TEST(DBTest, MinorCompactionsHappen) {
 TEST(DBTest, RecoverWithLargeLog) {
  {
-    Options options;
+    Options options = CurrentOptions();
    Reopen(&options);
    ASSERT_OK(Put("big1", std::string(200000, '1')));
    ASSERT_OK(Put("big2", std::string(200000, '2')));
@ -729,7 +854,7 @@ TEST(DBTest, RecoverWithLargeLog) {
  // Make sure that if we re-open with a small write buffer size that
  // we flush table files in the middle of a large log file.
-  Options options;
+  Options options = CurrentOptions();
  options.write_buffer_size = 100000;
  Reopen(&options);
  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
@ -741,7 +866,7 @@ TEST(DBTest, RecoverWithLargeLog) {
 }
 TEST(DBTest, CompactionsGenerateMultipleFiles) {
-  Options options;
+  Options options = CurrentOptions();
  options.write_buffer_size = 100000000;        // Large write buffer
  Reopen(&options);
@ -767,7 +892,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
 }
 TEST(DBTest, RepeatedWritesToSameKey) {
-  Options options;
+  Options options = CurrentOptions();
  options.env = env_;
  options.write_buffer_size = 100000;  // Small write buffer
  Reopen(&options);
@ -786,7 +911,7 @@ TEST(DBTest, RepeatedWritesToSameKey) {
 }
 TEST(DBTest, SparseMerge) {
-  Options options;
+  Options options = CurrentOptions();
  options.compression = kNoCompression;
  Reopen(&options);
@ -837,7 +962,8 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) {
 }
 TEST(DBTest, ApproximateSizes) {
-  Options options;
+  do {
    Options options = CurrentOptions();
    options.write_buffer_size = 100000000;        // Large write buffer
    options.compression = kNoCompression;
    DestroyAndReopen();
@ -849,9 +975,11 @@ TEST(DBTest, ApproximateSizes) {
    // Write 8MB (80 values, each 100K)
    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
    const int N = 80;
    static const int S1 = 100000;
    static const int S2 = 105000;  // Allow some expansion from metadata
    Random rnd(301);
    for (int i = 0; i < N; i++) {
-    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
    }
    // 0 because GetApproximateSizes() does not account for memtable space
@ -863,14 +991,12 @@ TEST(DBTest, ApproximateSizes) {
      for (int compact_start = 0; compact_start < N; compact_start += 10) {
        for (int i = 0; i < N; i += 10) {
-        ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
+          ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));
-        ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
+          ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));
-                            100000 * (i+1), 100000 * (i+1) + 10000));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
        ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
                            100000 * 10, 100000 * 10 + 10000));
        }
-      ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
+        ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
-      ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
+        ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));
        std::string cstart_str = Key(compact_start);
        std::string cend_str = Key(compact_start + 9);
@ -882,10 +1008,12 @@ TEST(DBTest, ApproximateSizes) {
      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
      ASSERT_GT(NumTableFilesAtLevel(1), 0);
    }
  } while (ChangeOptions());
 }
 TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
-  Options options;
+  do {
    Options options = CurrentOptions();
    options.compression = kNoCompression;
    Reopen();
@ -912,12 +1040,13 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
      ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
      ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
      ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
-    ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
+      ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
      ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
      dbfull()->TEST_CompactRange(0, NULL, NULL);
    }
  } while (ChangeOptions());
 }
 TEST(DBTest, IteratorPinsRef) {
@ -943,6 +1072,7 @@ TEST(DBTest, IteratorPinsRef) {
 }
 TEST(DBTest, Snapshot) {
  do {
    Put("foo", "v1");
    const Snapshot* s1 = db_->GetSnapshot();
    Put("foo", "v2");
@ -967,9 +1097,11 @@ TEST(DBTest, Snapshot) {
    db_->ReleaseSnapshot(s2);
    ASSERT_EQ("v4", Get("foo"));
  } while (ChangeOptions());
 }
 TEST(DBTest, HiddenValuesAreRemoved) {
  do {
    Random rnd(301);
    FillLevels("a", "z");
@ -996,6 +1128,7 @@ TEST(DBTest, HiddenValuesAreRemoved) {
    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
    ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
  } while (ChangeOptions());
 }
 TEST(DBTest, DeletionMarkers1) {
@ -1054,6 +1187,7 @@ TEST(DBTest, DeletionMarkers2) {
 }
 TEST(DBTest, OverlapInLevel0) {
  do {
    ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
    // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
@ -1090,6 +1224,7 @@ TEST(DBTest, OverlapInLevel0) {
    dbfull()->TEST_CompactMemTable();
    ASSERT_EQ("3", FilesPerLevel());
    ASSERT_EQ("NOT_FOUND", Get("600"));
  } while (ChangeOptions());
 }
 TEST(DBTest, L0_CompactionBug_Issue44_a) {
@ -1150,7 +1285,7 @@ TEST(DBTest, ComparatorCheck) {
    }
  };
  NewComparator cmp;
-  Options new_options;
+  Options new_options = CurrentOptions();
  new_options.comparator = &cmp;
  Status s = TryReopen(&new_options);
  ASSERT_TRUE(!s.ok());
@ -1185,9 +1320,10 @@ TEST(DBTest, CustomComparator) {
    }
  };
  NumberComparator cmp;
-  Options new_options;
+  Options new_options = CurrentOptions();
  new_options.create_if_missing = true;
  new_options.comparator = &cmp;
  new_options.filter_policy = NULL;     // Cannot use bloom filters
  new_options.write_buffer_size = 1000;  // Compact more often
  DestroyAndReopen(&new_options);
  ASSERT_OK(Put("[10]", "ten"));
@ -1197,6 +1333,8 @@ TEST(DBTest, CustomComparator) {
    ASSERT_EQ("ten", Get("[0xa]"));
    ASSERT_EQ("twenty", Get("[20]"));
    ASSERT_EQ("twenty", Get("[0x14]"));
    ASSERT_EQ("NOT_FOUND", Get("[15]"));
    ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
    Compact("[0]", "[9999]");
  }
@ -1285,7 +1423,7 @@ TEST(DBTest, DBOpen_Options) {
 // Check that number of files does not grow when we are out of space
 TEST(DBTest, NoSpace) {
-  Options options;
+  Options options = CurrentOptions();
  options.env = env_;
  Reopen(&options);
@ -1314,6 +1452,53 @@ TEST(DBTest, FilesDeletedAfterCompaction) {
  ASSERT_EQ(CountFiles(), num_files);
 }
 TEST(DBTest, BloomFilter) {
  env_->count_random_reads_ = true;
  Options options = CurrentOptions();
  options.env = env_;
  options.block_cache = NewLRUCache(0);  // Prevent cache hits
  options.filter_policy = NewBloomFilterPolicy(10);
  Reopen(&options);
  // Populate multiple layers
  const int N = 10000;
  for (int i = 0; i < N; i++) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  Compact("a", "z");
  for (int i = 0; i < N; i += 100) {
    ASSERT_OK(Put(Key(i), Key(i)));
  }
  dbfull()->TEST_CompactMemTable();
  // Prevent auto compactions triggered by seeks
  env_->delay_sstable_sync_.Release_Store(env_);
  // Lookup present keys.  Should rarely read from small sstable.
  env_->random_read_counter_.Reset();
  for (int i = 0; i < N; i++) {
    ASSERT_EQ(Key(i), Get(Key(i)));
  }
  int reads = env_->random_read_counter_.Read();
  fprintf(stderr, "%d present => %d reads\n", N, reads);
  ASSERT_GE(reads, N);
  ASSERT_LE(reads, N + 2*N/100);
  // Lookup present keys.  Should rarely read from either sstable.
  env_->random_read_counter_.Reset();
  for (int i = 0; i < N; i++) {
    ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
  }
  reads = env_->random_read_counter_.Read();
  fprintf(stderr, "%d missing => %d reads\n", N, reads);
  ASSERT_LE(reads, 3*N/100);
  env_->delay_sstable_sync_.Release_Store(NULL);
  Close();
  delete options.block_cache;
  delete options.filter_policy;
 }
 // Multi-threaded test:
 namespace {
@ -1381,6 +1566,7 @@ static void MTThreadBody(void* arg) {
 }  // namespace
 TEST(DBTest, MultiThreaded) {
  do {
    // Initialize state
    MTState mt;
    mt.test = this;
@ -1408,6 +1594,7 @@ TEST(DBTest, MultiThreaded) {
        env_->SleepForMicroseconds(100000);
      }
    }
  } while (ChangeOptions());
 }
 namespace {
@ -1573,7 +1760,8 @@ static bool CompareIterators(int step,
 TEST(DBTest, Randomized) {
  Random rnd(test::RandomSeed());
-  ModelDB model(last_options_);
+  do {
    ModelDB model(CurrentOptions());
    const int N = 10000;
    const Snapshot* model_snap = NULL;
    const Snapshot* db_snap = NULL;
@ -1582,6 +1770,7 @@ TEST(DBTest, Randomized) {
      if (step % 100 == 0) {
        fprintf(stderr, "Step %d of %d\n", step, N);
      }
      // TODO(sanjay): Test Get() works
      int p = rnd.Uniform(100);
      if (p < 45) {                               // Put
        k = RandomKey(&rnd);
@ -1637,6 +1826,7 @@ TEST(DBTest, Randomized) {
    }
    if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
    if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
  } while (ChangeOptions());
 }
 std::string MakeKey(unsigned int num) {
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@ -98,6 +98,26 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
  }
 }
 const char* InternalFilterPolicy::Name() const {
  return user_policy_->Name();
 }
 void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
                                        std::string* dst) const {
  // We rely on the fact that the code in table.cc does not mind us
  // adjusting keys[].
  Slice* mkey = const_cast<Slice*>(keys);
  for (int i = 0; i < n; i++) {
    mkey[i] = ExtractUserKey(keys[i]);
    // TODO(sanjay): Suppress dups?
  }
  user_policy_->CreateFilter(keys, n, dst);
 }
 bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
 }
 LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
  size_t usize = user_key.size();
  size_t needed = usize + 13;  // A conservative estimate
--- a/db/dbformat.h
+++ b/db/dbformat.h
@ -8,6 +8,7 @@
 #include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/db.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/slice.h"
 #include "leveldb/table_builder.h"
 #include "util/coding.h"
@ -123,6 +124,17 @@ class InternalKeyComparator : public Comparator {
  int Compare(const InternalKey& a, const InternalKey& b) const;
 };
 // Filter policy wrapper that converts from internal keys to user keys
 class InternalFilterPolicy : public FilterPolicy {
 private:
  const FilterPolicy* const user_policy_;
 public:
  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
  virtual const char* Name() const;
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
 };
 // Modules in this directory should keep internal keys wrapped inside
 // the following class instead of plain strings so that we do not
 // incorrectly use string comparisons instead of an InternalKeyComparator.
--- a/db/repair.cc
+++ b/db/repair.cc
@ -48,7 +48,8 @@ class Repairer {
      : dbname_(dbname),
        env_(options.env),
        icmp_(options.comparator),
-        options_(SanitizeOptions(dbname, &icmp_, options)),
+        ipolicy_(options.filter_policy),
        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
        owns_info_log_(options_.info_log != options.info_log),
        owns_cache_(options_.block_cache != options.block_cache),
        next_file_number_(1) {
@ -99,6 +100,7 @@ class Repairer {
  std::string const dbname_;
  Env* const env_;
  InternalKeyComparator const icmp_;
  InternalFilterPolicy const ipolicy_;
  Options const options_;
  bool owns_info_log_;
  bool owns_cache_;
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@ -42,23 +42,18 @@ TableCache::~TableCache() {
  delete cache_;
 }
-Iterator* TableCache::NewIterator(const ReadOptions& options,
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
-                                  uint64_t file_number,
+                             Cache::Handle** handle) {
-                                  uint64_t file_size,
+  Status s;
                                  Table** tableptr) {
  if (tableptr != NULL) {
    *tableptr = NULL;
  }
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
  Slice key(buf, sizeof(buf));
-  Cache::Handle* handle = cache_->Lookup(key);
+  *handle = cache_->Lookup(key);
-  if (handle == NULL) {
+  if (*handle == NULL) {
    std::string fname = TableFileName(dbname_, file_number);
    RandomAccessFile* file = NULL;
    Table* table = NULL;
-    Status s = env_->NewRandomAccessFile(fname, &file);
+    s = env_->NewRandomAccessFile(fname, &file);
    if (s.ok()) {
      s = Table::Open(*options_, file, file_size, &table);
    }
@ -68,13 +63,28 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
      delete file;
      // We do not cache error results so that if the error is transient,
      // or somebody repairs the file, we recover automatically.
-      return NewErrorIterator(s);
+    } else {
    }
      TableAndFile* tf = new TableAndFile;
      tf->file = file;
      tf->table = table;
-    handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+      *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
    }
  }
  return s;
 }
 Iterator* TableCache::NewIterator(const ReadOptions& options,
                                  uint64_t file_number,
                                  uint64_t file_size,
                                  Table** tableptr) {
  if (tableptr != NULL) {
    *tableptr = NULL;
  }
  Cache::Handle* handle = NULL;
  Status s = FindTable(file_number, file_size, &handle);
  if (!s.ok()) {
    return NewErrorIterator(s);
  }
  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
@ -86,6 +96,22 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
  return result;
 }
 Status TableCache::Get(const ReadOptions& options,
                       uint64_t file_number,
                       uint64_t file_size,
                       const Slice& k,
                       void* arg,
                       void (*saver)(void*, const Slice&, const Slice&)) {
  Cache::Handle* handle = NULL;
  Status s = FindTable(file_number, file_size, &handle);
  if (s.ok()) {
    Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
    s = t->InternalGet(options, k, arg, saver);
    cache_->Release(handle);
  }
  return s;
 }
 void TableCache::Evict(uint64_t file_number) {
  char buf[sizeof(file_number)];
  EncodeFixed64(buf, file_number);
--- a/db/table_cache.h
+++ b/db/table_cache.h
@ -35,6 +35,15 @@ class TableCache {
                        uint64_t file_size,
                        Table** tableptr = NULL);
  // If a seek to internal key "k" in specified file finds an entry,
  // call (*handle_result)(arg, found_key, found_value).
  Status Get(const ReadOptions& options,
             uint64_t file_number,
             uint64_t file_size,
             const Slice& k,
             void* arg,
             void (*handle_result)(void*, const Slice&, const Slice&));
  // Evict any entry for the specified file number
  void Evict(uint64_t file_number);
@ -43,6 +52,8 @@ class TableCache {
  const std::string dbname_;
  const Options* options_;
  Cache* cache_;
  Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
 };
 }  // namespace leveldb
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -255,35 +255,34 @@ void Version::AddIterators(const ReadOptions& options,
  }
 }
-// If "*iter" points at a value or deletion for user_key, store
+// Callback from TableCache::Get()
-// either the value, or a NotFound error and return true.
+namespace {
-// Else return false.
+enum SaverState {
-static bool GetValue(const Comparator* cmp,
+  kNotFound,
-                     Iterator* iter, const Slice& user_key,
+  kFound,
-                     std::string* value,
+  kDeleted,
-                     Status* s) {
+  kCorrupt,
-  if (!iter->Valid()) {
+};
-    return false;
+struct Saver {
-  }
+  SaverState state;
  const Comparator* ucmp;
  Slice user_key;
  std::string* value;
 };
 }
 static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
  Saver* s = reinterpret_cast<Saver*>(arg);
  ParsedInternalKey parsed_key;
-  if (!ParseInternalKey(iter->key(), &parsed_key)) {
+  if (!ParseInternalKey(ikey, &parsed_key)) {
-    *s = Status::Corruption("corrupted key for ", user_key);
+    s->state = kCorrupt;
-    return true;
+  } else {
-  }
+    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
-  if (cmp->Compare(parsed_key.user_key, user_key) != 0) {
+      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
-    return false;
+      if (s->state == kFound) {
-  }
+        s->value->assign(v.data(), v.size());
-  switch (parsed_key.type) {
+      }
    case kTypeDeletion:
      *s = Status::NotFound(Slice());  // Use an empty error message for speed
      break;
    case kTypeValue: {
      Slice v = iter->value();
      value->assign(v.data(), v.size());
      break;
    }
  }
  return true;
 }
 static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
@ -361,21 +360,27 @@ Status Version::Get(const ReadOptions& options,
      last_file_read = f;
      last_file_read_level = level;
-      Iterator* iter = vset_->table_cache_->NewIterator(
+      Saver saver;
-          options,
+      saver.state = kNotFound;
-          f->number,
+      saver.ucmp = ucmp;
-          f->file_size);
+      saver.user_key = user_key;
-      iter->Seek(ikey);
+      saver.value = value;
-      const bool done = GetValue(ucmp, iter, user_key, value, &s);
+      s = vset_->table_cache_->Get(options, f->number, f->file_size,
-      if (!iter->status().ok()) {
+                                   ikey, &saver, SaveValue);
-        s = iter->status();
+      if (!s.ok()) {
        delete iter;
        return s;
      } else {
        delete iter;
        if (done) {
        return s;
      }
      switch (saver.state) {
        case kNotFound:
          break;      // Keep searching in other files
        case kFound:
          return s;
        case kDeleted:
          s = Status::NotFound(Slice());  // Use empty error message for speed
          return s;
        case kCorrupt:
          s = Status::Corruption("corrupted key for ", user_key);
          return s;
      }
    }
  }
--- a/doc/index.html
+++ b/doc/index.html
@ -400,6 +400,69 @@ We might want to prefix <code>filename</code> keys with one letter (say '/') and
 over just the metadata do not force us to fetch and cache bulky file
 contents.
 <p>
 <h2>Filters</h2>
 <p>
 Because of the way <code>leveldb</code> data is organized on disk,
 a single <code>Get()</code> call may involve multiple reads from disk.
 The optional <code>FilterPolicy</code> mechanism can be used to reduce
 the number of disk reads substantially.
 <pre>
   leveldb::Options options;
   options.filter_policy = NewBloomFilter(10);
   leveldb::DB* db;
   leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
   ... use the database ...
   delete db;
   delete options.filter_policy;
 </pre>
 The preceding code associates a
 <a href="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</a>
 based filtering policy with the database.  Bloom filter based
 filtering relies on keeping some number of bits of data in memory per
 key (in this case 10 bits per key since that is the argument we passed
 to NewBloomFilter).  This filter will reduce the number of unnecessary
 disk reads needed for <code>Get()</code> calls by a factor of
 approximately a 100.  Increasing the bits per key will lead to a
 larger reduction at the cost of more memory usage.  We recommend that
 applications whose working set does not fit in memory and that do a
 lot of random reads set a filter policy.
 <p>
 If you are using a custom comparator, you should ensure that the filter
 policy you are using is compatible with your comparator.  For example,
 consider a comparator that ignores trailing spaces when comparing keys.
 <code>NewBloomFilter</code> must not be used with such a comparator.
 Instead, the application should provide a custom filter policy that
 also ignores trailing spaces.  For example:
 <pre>
  class CustomFilterPolicy : public leveldb::FilterPolicy {
   private:
    FilterPolicy* builtin_policy_;
   public:
    CustomFilterPolicy() : builtin_policy_(NewBloomFilter(10)) { }
    ~CustomFilterPolicy() { delete builtin_policy_; }
    const char* Name() const { return "IgnoreTrailingSpacesFilter"; }
    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
      // Use builtin bloom filter code after removing trailing spaces
      std::vector&lt;Slice&gt; trimmed(n);
      for (int i = 0; i &lt; n; i++) {
        trimmed[i] = RemoveTrailingSpaces(keys[i]);
      }
      return builtin_policy_-&gt;CreateFilter(&amp;trimmed[i], n, dst);
    }
    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
      // Use builtin bloom filter code after removing trailing spaces
      return builtin_policy_-&gt;KeyMayMatch(RemoveTrailingSpaces(key), filter);
    }
  };
 </pre>
 <p>
 Advanced applications may provide a filter policy that does not use
 a bloom filter but uses some other mechanism for summarizing a set
 of keys.  See <code>leveldb/filter_policy.h</code> for detail.
 <p>
 <h1>Checksums</h1>
 <p>
 <code>leveldb</code> associates checksums with all data it stores in the file system.
--- a/doc/table_format.txt
+++ b/doc/table_format.txt
@ -47,6 +47,47 @@ the BlockHandle of the metaindex and index blocks as well as a magic number.
       			 	       // (40==2*BlockHandle::kMaxEncodedLength)
       magic:		       fixed64;    // == 0xdb4775248b80fb57
 "filter" Meta Block
 -------------------
 If a "FilterPolicy" was specified when the database was opened, a
 filter block is stored in each table.  The "metaindex" block contains
 an entry that maps from "filter.<N>" to the BlockHandle for the filter
 block where "<N>" is the string returned by the filter policy's
 "Name()" method.
 The filter block stores a sequence of filters, where filter i contains
 the output of FilterPolicy::CreateFilter() on all keys that are stored
 in a block whose file offset falls within the range
    [ i*base ... (i+1)*base-1 ]
 Currently, "base" is 2KB.  So for example, if blocks X and Y start in
 the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
 converted to a filter by calling FilterPolicy::CreateFilter(), and the
 resulting filter will be stored as the first filter in the filter
 block.
 The filter block is formatted as follows:
     [filter 0]
     [filter 1]
     [filter 2]
     ...
     [filter N-1]
     [offset of filter 0]                  : 4 bytes
     [offset of filter 1]                  : 4 bytes
     [offset of filter 2]                  : 4 bytes
     ...
     [offset of filter N-1]                : 4 bytes
     [offset of beginning of offset array] : 4 bytes
     lg(base)                              : 1 byte
 The offset array at the end of the filter block allows efficient
 mapping from a data block offset to the corresponding filter.
 "stats" Meta Block
 ------------------
--- a/include/leveldb/c.h
+++ b/include/leveldb/c.h
@ -55,6 +55,7 @@ typedef struct leveldb_cache_t         leveldb_cache_t;
 typedef struct leveldb_comparator_t    leveldb_comparator_t;
 typedef struct leveldb_env_t           leveldb_env_t;
 typedef struct leveldb_filelock_t      leveldb_filelock_t;
 typedef struct leveldb_filterpolicy_t  leveldb_filterpolicy_t;
 typedef struct leveldb_iterator_t      leveldb_iterator_t;
 typedef struct leveldb_logger_t        leveldb_logger_t;
 typedef struct leveldb_options_t       leveldb_options_t;
@ -127,6 +128,11 @@ extern void leveldb_approximate_sizes(
    const char* const* range_limit_key, const size_t* range_limit_key_len,
    uint64_t* sizes);
 extern void leveldb_compact_range(
    leveldb_t* db,
    const char* start_key, size_t start_key_len,
    const char* limit_key, size_t limit_key_len);
 /* Management operations */
 extern void leveldb_destroy_db(
@ -177,6 +183,9 @@ extern void leveldb_options_destroy(leveldb_options_t*);
 extern void leveldb_options_set_comparator(
    leveldb_options_t*,
    leveldb_comparator_t*);
 extern void leveldb_options_set_filter_policy(
    leveldb_options_t*,
    leveldb_filterpolicy_t*);
 extern void leveldb_options_set_create_if_missing(
    leveldb_options_t*, unsigned char);
 extern void leveldb_options_set_error_if_exists(
@ -209,6 +218,26 @@ extern leveldb_comparator_t* leveldb_comparator_create(
    const char* (*name)(void*));
 extern void leveldb_comparator_destroy(leveldb_comparator_t*);
 /* Filter policy */
 extern leveldb_filterpolicy_t* leveldb_filterpolicy_create(
    void* state,
    void (*destructor)(void*),
    char* (*create_filter)(
        void*,
        const char* const* key_array, const size_t* key_length_array,
        int num_keys,
        size_t* filter_length),
    unsigned char (*key_may_match)(
        void*,
        const char* key, size_t length,
        const char* filter, size_t filter_length),
    const char* (*name)(void*));
 extern void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t*);
 extern leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(
    int bits_per_key);
 /* Read options */
 extern leveldb_readoptions_t* leveldb_readoptions_create();
--- a/include/leveldb/db.h
+++ b/include/leveldb/db.h
@ -14,7 +14,7 @@ namespace leveldb {
 // Update Makefile if you change these
 static const int kMajorVersion = 1;
-static const int kMinorVersion = 3;
+static const int kMinorVersion = 4;
 struct Options;
 struct ReadOptions;
--- a/include/leveldb/filter_policy.h
+++ b/include/leveldb/filter_policy.h
@ -0,0 +1,70 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // A database can be configured with a custom FilterPolicy object.
 // This object is responsible for creating a small filter from a set
 // of keys.  These filters are stored in leveldb and are consulted
 // automatically by leveldb to decide whether or not to read some
 // information from disk. In many cases, a filter can cut down the
 // number of disk seeks form a handful to a single disk seek per
 // DB::Get() call.
 //
 // Most people will want to use the builtin bloom filter support (see
 // NewBloomFilterPolicy() below).
 #ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
 #define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
 #include <string>
 namespace leveldb {
 class Slice;
 class FilterPolicy {
 public:
  virtual ~FilterPolicy();
  // Return the name of this policy.  Note that if the filter encoding
  // changes in an incompatible way, the name returned by this method
  // must be changed.  Otherwise, old incompatible filters may be
  // passed to methods of this type.
  virtual const char* Name() const = 0;
  // keys[0,n-1] contains a list of keys (potentially with duplicates)
  // that are ordered according to the user supplied comparator.
  // Append a filter that summarizes keys[0,n-1] to *dst.
  //
  // Warning: do not change the initial contents of *dst.  Instead,
  // append the newly constructed filter to *dst.
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst)
      const = 0;
  // "filter" contains the data appended by a preceding call to
  // CreateFilter() on this class.  This method must return true if
  // the key was in the list of keys passed to CreateFilter().
  // This method may return true or false if the key was not on the
  // list, but it should aim to return false with a high probability.
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
 };
 // Return a new filter policy that uses a bloom filter with approximately
 // the specified number of bits per key.  A good value for bits_per_key
 // is 10, which yields a filter with ~ 1% false positive rate.
 //
 // Callers must delete the result after any database that is using the
 // result has been closed.
 //
 // Note: if you are using a custom comparator that ignores some parts
 // of the keys being compared, you must not use NewBloomFilterPolicy()
 // and must provide your own FilterPolicy that also ignores the
 // corresponding parts of the keys.  For example, if the comparator
 // ignores trailing spaces, it would be incorrect to use a
 // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
 // trailing spaces in keys.
 extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key);
 }
 #endif  // STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_
--- a/include/leveldb/options.h
+++ b/include/leveldb/options.h
@ -12,6 +12,7 @@ namespace leveldb {
 class Cache;
 class Comparator;
 class Env;
 class FilterPolicy;
 class Logger;
 class Snapshot;
@ -127,6 +128,13 @@ struct Options {
  // efficiently detect that and will switch to uncompressed mode.
  CompressionType compression;
  // If non-NULL, use the specified filter policy to reduce disk reads.
  // Many applications will benefit from passing the result of
  // NewBloomFilterPolicy() here.
  //
  // Default: NULL
  const FilterPolicy* filter_policy;
  // Create an Options object with default values for all fields.
  Options();
 };
--- a/include/leveldb/table.h
+++ b/include/leveldb/table.h
@ -12,9 +12,11 @@ namespace leveldb {
 class Block;
 class BlockHandle;
 class Footer;
 struct Options;
 class RandomAccessFile;
 struct ReadOptions;
 class TableCache;
 // A Table is a sorted map from strings to strings.  Tables are
 // immutable and persistent.  A Table may be safely accessed from
@ -60,6 +62,19 @@ class Table {
  explicit Table(Rep* rep) { rep_ = rep; }
  static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
  // Calls (*handle_result)(arg, ...) with the entry found after a call
  // to Seek(key).  May not make such a call if filter policy says
  // that key is not present.
  friend class TableCache;
  Status InternalGet(
      const ReadOptions&, const Slice& key,
      void* arg,
      void (*handle_result)(void* arg, const Slice& k, const Slice& v));
  void ReadMeta(const Footer& footer);
  void ReadFilter(const Slice& filter_handle_value);
  // No copying allowed
  Table(const Table&);
  void operator=(const Table&);
--- a/include/leveldb/table_builder.h
+++ b/include/leveldb/table_builder.h
@ -77,6 +77,7 @@ class TableBuilder {
 private:
  bool ok() const { return status().ok(); }
  void WriteBlock(BlockBuilder* block, BlockHandle* handle);
  void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle);
  struct Rep;
  Rep* rep_;
--- a/port/port_android.h
+++ b/port/port_android.h
@ -78,6 +78,9 @@ class CondVar {
 // On ARM chipsets <V6, 0xffff0fa0 is the hard coded address of a 
 // memory barrier function provided by the kernel.
 typedef void (*LinuxKernelMemoryBarrierFunc)(void);
 // TODO(user): ATTRIBUTE_WEAK is undefined, so this fails to build on
 // non-ARMV6_OR_7. We may be able to replace it with __attribute__((weak)) for
 // older ARM builds, but x86 builds will require a different memory barrier.
 LinuxKernelMemoryBarrierFunc pLinuxKernelMemoryBarrier ATTRIBUTE_WEAK =
    (LinuxKernelMemoryBarrierFunc) 0xffff0fa0;
 #endif
--- a/table/block.cc
+++ b/table/block.cc
@ -9,6 +9,7 @@
 #include <vector>
 #include <algorithm>
 #include "leveldb/comparator.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"
@ -19,10 +20,10 @@ inline uint32_t Block::NumRestarts() const {
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
 }
-Block::Block(const char* data, size_t size, bool take_ownership)
+Block::Block(const BlockContents& contents)
-    : data_(data),
+    : data_(contents.data.data()),
-      size_(size),
+      size_(contents.data.size()),
-      owned_(take_ownership) {
+      owned_(contents.heap_allocated) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
--- a/table/block.h
+++ b/table/block.h
@ -11,14 +11,13 @@
 namespace leveldb {
 struct BlockContents;
 class Comparator;
 class Block {
 public:
  // Initialize the block with the specified contents.
-  // Takes ownership of data[] and will delete[] it when done iff
+  explicit Block(const BlockContents& contents);
  // "take_ownership is true.
  Block(const char* data, size_t size, bool take_ownership);
  ~Block();
--- a/table/filter_block.cc
+++ b/table/filter_block.cc
@ -0,0 +1,111 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/filter_block.h"
 #include "leveldb/filter_policy.h"
 #include "util/coding.h"
 namespace leveldb {
 // See doc/table_format.txt for an explanation of the filter block format.
 // Generate new filter every 2KB of data
 static const size_t kFilterBaseLg = 11;
 static const size_t kFilterBase = 1 << kFilterBaseLg;
 FilterBlockBuilder::FilterBlockBuilder(const FilterPolicy* policy)
    : policy_(policy) {
 }
 void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
  uint64_t filter_index = (block_offset / kFilterBase);
  assert(filter_index >= filter_offsets_.size());
  while (filter_index > filter_offsets_.size()) {
    GenerateFilter();
  }
 }
 void FilterBlockBuilder::AddKey(const Slice& key) {
  Slice k = key;
  start_.push_back(keys_.size());
  keys_.append(k.data(), k.size());
 }
 Slice FilterBlockBuilder::Finish() {
  if (!start_.empty()) {
    GenerateFilter();
  }
  // Append array of per-filter offsets
  const uint32_t array_offset = result_.size();
  for (size_t i = 0; i < filter_offsets_.size(); i++) {
    PutFixed32(&result_, filter_offsets_[i]);
  }
  PutFixed32(&result_, array_offset);
  result_.push_back(kFilterBaseLg);  // Save encoding parameter in result
  return Slice(result_);
 }
 void FilterBlockBuilder::GenerateFilter() {
  const size_t num_keys = start_.size();
  if (num_keys == 0) {
    // Fast path if there are no keys for this filter
    filter_offsets_.push_back(result_.size());
    return;
  }
  // Make list of keys from flattened key structure
  start_.push_back(keys_.size());  // Simplify length computation
  tmp_keys_.resize(num_keys);
  for (size_t i = 0; i < num_keys; i++) {
    const char* base = keys_.data() + start_[i];
    size_t length = start_[i+1] - start_[i];
    tmp_keys_[i] = Slice(base, length);
  }
  // Generate filter for current set of keys and append to result_.
  filter_offsets_.push_back(result_.size());
  policy_->CreateFilter(&tmp_keys_[0], num_keys, &result_);
  tmp_keys_.clear();
  keys_.clear();
  start_.clear();
 }
 FilterBlockReader::FilterBlockReader(const FilterPolicy* policy,
                                     const Slice& contents)
    : policy_(policy),
      data_(NULL),
      offset_(NULL),
      num_(0),
      base_lg_(0) {
  size_t n = contents.size();
  if (n < 5) return;  // 1 byte for base_lg_ and 4 for start of offset array
  base_lg_ = contents[n-1];
  uint32_t last_word = DecodeFixed32(contents.data() + n - 5);
  if (last_word > n - 5) return;
  data_ = contents.data();
  offset_ = data_ + last_word;
  num_ = (n - 5 - last_word) / 4;
 }
 bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
  uint64_t index = block_offset >> base_lg_;
  if (index < num_) {
    uint32_t start = DecodeFixed32(offset_ + index*4);
    uint32_t limit = DecodeFixed32(offset_ + index*4 + 4);
    if (start <= limit && limit <= (offset_ - data_)) {
      Slice filter = Slice(data_ + start, limit - start);
      return policy_->KeyMayMatch(key, filter);
    } else if (start == limit) {
      // Empty filters do not match any keys
      return false;
    }
  }
  return true;  // Errors are treated as potential matches
 }
 }
--- a/table/filter_block.h
+++ b/table/filter_block.h
@ -0,0 +1,68 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 //
 // A filter block is stored near the end of a Table file.  It contains
 // filters (e.g., bloom filters) for all data blocks in the table combined
 // into a single filter block.
 #ifndef STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
 #define STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
 #include <vector>
 #include "leveldb/slice.h"
 #include "util/hash.h"
 namespace leveldb {
 class FilterPolicy;
 // A FilterBlockBuilder is used to construct all of the filters for a
 // particular Table.  It generates a single string which is stored as
 // a special block in the Table.
 //
 // The sequence of calls to FilterBlockBuilder must match the regexp:
 //      (StartBlock AddKey*)* Finish
 class FilterBlockBuilder {
 public:
  explicit FilterBlockBuilder(const FilterPolicy*);
  void StartBlock(uint64_t block_offset);
  void AddKey(const Slice& key);
  Slice Finish();
 private:
  void GenerateFilter();
  const FilterPolicy* policy_;
  std::string keys_;              // Flattened key contents
  std::vector<size_t> start_;     // Starting index in keys_ of each key
  std::string result_;            // Filter data computed so far
  std::vector<Slice> tmp_keys_;   // policy_->CreateFilter() argument
  std::vector<uint32_t> filter_offsets_;
  // No copying allowed
  FilterBlockBuilder(const FilterBlockBuilder&);
  void operator=(const FilterBlockBuilder&);
 };
 class FilterBlockReader {
 public:
 // REQUIRES: "contents" and *policy must stay live while *this is live.
  FilterBlockReader(const FilterPolicy* policy, const Slice& contents);
  bool KeyMayMatch(uint64_t block_offset, const Slice& key);
 private:
  const FilterPolicy* policy_;
  const char* data_;    // Pointer to filter data (at block-start)
  const char* offset_;  // Pointer to beginning of offset array (at block-end)
  size_t num_;          // Number of entries in offset array
  size_t base_lg_;      // Encoding parameter (see kFilterBaseLg in .cc file)
 };
 }
 #endif  // STORAGE_LEVELDB_TABLE_FILTER_BLOCK_H_
--- a/table/filter_block_test.cc
+++ b/table/filter_block_test.cc
@ -0,0 +1,128 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/filter_block.h"
 #include "leveldb/filter_policy.h"
 #include "util/coding.h"
 #include "util/hash.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 namespace leveldb {
 // For testing: emit an array with one hash value per key
 class TestHashFilter : public FilterPolicy {
 public:
  virtual const char* Name() const {
    return "TestHashFilter";
  }
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    for (int i = 0; i < n; i++) {
      uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
      PutFixed32(dst, h);
    }
  }
  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
    uint32_t h = Hash(key.data(), key.size(), 1);
    for (int i = 0; i + 4 <= filter.size(); i += 4) {
      if (h == DecodeFixed32(filter.data() + i)) {
        return true;
      }
    }
    return false;
  }
 };
 class FilterBlockTest {
 public:
  TestHashFilter policy_;
 };
 TEST(FilterBlockTest, EmptyBuilder) {
  FilterBlockBuilder builder(&policy_);
  Slice block = builder.Finish();
  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block));
  FilterBlockReader reader(&policy_, block);
  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
  ASSERT_TRUE(reader.KeyMayMatch(100000, "foo"));
 }
 TEST(FilterBlockTest, SingleChunk) {
  FilterBlockBuilder builder(&policy_);
  builder.StartBlock(100);
  builder.AddKey("foo");
  builder.AddKey("bar");
  builder.AddKey("box");
  builder.StartBlock(200);
  builder.AddKey("box");
  builder.StartBlock(300);
  builder.AddKey("hello");
  Slice block = builder.Finish();
  FilterBlockReader reader(&policy_, block);
  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
  ASSERT_TRUE(reader.KeyMayMatch(100, "bar"));
  ASSERT_TRUE(reader.KeyMayMatch(100, "box"));
  ASSERT_TRUE(reader.KeyMayMatch(100, "hello"));
  ASSERT_TRUE(reader.KeyMayMatch(100, "foo"));
  ASSERT_TRUE(! reader.KeyMayMatch(100, "missing"));
  ASSERT_TRUE(! reader.KeyMayMatch(100, "other"));
 }
 TEST(FilterBlockTest, MultiChunk) {
  FilterBlockBuilder builder(&policy_);
  // First filter
  builder.StartBlock(0);
  builder.AddKey("foo");
  builder.StartBlock(2000);
  builder.AddKey("bar");
  // Second filter
  builder.StartBlock(3100);
  builder.AddKey("box");
  // Third filter is empty
  // Last filter
  builder.StartBlock(9000);
  builder.AddKey("box");
  builder.AddKey("hello");
  Slice block = builder.Finish();
  FilterBlockReader reader(&policy_, block);
  // Check first filter
  ASSERT_TRUE(reader.KeyMayMatch(0, "foo"));
  ASSERT_TRUE(reader.KeyMayMatch(2000, "bar"));
  ASSERT_TRUE(! reader.KeyMayMatch(0, "box"));
  ASSERT_TRUE(! reader.KeyMayMatch(0, "hello"));
  // Check second filter
  ASSERT_TRUE(reader.KeyMayMatch(3100, "box"));
  ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo"));
  ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar"));
  ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello"));
  // Check third filter (empty)
  ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo"));
  ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar"));
  ASSERT_TRUE(! reader.KeyMayMatch(4100, "box"));
  ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello"));
  // Check last filter
  ASSERT_TRUE(reader.KeyMayMatch(9000, "box"));
  ASSERT_TRUE(reader.KeyMayMatch(9000, "hello"));
  ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo"));
  ASSERT_TRUE(! reader.KeyMayMatch(9000, "bar"));
 }
 }  // namespace leveldb
 int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
 }
--- a/table/format.cc
+++ b/table/format.cc
@ -66,10 +66,10 @@ Status Footer::DecodeFrom(Slice* input) {
 Status ReadBlock(RandomAccessFile* file,
                 const ReadOptions& options,
                 const BlockHandle& handle,
-                 Block** block,
+                 BlockContents* result) {
-                 bool* may_cache) {
+  result->data = Slice();
-  *block = NULL;
+  result->cachable = false;
-  *may_cache = false;
+  result->heap_allocated = false;
  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
@ -105,11 +105,13 @@ Status ReadBlock(RandomAccessFile* file,
        // Use it directly under the assumption that it will be live
        // while the file is open.
        delete[] buf;
-        *block = new Block(data, n, false /* do not take ownership */);
+        result->data = Slice(data, n);
-        *may_cache = false;  // Do not double-cache
+        result->heap_allocated = false;
        result->cachable = false;  // Do not double-cache
      } else {
-        *block = new Block(buf, n, true /* take ownership */);
+        result->data = Slice(buf, n);
-        *may_cache = true;
+        result->heap_allocated = true;
        result->cachable = true;
      }
      // Ok
@ -127,8 +129,9 @@ Status ReadBlock(RandomAccessFile* file,
        return Status::Corruption("corrupted compressed block contents");
      }
      delete[] buf;
-      *block = new Block(ubuf, ulength, true /* take ownership */);
+      result->data = Slice(ubuf, ulength);
-      *may_cache = true;
+      result->heap_allocated = true;
      result->cachable = true;
      break;
    }
    default:
--- a/table/format.h
+++ b/table/format.h
@ -83,16 +83,18 @@ static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
 // 1-byte type + 32-bit crc
 static const size_t kBlockTrailerSize = 5;
-// Read the block identified by "handle" from "file".  On success,
+struct BlockContents {
-// store a pointer to the heap-allocated result in *block and return
+  Slice data;           // Actual contents of data
-// OK.  On failure store NULL in *block and return non-OK.
+  bool cachable;        // True iff data can be cached
-// On success, stores true in *may_cache if the result may be
+  bool heap_allocated;  // True iff caller should delete[] data.data()
-// cached, false if it must not be cached.
+};
 // Read the block identified by "handle" from "file".  On failure
 // return non-OK.  On success fill *result and return OK.
 extern Status ReadBlock(RandomAccessFile* file,
                        const ReadOptions& options,
                        const BlockHandle& handle,
-                        Block** block,
+                        BlockContents* result);
                        bool* may_cache);
 // Implementation details follow.  Clients should ignore,
--- a/table/table.cc
+++ b/table/table.cc
@ -5,8 +5,12 @@
 #include "leveldb/table.h"
 #include "leveldb/cache.h"
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/options.h"
 #include "table/block.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "table/two_level_iterator.h"
 #include "util/coding.h"
@ -15,6 +19,8 @@ namespace leveldb {
 struct Table::Rep {
  ~Rep() {
    delete filter;
    delete [] filter_data;
    delete index_block;
  }
@ -22,6 +28,8 @@ struct Table::Rep {
  Status status;
  RandomAccessFile* file;
  uint64_t cache_id;
  FilterBlockReader* filter;
  const char* filter_data;
  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  Block* index_block;
@ -47,11 +55,13 @@ Status Table::Open(const Options& options,
  if (!s.ok()) return s;
  // Read the index block
  BlockContents contents;
  Block* index_block = NULL;
  if (s.ok()) {
-    bool may_cache;  // Ignored result
+    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents);
-    s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block,
+    if (s.ok()) {
-                  &may_cache);
+      index_block = new Block(contents);
    }
  }
  if (s.ok()) {
@ -63,7 +73,10 @@ Status Table::Open(const Options& options,
    rep->metaindex_handle = footer.metaindex_handle();
    rep->index_block = index_block;
    rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
    rep->filter_data = NULL;
    rep->filter = NULL;
    *table = new Table(rep);
    (*table)->ReadMeta(footer);
  } else {
    if (index_block) delete index_block;
  }
@ -71,6 +84,52 @@ Status Table::Open(const Options& options,
  return s;
 }
 void Table::ReadMeta(const Footer& footer) {
  if (rep_->options.filter_policy == NULL) {
    return;  // Do not need any metadata
  }
  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  ReadOptions opt;
  BlockContents contents;
  if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
    // Do not propagate errors since meta info is not needed for operation
    return;
  }
  Block* meta = new Block(contents);
  Iterator* iter = meta->NewIterator(BytewiseComparator());
  std::string key = "filter.";
  key.append(rep_->options.filter_policy->Name());
  iter->Seek(key);
  if (iter->Valid() && iter->key() == Slice(key)) {
    ReadFilter(iter->value());
  }
  delete iter;
  delete meta;
 }
 void Table::ReadFilter(const Slice& filter_handle_value) {
  Slice v = filter_handle_value;
  BlockHandle filter_handle;
  if (!filter_handle.DecodeFrom(&v).ok()) {
    return;
  }
  // We might want to unify with ReadBlock() if we start
  // requiring checksum verification in Table::Open.
  ReadOptions opt;
  BlockContents block;
  if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
    return;
  }
  if (block.heap_allocated) {
    rep_->filter_data = block.data.data();     // Will need to delete later
  }
  rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
 }
 Table::~Table() {
  delete rep_;
 }
@ -107,7 +166,7 @@ Iterator* Table::BlockReader(void* arg,
  // can add more features in the future.
  if (s.ok()) {
-    bool may_cache;
+    BlockContents contents;
    if (block_cache != NULL) {
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
@ -117,14 +176,20 @@ Iterator* Table::BlockReader(void* arg,
      if (cache_handle != NULL) {
        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
      } else {
-        s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
+        s = ReadBlock(table->rep_->file, options, handle, &contents);
-        if (s.ok() && may_cache && options.fill_cache) {
+        if (s.ok()) {
          block = new Block(contents);
          if (contents.cachable && options.fill_cache) {
            cache_handle = block_cache->Insert(
                key, block, block->size(), &DeleteCachedBlock);
          }
        }
      }
    } else {
-      s = ReadBlock(table->rep_->file, options, handle, &block, &may_cache);
+      s = ReadBlock(table->rep_->file, options, handle, &contents);
      if (s.ok()) {
        block = new Block(contents);
      }
    }
  }
@ -148,6 +213,39 @@ Iterator* Table::NewIterator(const ReadOptions& options) const {
      &Table::BlockReader, const_cast<Table*>(this), options);
 }
 Status Table::InternalGet(const ReadOptions& options, const Slice& k,
                          void* arg,
                          void (*saver)(void*, const Slice&, const Slice&)) {
  Status s;
  Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
  iiter->Seek(k);
  if (iiter->Valid()) {
    Slice handle_value = iiter->value();
    FilterBlockReader* filter = rep_->filter;
    BlockHandle handle;
    if (filter != NULL &&
        handle.DecodeFrom(&handle_value).ok() &&
        !filter->KeyMayMatch(handle.offset(), k)) {
      // Not found
    } else {
      Slice handle = iiter->value();
      Iterator* block_iter = BlockReader(this, options, iiter->value());
      block_iter->Seek(k);
      if (block_iter->Valid()) {
        (*saver)(arg, block_iter->key(), block_iter->value());
      }
      s = block_iter->status();
      delete block_iter;
    }
  }
  if (s.ok()) {
    s = iiter->status();
  }
  delete iiter;
  return s;
 }
 uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
--- a/table/table_builder.cc
+++ b/table/table_builder.cc
@ -5,14 +5,15 @@
 #include "leveldb/table_builder.h"
 #include <assert.h>
 #include <stdio.h>
 #include "leveldb/comparator.h"
 #include "leveldb/env.h"
 #include "leveldb/filter_policy.h"
 #include "leveldb/options.h"
 #include "table/block_builder.h"
 #include "table/filter_block.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/logging.h"
 namespace leveldb {
@ -27,6 +28,7 @@ struct TableBuilder::Rep {
  std::string last_key;
  int64_t num_entries;
  bool closed;          // Either Finish() or Abandon() has been called.
  FilterBlockBuilder* filter_block;
  // We do not emit the index entry for a block until we have seen the
  // first key for the next data block.  This allows us to use shorter
@ -51,6 +53,8 @@ struct TableBuilder::Rep {
        index_block(&index_block_options),
        num_entries(0),
        closed(false),
        filter_block(opt.filter_policy == NULL ? NULL
                     : new FilterBlockBuilder(opt.filter_policy)),
        pending_index_entry(false) {
    index_block_options.block_restart_interval = 1;
  }
@ -58,10 +62,14 @@ struct TableBuilder::Rep {
 TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
  if (rep_->filter_block != NULL) {
    rep_->filter_block->StartBlock(0);
  }
 }
 TableBuilder::~TableBuilder() {
  assert(rep_->closed);  // Catch errors where caller forgot to call Finish()
  delete rep_->filter_block;
  delete rep_;
 }
@ -98,6 +106,10 @@ void TableBuilder::Add(const Slice& key, const Slice& value) {
    r->pending_index_entry = false;
  }
  if (r->filter_block != NULL) {
    r->filter_block->AddKey(key);
  }
  r->last_key.assign(key.data(), key.size());
  r->num_entries++;
  r->data_block.Add(key, value);
@ -119,6 +131,9 @@ void TableBuilder::Flush() {
    r->pending_index_entry = true;
    r->status = r->file->Flush();
  }
  if (r->filter_block != NULL) {
    r->filter_block->StartBlock(r->offset);
  }
 }
 void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
@ -152,6 +167,15 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
      break;
    }
  }
  WriteRawBlock(block_contents, type, handle);
  r->compressed_output.clear();
  block->Reset();
 }
 void TableBuilder::WriteRawBlock(const Slice& block_contents,
                                 CompressionType type,
                                 BlockHandle* handle) {
  Rep* r = rep_;
  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  r->status = r->file->Append(block_contents);
@ -166,8 +190,6 @@ void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
  r->compressed_output.clear();
  block->Reset();
 }
 Status TableBuilder::status() const {
@ -179,13 +201,32 @@ Status TableBuilder::Finish() {
  Flush();
  assert(!r->closed);
  r->closed = true;
-  BlockHandle metaindex_block_handle;
+
-  BlockHandle index_block_handle;
+  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
  // Write filter block
  if (ok() && r->filter_block != NULL) {
    WriteRawBlock(r->filter_block->Finish(), kNoCompression,
                  &filter_block_handle);
  }
  // Write metaindex block
  if (ok()) {
    BlockBuilder meta_index_block(&r->options);
    if (r->filter_block != NULL) {
      // Add mapping from "filter.Name" to location of filter data
      std::string key = "filter.";
      key.append(r->options.filter_policy->Name());
      std::string handle_encoding;
      filter_block_handle.EncodeTo(&handle_encoding);
      meta_index_block.Add(key, handle_encoding);
    }
    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle);
  }
  // Write index block
  if (ok()) {
    if (r->pending_index_entry) {
      r->options.comparator->FindShortSuccessor(&r->last_key);
@ -196,6 +237,8 @@ Status TableBuilder::Finish() {
    }
    WriteBlock(&r->index_block, &index_block_handle);
  }
  // Write footer
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle);
--- a/table/table_test.cc
+++ b/table/table_test.cc
@ -168,8 +168,6 @@ class Constructor {
  // Construct the data structure from the data in "data"
  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
  virtual size_t NumBytes() const = 0;
  virtual Iterator* NewIterator() const = 0;
  virtual const KVMap& data() { return data_; }
@ -185,7 +183,6 @@ class BlockConstructor: public Constructor {
  explicit BlockConstructor(const Comparator* cmp)
      : Constructor(cmp),
        comparator_(cmp),
        block_size_(-1),
        block_(NULL) { }
  ~BlockConstructor() {
    delete block_;
@ -201,22 +198,21 @@ class BlockConstructor: public Constructor {
      builder.Add(it->first, it->second);
    }
    // Open the block
-    Slice block_data = builder.Finish();
+    data_ = builder.Finish().ToString();
-    block_size_ = block_data.size();
+    BlockContents contents;
-    char* block_data_copy = new char[block_size_];
+    contents.data = data_;
-    memcpy(block_data_copy, block_data.data(), block_size_);
+    contents.cachable = false;
-    block_ = new Block(block_data_copy, block_size_, true /* take ownership */);
+    contents.heap_allocated = false;
    block_ = new Block(contents);
    return Status::OK();
  }
  virtual size_t NumBytes() const { return block_size_; }
  virtual Iterator* NewIterator() const {
    return block_->NewIterator(comparator_);
  }
 private:
  const Comparator* comparator_;
-  int block_size_;
+  std::string data_;
  Block* block_;
  BlockConstructor();
@ -253,7 +249,6 @@ class TableConstructor: public Constructor {
    table_options.comparator = options.comparator;
    return Table::Open(table_options, source_, sink.contents().size(), &table_);
  }
  virtual size_t NumBytes() const { return source_->Size(); }
  virtual Iterator* NewIterator() const {
    return table_->NewIterator(ReadOptions());
@ -342,10 +337,6 @@ class MemTableConstructor: public Constructor {
    }
    return Status::OK();
  }
  virtual size_t NumBytes() const {
    return memtable_->ApproximateMemoryUsage();
  }
  virtual Iterator* NewIterator() const {
    return new KeyConvertingIterator(memtable_->NewIterator());
  }
@ -379,13 +370,6 @@ class DBConstructor: public Constructor {
    }
    return Status::OK();
  }
  virtual size_t NumBytes() const {
    Range r("", "\xff\xff");
    uint64_t size;
    db_->GetApproximateSizes(&r, 1, &size);
    return size;
  }
  virtual Iterator* NewIterator() const {
    return db_->NewIterator(ReadOptions());
  }
@ -809,7 +793,7 @@ TEST(TableTest, ApproximateOffsetOfPlain) {
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"),  210000, 211000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"),  510000, 511000));
  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"),  510000, 511000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 611000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"),  610000, 612000));
 }
--- a/util/bloom.cc
+++ b/util/bloom.cc
@ -0,0 +1,95 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "leveldb/filter_policy.h"
 #include "leveldb/slice.h"
 #include "util/hash.h"
 namespace leveldb {
 namespace {
 static uint32_t BloomHash(const Slice& key) {
  return Hash(key.data(), key.size(), 0xbc9f1d34);
 }
 class BloomFilterPolicy : public FilterPolicy {
 private:
  size_t bits_per_key_;
  size_t k_;
 public:
  explicit BloomFilterPolicy(int bits_per_key)
      : bits_per_key_(bits_per_key) {
    // We intentionally round down to reduce probing cost a little bit
    k_ = static_cast<size_t>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
    if (k_ < 1) k_ = 1;
    if (k_ > 30) k_ = 30;
  }
  virtual const char* Name() const {
    return "leveldb.BuiltinBloomFilter";
  }
  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
    // Compute bloom filter size (in both bits and bytes)
    size_t bits = n * bits_per_key_;
    // For small n, we can see a very high false positive rate.  Fix it
    // by enforcing a minimum bloom filter length.
    if (bits < 64) bits = 64;
    size_t bytes = (bits + 7) / 8;
    bits = bytes * 8;
    const size_t init_size = dst->size();
    dst->resize(init_size + bytes, 0);
    dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
    char* array = &(*dst)[init_size];
    for (size_t i = 0; i < n; i++) {
      // Use double-hashing to generate a sequence of hash values.
      // See analysis in [Kirsch,Mitzenmacher 2006].
      uint32_t h = BloomHash(keys[i]);
      const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
      for (size_t j = 0; j < k_; j++) {
        const uint32_t bitpos = h % bits;
        array[bitpos/8] |= (1 << (bitpos % 8));
        h += delta;
      }
    }
  }
  virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const {
    const size_t len = bloom_filter.size();
    if (len < 2) return false;
    const char* array = bloom_filter.data();
    const size_t bits = (len - 1) * 8;
    // Use the encoded k so that we can read filters generated by
    // bloom filters created using different parameters.
    const size_t k = array[len-1];
    if (k > 30) {
      // Reserved for potentially new encodings for short bloom filters.
      // Consider it a match.
      return true;
    }
    uint32_t h = BloomHash(key);
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (size_t j = 0; j < k; j++) {
      const uint32_t bitpos = h % bits;
      if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
      h += delta;
    }
    return true;
  }
 };
 }
 const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) {
  return new BloomFilterPolicy(bits_per_key);
 }
 }  // namespace leveldb
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@ -0,0 +1,159 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "leveldb/filter_policy.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 namespace leveldb {
 static const int kVerbose = 1;
 static Slice Key(int i, char* buffer) {
  memcpy(buffer, &i, sizeof(i));
  return Slice(buffer, sizeof(i));
 }
 class BloomTest {
 private:
  const FilterPolicy* policy_;
  std::string filter_;
  std::vector<std::string> keys_;
 public:
  BloomTest() : policy_(NewBloomFilterPolicy(10)) { }
  ~BloomTest() {
    delete policy_;
  }
  void Reset() {
    keys_.clear();
    filter_.clear();
  }
  void Add(const Slice& s) {
    keys_.push_back(s.ToString());
  }
  void Build() {
    std::vector<Slice> key_slices;
    for (size_t i = 0; i < keys_.size(); i++) {
      key_slices.push_back(Slice(keys_[i]));
    }
    filter_.clear();
    policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_);
    keys_.clear();
    if (kVerbose >= 2) DumpFilter();
  }
  size_t FilterSize() const {
    return filter_.size();
  }
  void DumpFilter() {
    fprintf(stderr, "F(");
    for (size_t i = 0; i+1 < filter_.size(); i++) {
      const unsigned int c = static_cast<unsigned int>(filter_[i]);
      for (int j = 0; j < 8; j++) {
        fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
      }
    }
    fprintf(stderr, ")\n");
  }
  bool Matches(const Slice& s) {
    if (!keys_.empty()) {
      Build();
    }
    return policy_->KeyMayMatch(s, filter_);
  }
  double FalsePositiveRate() {
    char buffer[sizeof(int)];
    int result = 0;
    for (int i = 0; i < 10000; i++) {
      if (Matches(Key(i + 1000000000, buffer))) {
        result++;
      }
    }
    return result / 10000.0;
  }
 };
 TEST(BloomTest, EmptyFilter) {
  ASSERT_TRUE(! Matches("hello"));
  ASSERT_TRUE(! Matches("world"));
 }
 TEST(BloomTest, Small) {
  Add("hello");
  Add("world");
  ASSERT_TRUE(Matches("hello"));
  ASSERT_TRUE(Matches("world"));
  ASSERT_TRUE(! Matches("x"));
  ASSERT_TRUE(! Matches("foo"));
 }
 static int NextLength(int length) {
  if (length < 10) {
    length += 1;
  } else if (length < 100) {
    length += 10;
  } else if (length < 1000) {
    length += 100;
  } else {
    length += 1000;
  }
  return length;
 }
 TEST(BloomTest, VaryingLengths) {
  char buffer[sizeof(int)];
  // Count number of filters that significantly exceed the false positive rate
  int mediocre_filters = 0;
  int good_filters = 0;
  for (int length = 1; length <= 10000; length = NextLength(length)) {
    Reset();
    for (int i = 0; i < length; i++) {
      Add(Key(i, buffer));
    }
    Build();
    ASSERT_LE(FilterSize(), (length * 10 / 8) + 40) << length;
    // All added keys must match
    for (int i = 0; i < length; i++) {
      ASSERT_TRUE(Matches(Key(i, buffer)))
          << "Length " << length << "; key " << i;
    }
    // Check false positive rate
    double rate = FalsePositiveRate();
    if (kVerbose >= 1) {
      fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
              rate*100.0, length, static_cast<int>(FilterSize()));
    }
    ASSERT_LE(rate, 0.02);   // Must not be over 2%
    if (rate > 0.0125) mediocre_filters++;  // Allowed, but not too often
    else good_filters++;
  }
  if (kVerbose >= 1) {
    fprintf(stderr, "Filters: %d good, %d mediocre\n",
            good_filters, mediocre_filters);
  }
  ASSERT_LE(mediocre_filters, good_filters/5);
 }
 // Different bits-per-byte
 }  // namespace leveldb
 int main(int argc, char** argv) {
  return leveldb::test::RunAllTests();
 }
--- a/util/filter_policy.cc
+++ b/util/filter_policy.cc
@ -0,0 +1,11 @@
 // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "leveldb/filter_policy.h"
 namespace leveldb {
 FilterPolicy::~FilterPolicy() { }
 }  // namespace leveldb
--- a/util/options.cc
+++ b/util/options.cc
@ -21,7 +21,8 @@ Options::Options()
      block_cache(NULL),
      block_size(4096),
      block_restart_interval(16),
-      compression(kSnappyCompression) {
+      compression(kSnappyCompression),
      filter_policy(NULL) {
 }