From 4605e20c58b160e515daa8ba1751901f1283c208 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Thu, 23 Jan 2014 16:27:34 -0800 Subject: [PATCH 1/8] If User setting of compaction multipliers overflow, use default value 1 instead Summary: Currently, compaction multipliers can overflow and cause unexpected behaviors. In this patch, we detect those overflows and use multiplier 1 for them. Test Plan: make all check Reviewers: dhruba, haobo, igor, kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15321 --- db/compaction_picker.cc | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index cfa3770d7..5839d4dac 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -22,6 +22,21 @@ uint64_t TotalFileSize(const std::vector& files) { return sum; } +// Multiple two operands. If they overflow, return op1. +uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { + if (op1 == 0) { + return 0; + } + if (op2 <= 0) { + return op1; + } + uint64_t casted_op2 = (uint64_t) op2; + if (std::numeric_limits::max() / op1 < casted_op2) { + return op1; + } + return op1 * casted_op2; +} + } // anonymous namespace CompactionPicker::CompactionPicker(const Options* options, @@ -48,10 +63,11 @@ void CompactionPicker::Init() { max_file_size_[i] = ULLONG_MAX; level_max_bytes_[i] = options_->max_bytes_for_level_base; } else if (i > 1) { - max_file_size_[i] = max_file_size_[i - 1] * target_file_size_multiplier; - level_max_bytes_[i] = - level_max_bytes_[i - 1] * max_bytes_multiplier * - options_->max_bytes_for_level_multiplier_additional[i - 1]; + max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1], + target_file_size_multiplier); + level_max_bytes_[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier), + options_->max_bytes_for_level_multiplier_additional[i - 1]); } else { max_file_size_[i] = 
options_->target_file_size_base; level_max_bytes_[i] = options_->max_bytes_for_level_base; From d24961b65edeb155ebfc9f31c33e5aae380a4fec Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Jan 2014 10:24:08 -0800 Subject: [PATCH 2/8] Moving to glibc-fb Summary: It looks like we might have some trouble when building the new release with 4.8, since fbcode is using glibc2.17-fb by default and we are using glibc2.17. It was reported by Benjamin Renard in our internal group. This diff moves our fbcode build to use glibc2.17-fb by default. I got some linker errors when compiling, complaining that `google::SetUsageMessage()` was undefined. After deleting all offending lines, the compile was successful and everything works. Test Plan: Compiled Ran ./db_bench ./db_stress ./db_repl_stress Reviewers: kailiu Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15405 --- build_tools/fbcode.gcc481.sh | 4 ++-- db/db_bench.cc | 2 -- tools/db_repl_stress.cc | 3 --- tools/db_stress.cc | 2 -- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index e8c9f090b..ed7c8305e 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -11,13 +11,13 @@ if [ "$CENTOS_VERSION" = "6" ]; then else TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" fi -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17" +TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17-fb" # location of libhdfs libraries if test "$USE_HDFS"; then JAVA_HOME="/usr/local/jdk-6u22-64" JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17" + GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17-fb" HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " HDFSLIB+=" 
-ldl -lverify -ljava -ljvm " diff --git a/db/db_bench.cc b/db/db_bench.cc index e41a31cf3..78c789734 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2531,8 +2531,6 @@ class Benchmark { int main(int argc, char** argv) { rocksdb::InstallStackTraceHandler(); - google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + - " [OPTIONS]..."); google::ParseCommandLineFlags(&argc, &argv, true); FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style; diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 9dfe4b644..9b2465fbb 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -87,9 +87,6 @@ DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run" "(in MB)"); int main(int argc, const char** argv) { - google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + - " --num_inserts= --wal_ttl_seconds=" + - " --wal_size_limit_MB="); google::ParseCommandLineFlags(&argc, const_cast(&argv), true); Env* env = Env::Default(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 8321c7eaf..bd43504f4 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1488,8 +1488,6 @@ class StressTest { int main(int argc, char** argv) { - google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + - " [OPTIONS]..."); google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_statistics) { From 66dc033af33254ff7df85ffae0c6a628615bd7d1 Mon Sep 17 00:00:00 2001 From: kailiu Date: Fri, 24 Jan 2014 10:57:15 -0800 Subject: [PATCH 3/8] Temporarily disable caching index/filter blocks Summary: Mixing index/filter blocks with data blocks resulted in some known issues. To make sure in next release our users won't be affected, we added a new option in BlockBasedTableFactory::TableOption to conceal this functionality for now. This patch also introduced a BlockBasedTableReader::OpenOptions, which avoids the "infinite" growth of parameters in BlockBasedTableReader::Open(). 
Test Plan: make check Reviewers: haobo, sdong, igor, dhruba Reviewed By: igor CC: leveldb, tnovak Differential Revision: https://reviews.facebook.net/D15327 --- db/db_test.cc | 4 +++ table/block_based_table_factory.cc | 6 ++-- table/block_based_table_factory.h | 36 +++++++-------------- table/block_based_table_options.h | 31 ++++++++++++++++++ table/block_based_table_reader.cc | 52 +++++++++++++++--------------- table/block_based_table_reader.h | 8 ++--- table/table_test.cc | 14 ++++---- 7 files changed, 87 insertions(+), 64 deletions(-) create mode 100644 table/block_based_table_options.h diff --git a/db/db_test.cc b/db/db_test.cc index 1161deddb..8828ed1b9 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -17,6 +17,7 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "table/block_based_table_factory.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -732,6 +733,9 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { options.filter_policy = filter_policy.get(); options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); DestroyAndReopen(&options); ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 836f6edf6..a9cd35a68 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -20,10 +20,10 @@ namespace rocksdb { Status BlockBasedTableFactory::GetTableReader( const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(options, soptions, std::move(file), file_size, - table_reader); + return 
BlockBasedTable::Open(options, soptions, table_options_, + std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::GetTableBuilder( diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index ee525816f..5a4d1bd6e 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,6 +14,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" +#include "table/block_based_table_options.h" namespace rocksdb { @@ -30,40 +31,25 @@ class BlockBasedTable; class BlockBasedTableBuilder; class BlockBasedTableFactory: public TableFactory { -public: - struct TableOptions { - // @flush_block_policy_factory creates the instances of flush block policy. - // which provides a configurable way to determine when to flush a block in - // the block based tables. If not set, table builder will use the default - // block flush policy, which cut blocks by block size (please refer to - // `FlushBlockBySizePolicy`). 
- std::shared_ptr flush_block_policy_factory; - }; + public: + BlockBasedTableFactory() : BlockBasedTableFactory(BlockBasedTableOptions()) {} + explicit BlockBasedTableFactory(const BlockBasedTableOptions& table_options) + : table_options_(table_options) {} - BlockBasedTableFactory() : BlockBasedTableFactory(TableOptions()) { } - BlockBasedTableFactory(const TableOptions& table_options): - table_options_(table_options) { - } + ~BlockBasedTableFactory() {} - ~BlockBasedTableFactory() { - } - - const char* Name() const override { - return "BlockBasedTable"; - } + const char* Name() const override { return "BlockBasedTable"; } Status GetTableReader(const Options& options, const EnvOptions& soptions, - unique_ptr && file, - uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const override; TableBuilder* GetTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type) const - override; + CompressionType compression_type) + const override; private: - TableOptions table_options_; + BlockBasedTableOptions table_options_; }; - } // namespace rocksdb diff --git a/table/block_based_table_options.h b/table/block_based_table_options.h new file mode 100644 index 000000000..f5774e2bf --- /dev/null +++ b/table/block_based_table_options.h @@ -0,0 +1,31 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#include + +namespace rocksdb { + +class FlushBlockPolicyFactory; + +struct BlockBasedTableOptions { + // @flush_block_policy_factory creates the instances of flush block policy. + // which provides a configurable way to determine when to flush a block in + // the block based tables. 
If not set, table builder will use the default + // block flush policy, which cut blocks by block size (please refer to + // `FlushBlockBySizePolicy`). + std::shared_ptr flush_block_policy_factory; + + // TODO(kailiu) Temporarily disable this feature by making the default value + // to be false. Also in master branch, this file is non-public so no user + // will be able to change the value of `cache_index_and_filter_blocks`. + // + // Indicating if we'd put index/filter blocks to the block cache. + // If not specified, each "table reader" object will pre-load index/filter + // block during table initialization. + bool cache_index_and_filter_blocks = false; +}; + +} // namespace rocksdb diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index dcb55fc36..b08ea1934 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -26,6 +26,7 @@ #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "table/block_based_table_options.h" namespace rocksdb { @@ -45,9 +46,9 @@ struct BlockBasedTable::Rep { Status status; unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t cache_key_prefix_size; + size_t cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size; + size_t compressed_cache_key_prefix_size = 0; // Handle to metaindex_block: saved from footer BlockHandle metaindex_handle; @@ -220,20 +221,21 @@ Cache::Handle* GetFromBlockCache( } // end of anonymous namespace -Status BlockBasedTable::Open(const Options& options, - const EnvOptions& soptions, - unique_ptr && file, - uint64_t size, +Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, + const BlockBasedTableOptions& table_options, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader) { table_reader->reset(); - if (size < Footer::kEncodedLength) { + + if (file_size < 
Footer::kEncodedLength) { return Status::InvalidArgument("file is too short to be an sstable"); } char footer_space[Footer::kEncodedLength]; Slice footer_input; - Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, - &footer_input, footer_space); + Status s = file->Read(file_size - Footer::kEncodedLength, + Footer::kEncodedLength, &footer_input, footer_space); if (!s.ok()) return s; // Check that we actually read the whole footer from the file. It may be @@ -277,11 +279,21 @@ Status BlockBasedTable::Open(const Options& options, } } - // Initialize index/filter blocks. If block cache is not specified, - // these blocks will be kept in member variables in Rep, which will - // reside in the memory as long as this table object is alive; otherwise - // they will be added to block cache. - if (!options.block_cache) { + // Will use block cache for index/filter blocks access? + if (options.block_cache && table_options.cache_index_and_filter_blocks) { + // Call IndexBlockReader() to implicitly add index to the block_cache + unique_ptr iter(new_table->IndexBlockReader(ReadOptions())); + s = iter->status(); + + if (s.ok()) { + // Call GetFilter() to implicitly add filter to the block_cache + auto filter_entry = new_table->GetFilter(); + filter_entry.Release(options.block_cache.get()); + } + } else { + // If we don't use block cache for index/filter blocks access, we'll + // pre-load these blocks, which will kept in member variables in Rep + // and with a same life-time as this table object. 
Block* index_block = nullptr; // TODO: we never really verify check sum for index block s = ReadBlockFromFile( @@ -309,18 +321,7 @@ Status BlockBasedTable::Open(const Options& options, } else { delete index_block; } - } else { - // Call IndexBlockReader() to implicitly add index to the block_cache - unique_ptr iter( - new_table->IndexBlockReader(ReadOptions()) - ); - s = iter->status(); - if (s.ok()) { - // Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = new_table->GetFilter(); - filter_entry.Release(options.block_cache.get()); - } } if (s.ok()) { @@ -836,7 +837,6 @@ BlockBasedTable::GetFilter(bool no_io) const { // Get the iterator from the index block. Iterator* BlockBasedTable::IndexBlockReader(const ReadOptions& options) const { if (rep_->index_block) { - assert (!rep_->options.block_cache); return rep_->index_block->NewIterator(rep_->options.comparator); } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 66f63fc59..52ece7441 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -29,6 +29,7 @@ struct ReadOptions; class TableCache; class TableReader; class FilterBlockReader; +struct BlockBasedTableOptions; using std::unique_ptr; @@ -50,10 +51,9 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. 
- static Status Open(const Options& options, - const EnvOptions& soptions, - unique_ptr&& file, - uint64_t file_size, + static Status Open(const Options& db_options, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader); bool PrefixMayMatch(const Slice& internal_prefix) override; diff --git a/table/table_test.cc b/table/table_test.cc index 9907550ce..af794fd13 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -243,13 +243,12 @@ class BlockConstructor: public Constructor { class BlockBasedTableConstructor: public Constructor { public: - explicit BlockBasedTableConstructor( - const Comparator* cmp) - : Constructor(cmp) { - } + explicit BlockBasedTableConstructor(const Comparator* cmp) + : Constructor(cmp) {} ~BlockBasedTableConstructor() { Reset(); } + virtual Status FinishImpl(const Options& options, const KVMap& data) { Reset(); sink_.reset(new StringSink()); @@ -277,7 +276,6 @@ class BlockBasedTableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_)); - unique_ptr table_factory; return options.table_factory->GetTableReader(options, soptions, std::move(source_), sink_->contents().size(), @@ -979,6 +977,11 @@ TEST(TableTest, BlockCacheTest) { options.create_if_missing = true; options.statistics = CreateDBStatistics(); options.block_cache = NewLRUCache(1024); + + // Enable the cache for index/filter blocks + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; KVMap kvmap; @@ -1292,7 +1295,6 @@ TEST(MemTableTest, Simple) { delete memtable->Unref(); } - } // namespace rocksdb int main(int argc, char** argv) { From e832e72b3125d929617e5116690cedd918520453 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Jan 2014 11:50:38 -0800 Subject: [PATCH 
4/8] Revert "Moving to glibc-fb" This reverts commit d24961b65edeb155ebfc9f31c33e5aae380a4fec. For some reason, glibc2.17-fb breaks gflags. Reverting for now --- build_tools/fbcode.gcc481.sh | 4 ++-- db/db_bench.cc | 2 ++ tools/db_repl_stress.cc | 3 +++ tools/db_stress.cc | 2 ++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ed7c8305e..e8c9f090b 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -11,13 +11,13 @@ if [ "$CENTOS_VERSION" = "6" ]; then else TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" fi -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17-fb" +TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17" # location of libhdfs libraries if test "$USE_HDFS"; then JAVA_HOME="/usr/local/jdk-6u22-64" JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17-fb" + GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17" HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " HDFSLIB+=" -ldl -lverify -ljava -ljvm " diff --git a/db/db_bench.cc b/db/db_bench.cc index 78c789734..e41a31cf3 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -2531,6 +2531,8 @@ class Benchmark { int main(int argc, char** argv) { rocksdb::InstallStackTraceHandler(); + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); google::ParseCommandLineFlags(&argc, &argv, true); FLAGS_compaction_style_e = (rocksdb::CompactionStyle) FLAGS_compaction_style; diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 9b2465fbb..9dfe4b644 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -87,6 +87,9 @@ DEFINE_uint64(wal_size_limit_MB, 10, "the wal size limit for the run" "(in MB)"); int 
main(int argc, const char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " --num_inserts= --wal_ttl_seconds=" + + " --wal_size_limit_MB="); google::ParseCommandLineFlags(&argc, const_cast(&argv), true); Env* env = Env::Default(); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index bd43504f4..8321c7eaf 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1488,6 +1488,8 @@ class StressTest { int main(int argc, char** argv) { + google::SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_statistics) { From f131d4c2800632af9f705c05a75ab6b0e04c016a Mon Sep 17 00:00:00 2001 From: kailiu Date: Fri, 24 Jan 2014 11:56:01 -0800 Subject: [PATCH 5/8] Add a make target for shared library Summary: Previous we made `make release` also compile shared library. However it takes a long time to complete. To make our development process more efficient. I added a new make target shared_lib. User can of course run `make ` for direct compilation. However the changed under certain condition. Thus we need `make shared_lib` to get rid of the memorization from users' side. Test Plan: make shared_lib Reviewers: igor, sdong, haobo, dhruba Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D15309 --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 572e42e9e..4beb74938 100644 --- a/Makefile +++ b/Makefile @@ -135,13 +135,12 @@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test format + release tags valgrind_check whitebox_crash_test format shared_lib # Will also generate shared libraries. 
release: $(MAKE) clean OPT=-DNDEBUG $(MAKE) all -j32 - OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean @@ -201,6 +200,8 @@ tags: format: build_tools/format-diff.sh +shared_lib: $(SHARED) + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- From c583157d497b02577dd8c08b54ab91821df1ea21 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Jan 2014 14:52:08 -0800 Subject: [PATCH 6/8] MemTableListVersion Summary: MemTableListVersion is to MemTableList what Version is to VersionSet. I took almost the same ideas to develop MemTableListVersion. The reason is to have copying std::list done in background, while flushing, rather than in foreground (MultiGet() and NewIterator()) under a mutex! Also, whenever we copied MemTableList, we copied also some MemTableList metadata (flush_requested_, commit_in_progress_, etc.), which was wasteful. This diff avoids std::list copy under a mutex in both MultiGet() and NewIterator(). I created a small database with some number of immutable memtables, and creating 100.000 iterators in a single-thread (!) decreased from {188739, 215703, 198028} to {154352, 164035, 159817}. A lot of the savings come from code under a mutex, so we should see much higher savings with multiple threads. Creating new iterator is very important to LogDevice team. I also think this diff will make SuperVersion obsolete for performance reasons. I will try it in the next diff. SuperVersion gave us huge savings on Get() code path, but I think that most of the savings came from copying MemTableList under a mutex. If we had MemTableListVersion, we would never need to copy the entire object (like we still do in NewIterator() and MultiGet()) Test Plan: `make check` works. 
I will also do `make valgrind_check` before commit Reviewers: dhruba, haobo, kailiu, sdong, emayanke, tnovak Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15255 --- db/db_impl.cc | 94 ++++++++++++++---------------- db/db_impl.h | 6 +- db/memtablelist.cc | 138 +++++++++++++++++++++++++++++---------------- db/memtablelist.h | 81 +++++++++++++++++--------- 4 files changed, 187 insertions(+), 132 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 48d0e3451..aff55c75d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -264,6 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), mem_(new MemTable(internal_comparator_, options_)), + imm_(options_.min_write_buffer_number_to_merge), logfile_number_(0), super_version_(nullptr), super_version_number_(0), @@ -360,7 +361,7 @@ DBImpl::~DBImpl() { delete mem_->Unref(); } - imm_.UnrefAll(&to_delete); + imm_.current()->Unref(&to_delete); for (MemTable* m: to_delete) { delete m; } @@ -508,7 +509,7 @@ bool DBImpl::SuperVersion::Unref() { void DBImpl::SuperVersion::Cleanup() { assert(refs.load(std::memory_order_relaxed) == 0); - imm.UnrefAll(&to_delete); + imm->Unref(&to_delete); MemTable* m = mem->Unref(); if (m != nullptr) { to_delete.push_back(m); @@ -516,13 +517,13 @@ void DBImpl::SuperVersion::Cleanup() { current->Unref(); } -void DBImpl::SuperVersion::Init(MemTable* new_mem, const MemTableList& new_imm, +void DBImpl::SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm, Version* new_current) { mem = new_mem; imm = new_imm; current = new_current; mem->Ref(); - imm.RefAll(); + imm->Ref(); current->Ref(); refs.store(1, std::memory_order_relaxed); } @@ -1221,7 +1222,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, mutex_.AssertHeld(); assert(imm_.size() != 0); - if (!imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + if (!imm_.IsFlushPending()) { 
Log(options_.info_log, "FlushMemTableToOutputFile already in progress"); Status s = Status::IOError("FlushMemTableToOutputFile already in progress"); return s; @@ -1762,8 +1763,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } else if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions } else { - bool is_flush_pending = - imm_.IsFlushPending(options_.min_write_buffer_number_to_merge); + bool is_flush_pending = imm_.IsFlushPending(); if (is_flush_pending && (bg_flush_scheduled_ < options_.max_background_flushes)) { // memtable flush needed @@ -1798,8 +1798,7 @@ void DBImpl::BGWorkCompaction(void* db) { Status DBImpl::BackgroundFlush(bool* madeProgress, DeletionState& deletion_state) { Status stat; - while (stat.ok() && - imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + while (stat.ok() && imm_.IsFlushPending()) { Log(options_.info_log, "BackgroundCallFlush doing FlushMemTableToOutputFile, flush slots available %d", options_.max_background_flushes - bg_flush_scheduled_); @@ -1919,7 +1918,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, mutex_.AssertHeld(); // TODO: remove memtable flush from formal compaction - while (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + while (imm_.IsFlushPending()) { Log(options_.info_log, "BackgroundCompaction doing FlushMemTableToOutputFile, compaction slots " "available %d", @@ -2325,7 +2324,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, const uint64_t imm_start = env_->NowMicros(); LogFlush(options_.info_log); mutex_.Lock(); - if (imm_.IsFlushPending(options_.min_write_buffer_number_to_merge)) { + if (imm_.IsFlushPending()) { FlushMemTableToOutputFile(nullptr, deletion_state); bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary } @@ -2658,8 +2657,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, namespace { struct IterState { port::Mutex* mu; - Version* version; - std::vector mem; // includes both mem_ 
and imm_ + Version* version = nullptr; + MemTable* mem = nullptr; + MemTableListVersion* imm = nullptr; DBImpl *db; }; @@ -2668,15 +2668,16 @@ static void CleanupIteratorState(void* arg1, void* arg2) { DBImpl::DeletionState deletion_state(state->db->GetOptions(). max_write_buffer_number); state->mu->Lock(); - for (unsigned int i = 0; i < state->mem.size(); i++) { - MemTable* m = state->mem[i]->Unref(); - if (m != nullptr) { - deletion_state.memtables_to_free.push_back(m); - } + MemTable* m = state->mem->Unref(); + if (m != nullptr) { + deletion_state.memtables_to_free.push_back(m); } if (state->version) { // not set for memtable-only iterator state->version->Unref(); } + if (state->imm) { // not set for memtable-only iterator + state->imm->Unref(&deletion_state.memtables_to_free); + } // fast path FindObsoleteFiles state->db->FindObsoleteFiles(deletion_state, false, true); state->mu->Unlock(); @@ -2690,7 +2691,7 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; MemTable* mutable_mem; - std::vector immutables; + MemTableListVersion* immutable_mems; Version* version; // Collect together all needed child iterators for mem @@ -2699,27 +2700,22 @@ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, mem_->Ref(); mutable_mem = mem_; // Collect together all needed child iterators for imm_ - imm_.GetMemTables(&immutables); - for (unsigned int i = 0; i < immutables.size(); i++) { - immutables[i]->Ref(); - } + immutable_mems = imm_.current(); + immutable_mems->Ref(); versions_->current()->Ref(); version = versions_->current(); mutex_.Unlock(); - std::vector list; - list.push_back(mutable_mem->NewIterator(options)); - cleanup->mem.push_back(mutable_mem); - + std::vector iterator_list; + iterator_list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem = mutable_mem; + cleanup->imm = immutable_mems; // Collect all needed child iterators for immutable memtables - for 
(MemTable* m : immutables) { - list.push_back(m->NewIterator(options)); - cleanup->mem.push_back(m); - } + immutable_mems->AddIterators(options, &iterator_list); // Collect iterators for files in L0 - Ln - version->AddIterators(options, storage_options_, &list); - Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); + version->AddIterators(options, storage_options_, &iterator_list); + Iterator* internal_iter = NewMergingIterator( + &internal_comparator_, &iterator_list[0], iterator_list.size()); cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; @@ -2738,19 +2734,15 @@ std::pair DBImpl::GetTailingIteratorPair( uint64_t* superversion_number) { MemTable* mutable_mem; - std::vector immutables; + MemTableListVersion* immutable_mems; Version* version; - immutables.reserve(options_.max_write_buffer_number); - // get all child iterators and bump their refcounts under lock mutex_.Lock(); mutable_mem = mem_; mutable_mem->Ref(); - imm_.GetMemTables(&immutables); - for (size_t i = 0; i < immutables.size(); ++i) { - immutables[i]->Ref(); - } + immutable_mems = imm_.current(); + immutable_mems->Ref(); version = versions_->current(); version->Ref(); if (superversion_number != nullptr) { @@ -2760,7 +2752,7 @@ std::pair DBImpl::GetTailingIteratorPair( Iterator* mutable_iter = mutable_mem->NewIterator(options); IterState* mutable_cleanup = new IterState(); - mutable_cleanup->mem.push_back(mutable_mem); + mutable_cleanup->mem = mutable_mem; mutable_cleanup->db = this; mutable_cleanup->mu = &mutex_; mutable_iter->RegisterCleanup(CleanupIteratorState, mutable_cleanup, nullptr); @@ -2772,10 +2764,8 @@ std::pair DBImpl::GetTailingIteratorPair( Iterator* immutable_iter; IterState* immutable_cleanup = new IterState(); std::vector list; - for (MemTable* m : immutables) { - list.push_back(m->NewIterator(options)); - immutable_cleanup->mem.push_back(m); - } + immutable_mems->AddIterators(options, &list); + immutable_cleanup->imm = 
immutable_mems; version->AddIterators(options, storage_options_, &list); immutable_cleanup->version = version; immutable_cleanup->db = this; @@ -2832,7 +2822,7 @@ void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { DBImpl::SuperVersion* DBImpl::InstallSuperVersion( SuperVersion* new_superversion) { mutex_.AssertHeld(); - new_superversion->Init(mem_, imm_, versions_->current()); + new_superversion->Init(mem_, imm_.current(), versions_->current()); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; ++super_version_number_; @@ -2875,7 +2865,7 @@ Status DBImpl::GetImpl(const ReadOptions& options, if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); - } else if (get_version->imm.Get(lkey, value, &s, merge_context, options_)) { + } else if (get_version->imm->Get(lkey, value, &s, merge_context, options_)) { // Done RecordTick(options_.statistics.get(), MEMTABLE_HIT); } else { @@ -2930,10 +2920,10 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, } MemTable* mem = mem_; - MemTableList imm = imm_; + MemTableListVersion* imm = imm_.current(); Version* current = versions_->current(); mem->Ref(); - imm.RefAll(); + imm->Ref(); current->Ref(); // Unlock while reading from files and memtables @@ -2965,7 +2955,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, LookupKey lkey(keys[i], snapshot); if (mem->Get(lkey, value, &s, merge_context, options_)) { // Done - } else if (imm.Get(lkey, value, &s, merge_context, options_)) { + } else if (imm->Get(lkey, value, &s, merge_context, options_)) { // Done } else { current->Get(options, lkey, value, &s, &merge_context, &stats, options_); @@ -2984,7 +2974,7 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, MaybeScheduleFlushOrCompaction(); } MemTable* m = mem->Unref(); - imm.UnrefAll(&to_delete); + imm->Unref(&to_delete); current->Unref(); mutex_.Unlock(); diff --git a/db/db_impl.h 
b/db/db_impl.h index cee574648..abefcba61 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -140,10 +140,10 @@ class DBImpl : public DB { // holds references to memtable, all immutable memtables and version struct SuperVersion { MemTable* mem; - MemTableList imm; + MemTableListVersion* imm; Version* current; std::atomic refs; - // We need to_delete because during Cleanup(), imm.UnrefAll() returns + // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. We then // delete all those memtables outside of mutex, during destruction std::vector to_delete; @@ -161,7 +161,7 @@ class DBImpl : public DB { // that needs to be deleted in to_delete vector. Unrefing those // objects needs to be done in the mutex void Cleanup(); - void Init(MemTable* new_mem, const MemTableList& new_imm, + void Init(MemTable* new_mem, MemTableListVersion* new_imm, Version* new_current); }; diff --git a/db/memtablelist.cc b/db/memtablelist.cc index 71e4e5a92..b52563ae8 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -16,41 +16,85 @@ namespace rocksdb { class InternalKeyComparator; class Mutex; -class MemTableListIterator; class VersionSet; -using std::list; - -// Increase reference count on all underling memtables -void MemTableList::RefAll() { - for (auto &memtable : memlist_) { - memtable->Ref(); - } -} - -// Drop reference count on all underling memtables. If the -// refcount of an underlying memtable drops to zero, then -// return it in to_delete vector. 
-void MemTableList::UnrefAll(std::vector* to_delete) { - for (auto &memtable : memlist_) { - MemTable* m = memtable->Unref(); - if (m != nullptr) { - to_delete->push_back(m); +MemTableListVersion::MemTableListVersion(MemTableListVersion* old) { + if (old != nullptr) { + memlist_ = old->memlist_; + size_ = old->size_; + for (auto& m : memlist_) { + m->Ref(); } } } +void MemTableListVersion::Ref() { ++refs_; } + +void MemTableListVersion::Unref(std::vector* to_delete) { + --refs_; + if (refs_ == 0) { + // if to_delete is equal to nullptr it means we're confident + // that refs_ will not be zero + assert(to_delete != nullptr); + for (const auto& m : memlist_) { + MemTable* x = m->Unref(); + if (x != nullptr) { + to_delete->push_back(x); + } + } + delete this; + } +} + +int MemTableListVersion::size() const { return size_; } + // Returns the total number of memtables in the list -int MemTableList::size() { - assert(num_flush_not_started_ <= size_); - return size_; +int MemTableList::size() const { + assert(num_flush_not_started_ <= current_->size_); + return current_->size_; +} + +// Search all the memtables starting from the most recent one. +// Return the most recent value found, if any. +// Operands stores the list of merge operations to apply, so far. 
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value, + Status* s, MergeContext& merge_context, + const Options& options) { + for (auto& memtable : memlist_) { + if (memtable->Get(key, value, s, merge_context, options)) { + return true; + } + } + return false; +} + +void MemTableListVersion::AddIterators(const ReadOptions& options, + std::vector* iterator_list) { + for (auto& m : memlist_) { + iterator_list->push_back(m->NewIterator(options)); + } +} + +void MemTableListVersion::Add(MemTable* m) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + m->Ref(); + memlist_.push_front(m); + ++size_; +} + +void MemTableListVersion::Remove(MemTable* m) { + assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable + MemTable* x __attribute__((unused)) = m->Unref(); + assert(x == nullptr); // it still needs to be alive! + memlist_.remove(m); + --size_; } // Returns true if there is at least one memtable on which flush has // not yet started. -bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) { +bool MemTableList::IsFlushPending() { if ((flush_requested_ && num_flush_not_started_ >= 1) || - (num_flush_not_started_ >= min_write_buffer_number_to_merge)) { + (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { assert(imm_flush_needed.NoBarrier_Load() != nullptr); return true; } @@ -59,7 +103,8 @@ bool MemTableList::IsFlushPending(int min_write_buffer_number_to_merge) { // Returns the memtables that need to be flushed. 
void MemTableList::PickMemtablesToFlush(std::vector* ret) { - for (auto it = memlist_.rbegin(); it != memlist_.rend(); it++) { + const auto& memlist = current_->memlist_; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; if (!m->flush_in_progress_) { assert(!m->flush_completed_); @@ -122,8 +167,8 @@ Status MemTableList::InstallMemtableFlushResults( // scan all memtables from the earliest, and commit those // (in that order) that have finished flushing. Memetables // are always committed in the order that they were created. - while (!memlist_.empty() && s.ok()) { - MemTable* m = memlist_.back(); // get the last element + while (!current_->memlist_.empty() && s.ok()) { + MemTable* m = current_->memlist_.back(); // get the last element if (!m->flush_completed_) { break; } @@ -135,6 +180,10 @@ Status MemTableList::InstallMemtableFlushResults( // this can release and reacquire the mutex. s = vset->LogAndApply(&m->edit_, mu); + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + // All the later memtables that have the same filenum // are part of the same batch. They can be committed now. uint64_t mem_id = 1; // how many memtables has been flushed. @@ -144,7 +193,7 @@ Status MemTableList::InstallMemtableFlushResults( "Level-0 commit table #%lu: memtable #%lu done", (unsigned long)m->file_number_, (unsigned long)mem_id); - memlist_.remove(m); + current_->Remove(m); assert(m->file_number_ > 0); // pending_outputs can be cleared only after the newly created file @@ -155,7 +204,6 @@ Status MemTableList::InstallMemtableFlushResults( if (m->Unref() != nullptr) { to_delete->push_back(m); } - size_--; } else { //commit failed. setup state so that we can flush again. 
Log(info_log, @@ -172,7 +220,7 @@ Status MemTableList::InstallMemtableFlushResults( s = Status::IOError("Unable to commit flushed memtable"); } ++mem_id; - } while (!memlist_.empty() && (m = memlist_.back()) && + } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && m->file_number_ == file_number); } commit_in_progress_ = false; @@ -181,9 +229,9 @@ Status MemTableList::InstallMemtableFlushResults( // New memtables are inserted at the front of the list. void MemTableList::Add(MemTable* m) { - assert(size_ >= num_flush_not_started_); - size_++; - memlist_.push_front(m); + assert(current_->size_ >= num_flush_not_started_); + InstallNewVersion(); + current_->Add(m); m->MarkImmutable(); num_flush_not_started_++; if (num_flush_not_started_ == 1) { @@ -194,28 +242,20 @@ void MemTableList::Add(MemTable* m) { // Returns an estimate of the number of bytes of data in use. size_t MemTableList::ApproximateMemoryUsage() { size_t size = 0; - for (auto &memtable : memlist_) { + for (auto& memtable : current_->memlist_) { size += memtable->ApproximateMemoryUsage(); } return size; } -// Search all the memtables starting from the most recent one. -// Return the most recent value found, if any. -// Operands stores the list of merge operations to apply, so far. 
-bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options) { - for (auto &memtable : memlist_) { - if (memtable->Get(key, value, s, merge_context, options)) { - return true; - } - } - return false; -} - -void MemTableList::GetMemTables(std::vector* output) { - for (auto &memtable : memlist_) { - output->push_back(memtable); +void MemTableList::InstallNewVersion() { + if (current_->refs_ == 1) { + // we're the only one using the version, just keep using it + } else { + // somebody else holds the current version, we need to create new one + MemTableListVersion* version = current_; + current_ = new MemTableListVersion(current_); + version->Unref(); } } diff --git a/db/memtablelist.h b/db/memtablelist.h index ed353c8b8..354e9872a 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -7,8 +7,10 @@ #pragma once #include #include -#include +#include #include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/iterator.h" #include "db/dbformat.h" #include "db/skiplist.h" #include "memtable.h" @@ -17,44 +19,71 @@ namespace rocksdb { class InternalKeyComparator; class Mutex; -class MemTableListIterator; -// +// keeps a list of immutable memtables in a vector. the list is immutable +// if refcount is bigger than one. It is used as a state for Get() and +// Iterator code paths +class MemTableListVersion { + public: + explicit MemTableListVersion(MemTableListVersion* old = nullptr); + + void Ref(); + void Unref(std::vector* to_delete = nullptr); + + int size() const; + + // Search all the memtables starting from the most recent one. + // Return the most recent value found, if any. 
+ bool Get(const LookupKey& key, std::string* value, Status* s, + MergeContext& merge_context, const Options& options); + + void AddIterators(const ReadOptions& options, + std::vector* iterator_list); + + // REQUIRE: m is mutable memtable + void Add(MemTable* m); + // REQUIRE: m is mutable memtable + void Remove(MemTable* m); + + private: + friend class MemTableList; + std::list memlist_; + int size_ = 0; + int refs_ = 1; +}; + // This class stores references to all the immutable memtables. // The memtables are flushed to L0 as soon as possible and in // any order. If there are more than one immutable memtable, their // flushes can occur concurrently. However, they are 'committed' // to the manifest in FIFO order to maintain correctness and // recoverability from a crash. -// class MemTableList { public: // A list of memtables. - MemTableList() : size_(0), num_flush_not_started_(0), - commit_in_progress_(false), - flush_requested_(false) { + explicit MemTableList(int min_write_buffer_number_to_merge) + : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), + current_(new MemTableListVersion()), + num_flush_not_started_(0), + commit_in_progress_(false), + flush_requested_(false) { imm_flush_needed.Release_Store(nullptr); + current_->Ref(); } - ~MemTableList() {}; + ~MemTableList() {} + + MemTableListVersion* current() { return current_; } // so that backgrund threads can detect non-nullptr pointer to // determine whether this is anything more to start flushing. port::AtomicPointer imm_flush_needed; - // Increase reference count on all underling memtables - void RefAll(); - - // Drop reference count on all underling memtables. If the refcount - // on an underlying memtable drops to zero, then return it in - // to_delete vector. - void UnrefAll(std::vector* to_delete); - // Returns the total number of memtables in the list - int size(); + int size() const; // Returns true if there is at least one memtable on which flush has // not yet started. 
- bool IsFlushPending(int min_write_buffer_number_to_merge); + bool IsFlushPending(); // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. @@ -75,14 +104,6 @@ class MemTableList { // Returns an estimate of the number of bytes of data in use. size_t ApproximateMemoryUsage(); - // Search all the memtables starting from the most recent one. - // Return the most recent value found, if any. - bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); - - // Returns the list of underlying memtables. - void GetMemTables(std::vector* list); - // Request a flush of all existing memtables to storage void FlushRequested() { flush_requested_ = true; } @@ -91,8 +112,12 @@ class MemTableList { // void operator=(const MemTableList&); private: - std::list memlist_; - int size_; + // DB mutex held + void InstallNewVersion(); + + int min_write_buffer_number_to_merge_; + + MemTableListVersion* current_; // the number of elements that still need flushing int num_flush_not_started_; From 677fee27c6efcff677caf47e691103e4fb5f208c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Jan 2014 14:57:04 -0800 Subject: [PATCH 7/8] Make VersionSet::ReduceNumberOfLevels() static Summary: A lot of our code implicitly assumes number_levels to be static. ReduceNumberOfLevels() breaks that assumption. For example, after calling ReduceNumberOfLevels(), DBImpl::NumberLevels() will be different from VersionSet::NumberLevels(). This is dangerous. Thankfully, it's not in public headers and is only used from LDB cmd tool. LDB tool is only using it statically, i.e. it never calls it with running DB instance. With this diff, we make it explicitly static. This way, we can assume number_levels to be immutable and not break assumption that lot of our code is relying upon. LDB tool can still use the method. 
Also, I removed the method from a separate file since it breaks filename completion. version_se now completes to "version_set." instead of "version_set" (without the dot). I don't see a big reason that the function should be in a different file. Test Plan: reduce_levels_test Reviewers: dhruba, haobo, kailiu, sdong Reviewed By: kailiu CC: leveldb Differential Revision: https://reviews.facebook.net/D15303 --- db/compaction_picker.cc | 8 --- db/compaction_picker.h | 5 -- db/version_set.cc | 69 ++++++++++++++++++++++++++ db/version_set.h | 8 ++- db/version_set_reduce_num_levels.cc | 77 ----------------------------- util/ldb_cmd.cc | 17 +------ 6 files changed, 77 insertions(+), 107 deletions(-) delete mode 100644 db/version_set_reduce_num_levels.cc diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 5839d4dac..284942117 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -45,15 +45,7 @@ CompactionPicker::CompactionPicker(const Options* options, options_(options), num_levels_(options->num_levels), icmp_(icmp) { - Init(); -} -void CompactionPicker::ReduceNumberOfLevels(int new_levels) { - num_levels_ = new_levels; - Init(); -} - -void CompactionPicker::Init() { max_file_size_.reset(new uint64_t[NumberLevels()]); level_max_bytes_.reset(new uint64_t[NumberLevels()]); int target_file_size_multiplier = options_->target_file_size_multiplier; diff --git a/db/compaction_picker.h b/db/compaction_picker.h index 0fe086a18..ee77cc4c7 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -27,9 +27,6 @@ class CompactionPicker { CompactionPicker(const Options* options, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); - // See VersionSet::ReduceNumberOfLevels() - void ReduceNumberOfLevels(int new_levels); - // Pick level and inputs for a new compaction. // Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that @@ -120,8 +117,6 @@ class CompactionPicker { const Options* const options_; private: - void Init(); - int num_levels_; const InternalKeyComparator* const icmp_; diff --git a/db/version_set.cc b/db/version_set.cc index a08feb875..e03d115d7 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1730,6 +1730,75 @@ Status VersionSet::Recover() { return s; } +Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int new_levels) { + if (new_levels <= 1) { + return Status::InvalidArgument( + "Number of levels needs to be bigger than 1"); + } + + const InternalKeyComparator cmp(options->comparator); + TableCache tc(dbname, options, storage_options, 10); + VersionSet versions(dbname, options, storage_options, &tc, &cmp); + Status status; + + status = versions.Recover(); + if (!status.ok()) { + return status; + } + + Version* current_version = versions.current(); + int current_levels = current_version->NumberLevels(); + + if (current_levels <= new_levels) { + return Status::OK(); + } + + // Make sure there are file only on one level from + // (new_levels-1) to (current_levels-1) + int first_nonempty_level = -1; + int first_nonempty_level_filenum = 0; + for (int i = new_levels - 1; i < current_levels; i++) { + int file_num = current_version->NumLevelFiles(i); + if (file_num != 0) { + if (first_nonempty_level < 0) { + first_nonempty_level = i; + first_nonempty_level_filenum = file_num; + } else { + char msg[255]; + snprintf(msg, sizeof(msg), + "Found at least two levels containing files: " + "[%d:%d],[%d:%d].\n", + first_nonempty_level, first_nonempty_level_filenum, i, + file_num); + return Status::InvalidArgument(msg); + } + } + } + + std::vector* old_files_list = current_version->files_; + std::vector* new_files_list = + new std::vector[new_levels]; + for (int i = 0; i < new_levels - 1; i++) { + new_files_list[i] = 
old_files_list[i]; + } + + if (first_nonempty_level > 0) { + new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; + } + + delete[] current_version->files_; + current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; + + VersionEdit ve; + port::Mutex dummy_mutex; + MutexLock l(&dummy_mutex); + return versions.LogAndApply(&ve, &dummy_mutex, true); +} + Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { diff --git a/db/version_set.h b/db/version_set.h index 1d92629f1..b0922d319 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -283,10 +283,16 @@ class VersionSet { // Try to reduce the number of levels. This call is valid when // only one level from the new max level to the old // max level containing files. + // The call is static, since number of levels is immutable during + // the lifetime of a RocksDB instance. It reduces number of levels + // in a DB by applying changes to manifest. // For example, a db currently has 7 levels [0-6], and a call to // to reduce to 5 [0-4] can only be executed when only one level // among [4-6] contains files. - Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu); + static Status ReduceNumberOfLevels(const std::string& dbname, + const Options* options, + const EnvOptions& storage_options, + int new_levels); // Return the current version. Version* current() const { return current_; } diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc deleted file mode 100644 index 68b84dab1..000000000 --- a/db/version_set_reduce_num_levels.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
-// -// Copyright (c) 2012 Facebook. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "db/version_set.h" - -#include -#include -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "util/logging.h" - -namespace rocksdb { - -Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { - - if(new_levels <= 1) { - return Status::InvalidArgument( - "Number of levels needs to be bigger than 1"); - } - - Version* current_version = current_; - int current_levels = current_version->NumberLevels(); - - if (current_levels <= new_levels) { - return Status::OK(); - } - - // Make sure there are file only on one level from - // (new_levels-1) to (current_levels-1) - int first_nonempty_level = -1; - int first_nonempty_level_filenum = 0; - for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = current_version->NumLevelFiles(i); - if (file_num != 0) { - if (first_nonempty_level < 0) { - first_nonempty_level = i; - first_nonempty_level_filenum = file_num; - } else { - char msg[255]; - sprintf(msg, "Found at least two levels containing files: " - "[%d:%d],[%d:%d].\n", - first_nonempty_level, first_nonempty_level_filenum, i, file_num); - return Status::InvalidArgument(msg); - } - } - } - - Status st; - std::vector* old_files_list = current_version->files_; - std::vector* new_files_list = - new std::vector[new_levels]; - for (int i = 0; i < new_levels - 1; i++) { - new_files_list[i] = old_files_list[i]; - } - - if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; - } - - delete[] current_version->files_; - current_version->files_ = new_files_list; - current_version->num_levels_ = new_levels; - - num_levels_ = new_levels; - compaction_picker_->ReduceNumberOfLevels(new_levels); - VersionEdit ve; - st = LogAndApply(&ve, mu, true); - return st; -} - -} diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 
65ecd61a2..80f609cd4 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1069,23 +1069,8 @@ void ReduceDBLevelsCommand::DoCommand() { CloseDB(); EnvOptions soptions; - TableCache tc(db_path_, &opt, soptions, 10); - const InternalKeyComparator cmp(opt.comparator); - VersionSet versions(db_path_, &opt, soptions, &tc, &cmp); - // We rely the VersionSet::Recover to tell us the internal data structures - // in the db. And the Recover() should never do any change (like LogAndApply) - // to the manifest file. - st = versions.Recover(); - if (!st.ok()) { - exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); - return; - } - - port::Mutex mu; - mu.Lock(); - st = versions.ReduceNumberOfLevels(new_levels_, &mu); - mu.Unlock(); + st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED(st.ToString()); return; From b13bdfa500821587411fb72a91cebf9ebafac5cc Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Fri, 24 Jan 2014 14:57:52 -0800 Subject: [PATCH 8/8] Add a call DisownData() to Cache, which should speed up shutdown Summary: On a shutdown, freeing memory takes a long time. If we're shutting down, we don't really care about memory leaks. I added a call to Cache that will avoid freeing all objects in cache. Test Plan: I created a script to test the speedup and demonstrate how to use the call: https://phabricator.fb.com/P3864368 Clean shutdown took 7.2 seconds, while fast and dirty one took 6.3 seconds. Unfortunately, the speedup is not that big, but should be bigger with bigger block_cache. I have set up the capacity to 80GB, but the script filled up only ~7GB. 
Reviewers: dhruba, haobo, MarkCallaghan, xjin Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D15069 --- include/rocksdb/cache.h | 9 +++++++++ util/cache.cc | 3 +++ 2 files changed, 12 insertions(+) diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 3e0e5c1cd..7d58e1546 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -104,6 +104,15 @@ class Cache { // returns the maximum configured capacity of the cache virtual size_t GetCapacity() = 0; + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData() { + // default implementation is noop + }; + private: void LRU_Remove(Handle* e); void LRU_Append(Handle* e); diff --git a/util/cache.cc b/util/cache.cc index 8fa03626b..4707eac94 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -409,6 +409,9 @@ class ShardedLRUCache : public Cache { virtual size_t GetCapacity() { return capacity_; } + virtual void DisownData() { + shard_ = nullptr; + } }; } // end anonymous namespace