From 5340484266df3e07544b6f4aa82f30dbbdc39af2 Mon Sep 17 00:00:00 2001 From: fyrz Date: Fri, 26 Sep 2014 10:35:12 +0200 Subject: [PATCH 01/16] Built-in comparator(s) in RocksJava Extended the built-in comparators with a ReverseBytewiseComparator. Reverse key ordering is essential under certain conditions, e.g. when working with timestamp-versioned data, and native comparators were previously not available through the Java API. Both built-in comparators are now exposed via JNI so that they can be selected at database creation time. --- include/rocksdb/comparator.h | 4 ++++ java/org/rocksdb/Options.java | 23 +++++++++++++++++++++++ java/rocksjni/options.cc | 18 ++++++++++++++++++ util/comparator.cc | 23 ++++++++++++++++++++++- 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index f3a8499a8..8e7366752 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -62,6 +62,10 @@ class Comparator { // must not be deleted. extern const Comparator* BytewiseComparator(); +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. +extern const Comparator* ReverseBytewiseComparator(); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java index 8446136f8..7ccc74834 100644 --- a/java/org/rocksdb/Options.java +++ b/java/org/rocksdb/Options.java @@ -18,6 +18,14 @@ public class Options extends RocksObject { } static final long DEFAULT_CACHE_SIZE = 8 << 20; static final int DEFAULT_NUM_SHARD_BITS = -1; + + /** + * Builtin RocksDB comparators + */ + public enum BuiltinComparator { + BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR; + } + /** * Construct options for opening a RocksDB. * @@ -78,6 +86,21 @@ public class Options extends RocksObject { return createIfMissing(nativeHandle_); } + /** + * Set the BuiltinComparator to be used with RocksDB. + * + * Note: the comparator can only be set once, upon database creation. + * + * Default: BytewiseComparator. + * @param builtinComparator a BuiltinComparator type. + */ + public void setBuiltinComparator(BuiltinComparator builtinComparator) { + assert(isInitialized()); + setBuiltinComparator(nativeHandle_, builtinComparator.ordinal()); + } + + private native void setBuiltinComparator(long handle, int builtinComparator); + /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file.
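As orientation for API users, below is a minimal C++ sketch of how the new built-in comparator is selected; the Java setBuiltinComparator() above forwards to the same native Options field. The database path and keys are illustrative only, not part of the patch.

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Pick the reverse built-in ordering; as the javadoc above notes, the
  // choice is made at creation time and must stay fixed for the database.
  options.comparator = rocksdb::ReverseBytewiseComparator();

  rocksdb::DB* db;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/reverse_db", &db);
  if (!s.ok()) return 1;

  db->Put(rocksdb::WriteOptions(), "20140926-event", "older");
  db->Put(rocksdb::WriteOptions(), "20140927-event", "newer");

  // Reverse byte-wise ordering surfaces the newest timestamp first.
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // yields "20140927-event" -> "newer", then "20140926-event" -> "older"
  }
  delete it;
  delete db;
  return 0;
}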
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 2dc2ffdc8..50416ef81 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -22,6 +22,7 @@ #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" #include "rocksdb/rate_limiter.h" +#include "rocksdb/comparator.h" /* * Class: org_rocksdb_Options @@ -63,6 +64,23 @@ jboolean Java_org_rocksdb_Options_createIfMissing( return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing; } +/* + * Class: org_rocksdb_Options + * Method: setBuiltinComparator + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setBuiltinComparator( JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { switch (builtinComparator) { case 1: reinterpret_cast<rocksdb::Options*>(jhandle)->comparator = rocksdb::ReverseBytewiseComparator(); break; default: reinterpret_cast<rocksdb::Options*>(jhandle)->comparator = rocksdb::BytewiseComparator(); break; } } + /* * Class: org_rocksdb_Options * Method: setWriteBufferSize diff --git a/util/comparator.cc b/util/comparator.cc index adeacac0a..d77d43117 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -69,13 +69,29 @@ class BytewiseComparatorImpl : public Comparator { // *key is a run of 0xffs. Leave it alone. } }; -} // namespace + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "leveldb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return -a.compare(b); + } +}; + +} // namespace static port::OnceType once = LEVELDB_ONCE_INIT; static const Comparator* bytewise; +static const Comparator* rbytewise; static void InitModule() { bytewise = new BytewiseComparatorImpl; + rbytewise = new ReverseBytewiseComparatorImpl; } const Comparator* BytewiseComparator() { @@ -83,4 +99,9 @@ const Comparator* BytewiseComparator() { return bytewise; } +const Comparator* ReverseBytewiseComparator() { + port::InitOnce(&once, InitModule); + return rbytewise; +} + } // namespace rocksdb From 389edb6b1bd8ead82090feff7a723e26ba8d001e Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 26 Sep 2014 14:15:09 -0700 Subject: [PATCH 02/16] universal compaction picker: use double for potential overflow Summary: There is a possible overflow case in the universal compaction picker. Use double to make the logic straightforward. Test Plan: make all check Reviewers: yhchiang, igor, MarkCallaghan, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D23817 --- db/compaction_picker.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 7cd965c20..eb434eeac 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -746,15 +746,15 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // default kCompactionStopStyleTotalSize; with // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. - uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->fd.GetFileSize()) { + double sz = candidate_size * (100.0 + ratio) / 100.0; + if (sz < static_cast<double>(f->fd.GetFileSize())) { break; } if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file.
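The overflow the Summary refers to is easy to demonstrate. A self-contained sketch with contrived numbers (illustrative only, not taken from the patch): once candidate_size * (100 + ratio) exceeds 2^64, the unsigned product wraps and the stop condition compares against a bogus small value, while the double form stays comfortably in range.

#include <cstdint>
#include <cstdio>

int main() {
  // candidate_size is a running sum of SST file sizes; with enough large
  // files (or a large size_ratio) the product below can exceed 2^64.
  uint64_t candidate_size = UINT64_MAX / 50;  // contrived, ~3.7e17 bytes
  unsigned ratio = 100;                       // universal size_ratio

  // Old integer arithmetic: the multiplication wraps before the division.
  uint64_t wrapped = (candidate_size * (100L + ratio)) / 100;

  // Arithmetic after the patch: no wraparound; the small precision loss is
  // harmless because sz only feeds a greater-than/less-than stop test.
  double exact = candidate_size * (100.0 + ratio) / 100.0;

  // Prints wrapped=184467440737095515 exact=737869762948382064 -- the
  // wrapped value is roughly 4x too small, so the loop breaks too early.
  printf("wrapped=%llu exact=%.0f\n", (unsigned long long)wrapped, exact);
  return 0;
}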
- sz = (f->fd.GetFileSize() * (100L + ratio)) / 100; - if (sz < candidate_size) { + sz = (f->fd.GetFileSize() * (100.0 + ratio)) / 100.0; + if (sz < static_cast<double>(candidate_size)) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer // loop. If it's some lonely straggler, it'll eventually get picked From 8b8011a68ca80fc2523447302e27ceb912af9771 Mon Sep 17 00:00:00 2001 From: fyrz Date: Sat, 27 Sep 2014 10:06:13 +0200 Subject: [PATCH 03/16] Changed name of ReverseBytewiseComparator based on review comment --- util/comparator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/comparator.cc b/util/comparator.cc index d77d43117..bbf0262f0 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -75,7 +75,7 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { ReverseBytewiseComparatorImpl() { } virtual const char* Name() const { - return "leveldb.ReverseBytewiseComparator"; + return "rocksdb.ReverseBytewiseComparator"; } virtual int Compare(const Slice& a, const Slice& b) const { From 2dc6f62bb9205d5b071757d72bd28bbb77ab0745 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 10:25:21 -0700 Subject: [PATCH 04/16] handle kDelete type in cuckoo builder Summary: When I changed std::vector to std::string to store key/value pairs in the builder, I missed the handling for the kDeletion type. As a result, value_size_ can be wrong if the first added key is a deletion. This is caught by ./cuckoo_table_db_test. Test Plan: ./cuckoo_table_db_test ./cuckoo_table_reader_test ./cuckoo_table_builder_test Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24045 --- table/cuckoo_table_builder.cc | 59 +++++++++++++++++++++++++++++++---- table/cuckoo_table_builder.h | 7 ++++- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index a66e9899e..6ff1fa0cf 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -60,9 +60,11 @@ CuckooTableBuilder::CuckooTableBuilder( hash_table_size_(use_module_hash ? 0 : 2), is_last_level_file_(false), has_seen_first_key_(false), + has_seen_first_value_(false), key_size_(0), value_size_(0), num_entries_(0), + num_values_(0), ucomp_(user_comparator), use_module_hash_(use_module_hash), identity_as_first_hash_(identity_as_first_hash), @@ -84,6 +86,12 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { status_ = Status::Corruption("Unable to parse key into inernal key."); return; } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + std::to_string(ikey.type)); + return; + } + // Determine if we can ignore the sequence number and value type from // internal keys by looking at sequence number from first key. We assume // that if first key has a zero sequence number, then all the remaining @@ -94,16 +102,38 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); - value_size_ = value.size(); + } + if (key_size_ != (is_last_level_file_ ?
ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; } // Even if one sequence number is non-zero, then it is not last level. assert(!is_last_level_file_ || ikey.sequence == 0); - if (is_last_level_file_) { - kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; } else { - kvs_.append(key.data(), key.size()); + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } } - kvs_.append(value.data(), value.size()); ++num_entries_; // In order to fill the empty buckets in the hash table, we identify a @@ -123,15 +153,30 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } } +bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { + assert(closed_); + return idx >= num_values_; +} + Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + } return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + assert(closed_); return is_last_level_file_ ? GetKey(idx) : ExtractUserKey(GetKey(idx)); } Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + static std::string empty_value(value_size_, 'a'); + return Slice(empty_value); + } return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); } @@ -256,7 +301,9 @@ Status CuckooTableBuilder::Finish() { ++num_added; s = file_->Append(GetKey(bucket.vector_idx)); if (s.ok()) { - s = file_->Append(GetValue(bucket.vector_idx)); + if (value_size_ > 0) { + s = file_->Append(GetValue(bucket.vector_idx)); + } } } if (!s.ok()) { diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index b1d7e649c..6898c1ef6 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -75,6 +75,7 @@ class CuckooTableBuilder: public TableBuilder { uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); + inline bool IsDeletedKey(uint64_t idx) const; inline Slice GetKey(uint64_t idx) const; inline Slice GetUserKey(uint64_t idx) const; inline Slice GetValue(uint64_t idx) const; @@ -88,14 +89,18 @@ class CuckooTableBuilder: public TableBuilder { uint64_t hash_table_size_; bool is_last_level_file_; bool has_seen_first_key_; + bool has_seen_first_value_; uint64_t key_size_; uint64_t value_size_; // A list of fixed-size key-value pairs concatenating into a string. 
// Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific // key / value given an index std::string kvs_; - // Number of key-value pairs stored in kvs_ + std::string deleted_keys_; + // Number of key-value pairs stored in kvs_ + number of deleted keys uint64_t num_entries_; + // Number of keys that have a value (i.e., non-deletion entries) + uint64_t num_values_; Status status_; TableProperties properties_; const Comparator* ucomp_; From 983d2de2de3468476c855be0ce405bd07dc31643 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 10:52:18 -0700 Subject: [PATCH 05/16] Add AUTHORS file. Fix #203 --- AUTHORS | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 AUTHORS diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..e644f5530 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,11 @@ +Facebook Inc. +Facebook Engineering Team + +Google Inc. +# Initial version authors: +Jeffrey Dean +Sanjay Ghemawat + +# Partial list of contributors: +Kevin Regan +Johan Bilien From 2faf49d5f15dc2e1c1bc79e84cee585631632019 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 11:09:09 -0700 Subject: [PATCH 06/16] use GetContext to replace callback function pointer Summary: Instead of passing a callback function pointer and its arg on the Table::Get() interface, pass a GetContext. This makes the interface cleaner and possibly improves performance. Also adds a fast path for SaveValue(). Test Plan: make all check Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24057 --- Makefile | 12 +- db/simple_table_db_test.cc | 815 -------------------- db/table_cache.cc | 11 +- db/table_cache.h | 7 +- db/version_set.cc | 114 +-- db/version_set.h | 22 - table/block_based_table_reader.cc | 12 +- table/block_based_table_reader.h | 7 +- table/cuckoo_table_reader.cc | 14 +- table/cuckoo_table_reader.h | 8 +- table/cuckoo_table_reader_test.cc | 108 +-- table/get_context.cc | 101 +++ table/get_context.h | 47 ++ table/plain_table_reader.cc | 10 +- table/plain_table_reader.h | 7 +- table/table_reader.h | 21 +- table/table_reader_bench.cc | 16 +- table/table_test.cc | 6 +- utilities/compacted_db/compacted_db_impl.cc | 50 +- utilities/compacted_db/compacted_db_impl.h | 3 - 20 files changed, 275 insertions(+), 1116 deletions(-) delete mode 100644 db/simple_table_db_test.cc create mode 100644 table/get_context.cc create mode 100644 table/get_context.h diff --git a/Makefile b/Makefile index 9d626e17f..75da74b08 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,6 @@ TESTS = \ reduce_levels_test \ plain_table_db_test \ prefix_test \ - simple_table_db_test \ skiplist_test \ stringappend_test \ ttl_test \ @@ -371,9 +370,6 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg @@ -523,11 +519,11 @@ libz.a: curl -O http://zlib.net/zlib-1.2.8.tar.gz tar xvzf zlib-1.2.8.tar.gz cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make - cp zlib-1.2.8/libz.a .
+ cp zlib-1.2.8/libz.a . libbz2.a: -rm -rf bzip2-1.0.6 - curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz + curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz tar xvzf bzip2-1.0.6.tar.gz cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' cp bzip2-1.0.6/libbz2.a . @@ -539,7 +535,7 @@ libsnappy.a: cd snappy-1.1.1 && ./configure --with-pic --enable-static cd snappy-1.1.1 && make cp snappy-1.1.1/.libs/libsnappy.a . - + rocksdbjavastatic: libz.a libbz2.a libsnappy.a OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j @@ -547,7 +543,7 @@ rocksdbjavastatic: libz.a libbz2.a libsnappy.a rm -f ./java/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) - + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc deleted file mode 100644 index 0a0ecf064..000000000 --- a/db/simple_table_db_test.cc +++ /dev/null @@ -1,815 +0,0 @@ -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "rocksdb/statistics.h" -#include "rocksdb/cache.h" -#include "rocksdb/compaction_filter.h" -#include "rocksdb/env.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/table_builder.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "utilities/merge_operators.h" - -using std::unique_ptr; - -// IS THIS FILE STILL NEEDED? -namespace rocksdb { - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. -// SimpleTable requires the input key size to be fixed 16 bytes, value cannot -// be longer than 150000 bytes and stored data on disk in this format: -// +--------------------------------------------+ <= key1 offset -// | key1 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value1 | -// | | -// +----------------------------------------+---+ <= key2 offset -// | key2 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value2 | -// | | -// | ...... | -// +-----------------+--------------------------+ <= index_block_offset -// | key1 | key1 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key2 | key2 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key3 | key3 offset (8 bytes) | -// +-----------------+--------------------------+ -// | ...... 
| -// +-----------------+------------+-------------+ -// | index_block_offset (8 bytes) | -// +------------------------------+ - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. -class SimpleTableReader: public TableReader { -public: - // Attempt to open the table that is stored in bytes [0..file_size) - // of "file", and read the metadata entries necessary to allow - // retrieving data from the table. - // - // If successful, returns ok and sets "*table" to the newly opened - // table. The client should delete "*table" when no longer needed. - // If there was an error while initializing the table, sets "*table" - // to nullptr and returns a non-ok status. Does not take ownership of - // "*source", but the client must ensure that "source" remains live - // for the duration of the returned table's lifetime. - // - // *file must remain live while this Table is in use. - static Status Open(const ImmutableCFOptions& options, - const EnvOptions& env_options, - unique_ptr && file, uint64_t file_size, - unique_ptr* table_reader); - - Iterator* NewIterator(const ReadOptions&, Arena* arena) override; - - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr) override; - - uint64_t ApproximateOffsetOf(const Slice& key) override; - - virtual size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override; - - std::shared_ptr GetTableProperties() const override; - - ~SimpleTableReader(); - -private: - struct Rep; - Rep* rep_; - - explicit SimpleTableReader(Rep* rep) { - rep_ = rep; - } - friend class TableCache; - friend class SimpleTableIterator; - - Status GetOffset(const Slice& target, uint64_t* offset); - - // No copying allowed - explicit SimpleTableReader(const TableReader&) = delete; - void operator=(const TableReader&) = delete; -}; - -// Iterator to iterate SimpleTable -class SimpleTableIterator: public Iterator { -public: - explicit SimpleTableIterator(SimpleTableReader* table); - ~SimpleTableIterator(); - - bool Valid() const; - - void SeekToFirst(); - - void SeekToLast(); - - void Seek(const Slice& target); - - void Next(); - - void Prev(); - - Slice key() const; - - Slice value() const; - - Status status() const; - -private: - SimpleTableReader* table_; - uint64_t offset_; - uint64_t next_offset_; - Slice key_; - Slice value_; - char tmp_str_[4]; - char* key_str_; - char* value_str_; - int value_str_len_; - Status status_; - // No copying allowed - SimpleTableIterator(const SimpleTableIterator&) = delete; - void operator=(const Iterator&) = delete; -}; - -struct SimpleTableReader::Rep { - ~Rep() { - } - Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - uint64_t index_start_offset, int num_entries) : - ioptions(ioptions), env_options(env_options), - index_start_offset(index_start_offset), num_entries(num_entries) { - } - - const ImmutableCFOptions& ioptions; - const EnvOptions& env_options; - Status status; - unique_ptr file; - uint64_t index_start_offset; - int num_entries; - std::shared_ptr table_properties; - - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } -}; - -SimpleTableReader::~SimpleTableReader() { - delete rep_; -} - -Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions, - 
const EnvOptions& env_options, - unique_ptr && file, - uint64_t size, - unique_ptr* table_reader) { - char footer_space[Rep::offset_length]; - Slice footer_input; - Status s = file->Read(size - Rep::offset_length, Rep::offset_length, - &footer_input, footer_space); - if (s.ok()) { - uint64_t index_start_offset = DecodeFixed64(footer_space); - - int num_entries = (size - Rep::offset_length - index_start_offset) - / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep( - ioptions, env_options, index_start_offset, num_entries); - - rep->file = std::move(file); - table_reader->reset(new SimpleTableReader(rep)); - } - return s; -} - -void SimpleTableReader::SetupForCompaction() { -} - -std::shared_ptr SimpleTableReader::GetTableProperties() - const { - return rep_->table_properties; -} - -Iterator* SimpleTableReader::NewIterator(const ReadOptions& options, - Arena* arena) { - if (arena == nullptr) { - return new SimpleTableIterator(this); - } else { - auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator)); - return new (mem) SimpleTableIterator(this); - } -} - -Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { - uint32_t left = 0; - uint32_t right = rep_->num_entries - 1; - char key_chars[Rep::GetInternalKeyLength()]; - Slice tmp_slice; - - uint32_t target_offset = 0; - while (left <= right) { - uint32_t mid = (left + right + 1) / 2; - - uint64_t offset_to_read = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; - Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), - &tmp_slice, key_chars); - if (!s.ok()) { - return s; - } - - InternalKeyComparator ikc(rep_->ioptions.comparator); - int compare_result = ikc.Compare(tmp_slice, target); - - if (compare_result < 0) { - if (left == right) { - target_offset = right + 1; - break; - } - left = mid; - } else { - if (left == right) { - target_offset = left; - break; - } - right = mid - 1; - } - } - - if (target_offset >= (uint32_t) rep_->num_entries) { - *offset = rep_->index_start_offset; - return Status::OK(); - } - - char value_offset_chars[Rep::offset_length]; - - int64_t offset_for_value_offset = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset - + Rep::GetInternalKeyLength(); - Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, - &tmp_slice, value_offset_chars); - if (s.ok()) { - *offset = DecodeFixed64(value_offset_chars); - } - return s; -} - -Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { - Status s; - SimpleTableIterator* iter = new SimpleTableIterator(this); - for (iter->Seek(k); iter->Valid(); iter->Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter->key(), &parsed_key)) { - return Status::Corruption(Slice()); - } - - if (!(*saver)(arg, parsed_key, iter->value())) { - break; - } - } - s = iter->status(); - delete iter; - return s; -} - -uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { - return 0; -} - -SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : - table_(table) { - key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; - value_str_len_ = -1; - SeekToFirst(); -} - -SimpleTableIterator::~SimpleTableIterator() { - delete[] key_str_; - if (value_str_len_ >= 0) { - delete[] value_str_; - } -} - -bool 
SimpleTableIterator::Valid() const { - return offset_ < table_->rep_->index_start_offset; -} - -void SimpleTableIterator::SeekToFirst() { - next_offset_ = 0; - Next(); -} - -void SimpleTableIterator::SeekToLast() { - assert(false); -} - -void SimpleTableIterator::Seek(const Slice& target) { - Status s = table_->GetOffset(target, &next_offset_); - if (!s.ok()) { - status_ = s; - } - Next(); -} - -void SimpleTableIterator::Next() { - offset_ = next_offset_; - if (offset_ >= table_->rep_->index_start_offset) { - return; - } - Slice result; - int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); - - Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, - key_str_); - next_offset_ += internal_key_size; - key_ = result; - - Slice value_size_slice; - s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); - next_offset_ += 4; - uint32_t value_size = DecodeFixed32(tmp_str_); - - Slice value_slice; - if ((int) value_size > value_str_len_) { - if (value_str_len_ >= 0) { - delete[] value_str_; - } - value_str_ = new char[value_size]; - value_str_len_ = value_size; - } - s = table_->rep_->file->Read(next_offset_, value_size, &value_slice, - value_str_); - next_offset_ += value_size; - value_ = value_slice; -} - -void SimpleTableIterator::Prev() { - assert(false); -} - -Slice SimpleTableIterator::key() const { - Log(table_->rep_->ioptions.info_log, "key!!!!"); - return key_; -} - -Slice SimpleTableIterator::value() const { - return value_; -} - -Status SimpleTableIterator::status() const { - return status_; -} - -class SimpleTableBuilder: public TableBuilder { -public: - // Create a builder that will store the contents of the table it is - // building in *file. Does not close the file. It is up to the - // caller to close the file after calling Finish(). The output file - // will be part of level specified by 'level'. A value of -1 means - // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, - CompressionType compression_type); - - // REQUIRES: Either Finish() or Abandon() has been called. - ~SimpleTableBuilder(); - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override; - - // Return non-ok iff some error has been detected. - Status status() const override; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns. - // REQUIRES: Finish(), Abandon() have not been called - Status Finish() override; - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon() override; - - // Number of calls to Add() so far. - uint64_t NumEntries() const override; - - // Size of the file generated so far. If invoked after a successful - // Finish() call, returns the size of the final generated file. 
- uint64_t FileSize() const override; - -private: - struct Rep; - Rep* rep_; - - // No copying allowed - SimpleTableBuilder(const SimpleTableBuilder&) = delete; - void operator=(const SimpleTableBuilder&) = delete; -}; - -struct SimpleTableBuilder::Rep { - const ImmutableCFOptions& ioptions; - WritableFile* file; - uint64_t offset = 0; - Status status; - - uint64_t num_entries = 0; - - bool closed = false; // Either Finish() or Abandon() has been called. - - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } - - std::string index; - - Rep(const ImmutableCFOptions& iopt, WritableFile* f) : - ioptions(iopt), file(f) { - } - ~Rep() { - } -}; - -SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions, - WritableFile* file, - CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(ioptions, file)) { -} - -SimpleTableBuilder::~SimpleTableBuilder() { - delete (rep_); -} - -void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { - assert((int ) key.size() == Rep::GetInternalKeyLength()); - - // Update index - rep_->index.append(key.data(), key.size()); - PutFixed64(&(rep_->index), rep_->offset); - - // Write key-value pair - rep_->file->Append(key); - rep_->offset += Rep::GetInternalKeyLength(); - - std::string size; - int value_size = value.size(); - PutFixed32(&size, value_size); - Slice sizeSlice(size); - rep_->file->Append(sizeSlice); - rep_->file->Append(value); - rep_->offset += value_size + 4; - - rep_->num_entries++; -} - -Status SimpleTableBuilder::status() const { - return Status::OK(); -} - -Status SimpleTableBuilder::Finish() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; - - uint64_t index_offset = rep_->offset; - Slice index_slice(rep_->index); - rep_->file->Append(index_slice); - rep_->offset += index_slice.size(); - - std::string index_offset_str; - PutFixed64(&index_offset_str, index_offset); - Slice foot_slice(index_offset_str); - rep_->file->Append(foot_slice); - rep_->offset += foot_slice.size(); - - return Status::OK(); -} - -void SimpleTableBuilder::Abandon() { - rep_->closed = true; -} - -uint64_t SimpleTableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t SimpleTableBuilder::FileSize() const { - return rep_->offset; -} - -class SimpleTableFactory: public TableFactory { -public: - ~SimpleTableFactory() { - } - SimpleTableFactory() { - } - const char* Name() const override { - return "SimpleTable"; - } - Status NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const; - - TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_key, - WritableFile* file, - const CompressionType compression_type, - const CompressionOptions& compression_opts) const; - - virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override { - return Status::OK(); - } - - virtual std::string GetPrintableTableOptions() const override { - return std::string(); - } -}; - -Status SimpleTableFactory::NewTableReader( - const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const { - - return SimpleTableReader::Open(ioptions, env_options, 
std::move(file), - file_size, table_reader); -} - -TableBuilder* SimpleTableFactory::NewTableBuilder( - const ImmutableCFOptions& ioptions, - const InternalKeyComparator& internal_key, - WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) const { - return new SimpleTableBuilder(ioptions, file, compression_type); -} - -class SimpleTableDBTest { -protected: -public: - std::string dbname_; - Env* env_; - DB* db_; - - Options last_options_; - - SimpleTableDBTest() : - env_(Env::Default()) { - dbname_ = test::TmpDir() + "/simple_table_db_test"; - ASSERT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; - Reopen(); - } - - ~SimpleTableDBTest() { - delete db_; - ASSERT_OK(DestroyDB(dbname_, Options())); - } - - // Return the current option configuration. - Options CurrentOptions() { - Options options; - options.table_factory.reset(new SimpleTableFactory()); - return options; - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); - } - - void Close() { - delete db_; - db_ = nullptr; - } - - void DestroyAndReopen(Options* options = nullptr) { - //Destroy using last options - Destroy(&last_options_); - ASSERT_OK(TryReopen(options)); - } - - void Destroy(Options* options) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, *options)); - } - - Status PureReopen(Options* options, DB** db) { - return DB::Open(*options, dbname_, db); - } - - Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const Slice& k, const Slice& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - // Return spread of files per level - std::string FilesPerLevel() { - std::string result; - int last_non_zero_offset = 0; - for (int level = 0; level < db_->NumberLevels(); level++) { - int f = NumTableFilesAtLevel(level); - char buf[100]; - snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); - result += buf; - if (f > 0) { - last_non_zero_offset = result.size(); - } - } - result.resize(last_non_zero_offset); - return result; - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(SimpleTableDBTest, Empty) { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); -} - -TEST(SimpleTableDBTest, ReadWrite) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_EQ("v1", Get("0000000000000foo")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush2) { - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("0000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); - - ASSERT_OK(Put("0000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("0000000000000foo")); - - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); - - ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - - ASSERT_OK(Put("0000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key_______%06d", i); - return std::string(buf); -} - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -TEST(SimpleTableDBTest, CompactionTrigger) { - Options options = CurrentOptions(); - options.write_buffer_size = 100 << 10; //100KB - options.num_levels = 3; - options.max_mem_compaction_level = 0; - options.level0_file_num_compaction_trigger = 3; - Reopen(&options); - - Random rnd(301); - - for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; - num++) { - std::vector values; - // Write 120KB (12 values, each 10K) - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); - } - - //generate one more file in level-0, and should trigger level-0 compaction - std::vector values; - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForCompact(); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 1); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/db/table_cache.cc b/db/table_cache.cc index 5cb96f8bf..580e8049d 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -15,6 +15,7 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/table_reader.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -132,10 +133,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status 
TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, const Slice& k, void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + const FileDescriptor& fd, const Slice& k, + GetContext* get_context) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; @@ -147,13 +146,13 @@ Status TableCache::Get(const ReadOptions& options, } } if (s.ok()) { - s = t->Get(options, k, arg, saver, mark_key_may_exist); + s = t->Get(options, k, get_context); if (handle != nullptr) { ReleaseHandle(handle); } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set - (*mark_key_may_exist)(arg); + get_context->MarkKeyMayExist(); return Status::OK(); } return s; diff --git a/db/table_cache.h b/db/table_cache.h index 2f6740d9f..76bb1c0a2 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -27,6 +27,7 @@ namespace rocksdb { class Env; class Arena; struct FileDescriptor; +class GetContext; class TableCache { public: @@ -52,10 +53,8 @@ class TableCache { // it returns false. Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, const Slice& k, void* arg, - bool (*handle_result)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*) = nullptr); + const FileDescriptor& file_fd, const Slice& k, + GetContext* get_context); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); diff --git a/db/version_set.cc b/db/version_set.cc index 0a46d7edc..10649fa6c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -37,6 +37,7 @@ #include "table/format.h" #include "table/plain_table_factory.h" #include "table/meta_blocks.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -627,81 +628,6 @@ void Version::AddIterators(const ReadOptions& read_options, } -// Called from TableCache::Get and Table::Get when file/block in which -// key may exist are not there in TableCache/BlockCache respectively. In this -// case we can't guarantee that key does not exist and are not permitted to do -// IO to be certain.Set the status=kFound and value_found=false to let the -// caller know that key may exist but is not there in memory -void MarkKeyMayExist(void* arg) { - Version::Saver* s = reinterpret_cast(arg); - s->state = Version::kFound; - if (s->value_found != nullptr) { - *(s->value_found) = false; - } -} - -bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v) { - Version::Saver* s = reinterpret_cast(arg); - MergeContext* merge_contex = s->merge_context; - std::string merge_result; // temporary area for merge results later - - assert(s != nullptr && merge_contex != nullptr); - - // TODO: Merge? - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - if (Version::kNotFound == s->state) { - s->state = Version::kFound; - s->value->assign(v.data(), v.size()); - } else if (Version::kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = Version::kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = Version::kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeDeletion: - if (Version::kNotFound == s->state) { - s->state = Version::kDeleted; - } else if (Version::kMerge == s->state) { - s->state = Version::kFound; - if (!s->merge_operator->FullMerge(s->user_key, nullptr, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = Version::kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeMerge: - assert(s->state == Version::kNotFound || s->state == Version::kMerge); - s->state = Version::kMerge; - merge_contex->PushOperand(v); - return true; - - default: - assert(false); - break; - } - } - - // s->state could be Corrupt, merge or notfound - - return false; -} Version::Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number) @@ -756,46 +682,42 @@ void Version::Get(const ReadOptions& options, Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); - Saver saver; - saver.state = status->ok()? kNotFound : kMerge; - saver.ucmp = user_comparator_; - saver.user_key = user_key; - saver.value_found = value_found; - saver.value = value; - saver.merge_operator = merge_operator_; - saver.merge_context = merge_context; - saver.logger = info_log_; - saver.statistics = db_statistics_; + + GetContext get_context(user_comparator_, merge_operator_, info_log_, + db_statistics_, status->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + user_key, value, value_found, merge_context); FilePicker fp(files_, user_key, ikey, &file_levels_, num_non_empty_levels_, &file_indexer_, user_comparator_, internal_comparator_); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, - &saver, SaveValue, MarkKeyMayExist); + &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; } - switch (saver.state) { - case kNotFound: - break; // Keep searching in other files - case kFound: + switch (get_context.State()) { + case GetContext::kNotFound: + // Keep searching in other files + break; + case GetContext::kFound: return; - case kDeleted: - *status = Status::NotFound(); // Use empty error message for speed + case GetContext::kDeleted: + // Use empty error message for speed + *status = Status::NotFound(); return; - case kCorrupt: + case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case kMerge: + case GetContext::kMerge: break; } f = fp.GetNextFile(); } - if (kMerge == saver.state) { + if (GetContext::kMerge == get_context.State()) { if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -804,7 +726,7 @@ void Version::Get(const ReadOptions& options, // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; if (merge_operator_->FullMerge(user_key, nullptr, - saver.merge_context->GetOperands(), value, + merge_context->GetOperands(), value, info_log_)) { *status = Status::OK(); } else { diff --git a/db/version_set.h b/db/version_set.h index 9e6cc1e34..4a27a9592 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -241,28 +241,6 @@ class Version { FileMetaData* file; }; - enum SaverState { - kNotFound, - kFound, - kDeleted, - kCorrupt, - kMerge // saver contains the current merge result (the operands) - }; - - // Callback from TableCache::Get() - struct Saver { - SaverState state; - const Comparator* ucmp; - Slice user_key; - bool* value_found; // Is value set correctly? 
Used by KeyMayExist - std::string* value; - const MergeOperator* merge_operator; - // the merge operations encountered; - MergeContext* merge_context; - Logger* logger; - Statistics* statistics; - }; - private: friend class Compaction; friend class VersionSet; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 09328dc3b..4b2050e03 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -33,6 +33,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/perf_context_imp.h" @@ -1100,10 +1101,8 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, } Status BlockBasedTable::Get( - const ReadOptions& read_options, const Slice& key, void* handle_context, - bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { + const ReadOptions& read_options, const Slice& key, + GetContext* get_context) { Status s; auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; @@ -1141,7 +1140,7 @@ Status BlockBasedTable::Get( // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for whether // we can guarantee the key is not there when "no_io" is set - (*mark_key_may_exist_handler)(handle_context); + get_context->MarkKeyMayExist(); break; } if (!biter.status().ok()) { @@ -1156,8 +1155,7 @@ Status BlockBasedTable::Get( s = Status::Corruption(Slice()); } - if (!(*result_handler)(handle_context, parsed_key, - biter.value())) { + if (!get_context->SaveValue(parsed_key, biter.value())) { done = true; break; } diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 503a91bb3..b272c4d13 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -40,6 +40,7 @@ class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; +class GetContext; using std::unique_ptr; @@ -76,11 +77,7 @@ class BlockBasedTable : public TableReader { Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; Status Get(const ReadOptions& readOptions, const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, - const ParsedInternalKey& k, const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = - nullptr) override; + GetContext* get_context) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index af6fe6e88..f8da4e288 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -19,6 +19,7 @@ #include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -126,11 +127,8 @@ CuckooTableReader::CuckooTableReader( status_ = file_->Read(0, file_size, &file_data_, nullptr); } -Status CuckooTableReader::Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { +Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& 
key, + GetContext* get_context) { assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { @@ -149,14 +147,12 @@ Status CuckooTableReader::Get( if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { Slice value(bucket + key_length_, value_length_); if (is_last_level_) { - ParsedInternalKey found_ikey( - Slice(bucket, key_length_), 0, kTypeValue); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(value); } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; ParseInternalKey(full_key, &found_ikey); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(found_ikey, value); } // We don't support merge operations. So, we return here. return Status::OK(); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 8f7635cfa..4f00a9e41 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -40,12 +40,8 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } - Status Get( - const ReadOptions& read_options, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context) override; Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; void Prepare(const Slice& target) override; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 6566b7a29..66d88fc71 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -25,6 +25,7 @@ int main() { #include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/random.h" #include "util/testharness.h" @@ -61,25 +62,6 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index, return hash_map[s.ToString()][index]; } -// Methods, variables for checking key and values read. -struct ValuesToAssert { - ValuesToAssert(const std::string& key, const Slice& value) - : expected_user_key(key), - expected_value(value), - call_count(0) {} - std::string expected_user_key; - Slice expected_value; - int call_count; -}; - -bool AssertValues(void* assert_obj, - const ParsedInternalKey& k, const Slice& v) { - ValuesToAssert *ptr = reinterpret_cast(assert_obj); - ASSERT_EQ(ptr->expected_value.ToString(), v.ToString()); - ASSERT_EQ(ptr->expected_user_key, k.user_key.ToString()); - ++ptr->call_count; - return false; -} } // namespace class CuckooReaderTest { @@ -134,11 +116,14 @@ class CuckooReaderTest { ucomp, GetSliceHash); ASSERT_OK(reader.status()); + // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { - ValuesToAssert v(user_keys[i], values[i]); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(keys[i]), &v, AssertValues, nullptr)); - ASSERT_EQ(1, v.call_count); + std::string value; + GetContext get_context(ucomp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(user_keys[i]), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context)); + ASSERT_EQ(values[i], value); } } void UpdateKeys(bool with_zero_seqno) { @@ -329,6 +314,7 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Make all hash values collide. 
AddHashLookups(user_keys[i], 0, kNumHashFunc); } + auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); @@ -337,7 +323,7 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ioptions, std::move(read_file), file_size, - BytewiseComparator(), + ucmp, GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. @@ -346,10 +332,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { AddHashLookups(not_found_user_key, 0, kNumHashFunc); ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue); AppendInternalKey(¬_found_key, ikey); - ValuesToAssert v("", ""); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + std::string value; + GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, + Slice(not_found_key), &value, nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Search for a key with an independent hash value. std::string not_found_user_key2 = "key" + NumToStr(num_items + 1); @@ -357,9 +344,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue); std::string not_found_key2; AppendInternalKey(¬_found_key2, ikey2); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key2), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context2(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(not_found_key2), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Test read when key is unused key. @@ -369,34 +358,16 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Add hash values that map to empty buckets. AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc, kNumHashFunc); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(unused_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context3(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(unused_key), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); } // Performance tests namespace { -int64_t found_count = 0; -std::string value; -bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { - // Deliberately empty. 
- if (*reinterpret_cast(k.user_key.data()) == - *reinterpret_cast(v.data())) { - ++found_count; - value.assign(v.data(), v.size()); - } - return false; -} - -bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { - ++*reinterpret_cast(cnt_ptr); - std::string expected_value; - AppendInternalKey(&expected_value, k); - ASSERT_EQ(0, v.compare(Slice(&expected_value[0], v.size()))); - return false; -} - void GetKeys(uint64_t num, std::vector* keys) { keys->clear(); IterKey k; @@ -457,13 +428,15 @@ void WriteFile(const std::vector& keys, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; + std::string value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { - int cnt = 0; - ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr)); - if (cnt != 1) { - fprintf(stderr, "%" PRIu64 " not found.\n", i); - ASSERT_EQ(1, cnt); - } + value.clear(); + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context)); + ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); } } @@ -501,7 +474,11 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { } std::random_shuffle(keys.begin(), keys.end()); - found_count = 0; + std::string value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { @@ -510,20 +487,19 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), - nullptr, DoNothing, nullptr); + &get_context); } } } else { for (uint64_t i = 0; i < num; i++) { reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), - nullptr, DoNothing, nullptr); + &get_context); } } float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; fprintf(stderr, - "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u, " - "# of found keys %" PRId64 "\n", - time_per_op, 1.0 / time_per_op, batch_size, found_count); + "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n", + time_per_op, 1.0 / time_per_op, batch_size); } } // namespace. diff --git a/table/get_context.cc b/table/get_context.cc new file mode 100644 index 000000000..59dfa41e6 --- /dev/null +++ b/table/get_context.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/get_context.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" + +namespace rocksdb { + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + value_(ret_value), + value_found_(value_found), + merge_context_(merge_context) { +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +void GetContext::MarkKeyMayExist() { + state_ = kFound; + if (value_found_ != nullptr) { + *value_found_ = false; + } +} + +void GetContext::SaveValue(const Slice& value) { + state_ = kFound; + value_->assign(value.data(), value.size()); +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value) { + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) { + // Key matches. Process it + switch (parsed_key.type) { + case kTypeValue: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kFound; + value_->assign(value.data(), value.size()); + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, &value, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeDeletion: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, nullptr, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + merge_context_->PushOperand(value); + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +} // namespace rocksdb diff --git a/table/get_context.h b/table/get_context.h new file mode 100644 index 000000000..a38f3c533 --- /dev/null +++ b/table/get_context.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once +#include <string> +#include "db/merge_context.h" + +namespace rocksdb { +class MergeContext; + +class GetContext { + public: + enum GetState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) + }; + + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context); + + void MarkKeyMayExist(); + void SaveValue(const Slice& value); + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value); + GetState State() const { return state_; } + + private: + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + std::string* value_; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; +}; + +} // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index 3a6d48be8..db37241a9 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -26,6 +26,7 @@ #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" #include "table/plain_table_key_coding.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -525,10 +526,7 @@ void PlainTableReader::Prepare(const Slice& target) { } Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + GetContext* get_context) { // Check bloom filter first. Slice prefix_slice; uint32_t prefix_hash; @@ -580,8 +578,10 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } prefix_match = true; } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!(*saver)(arg, found_key, found_value)) { + if (!get_context->SaveValue(found_key, found_value)) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index fcc94a53e..531ac8e8b 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -36,6 +36,7 @@ class TableCache; class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; +class GetContext; using std::unique_ptr; using std::unordered_map; @@ -65,10 +66,8 @@ class PlainTableReader: public TableReader { void Prepare(const Slice& target); - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr); + Status Get(const ReadOptions&, const Slice& key, + GetContext* get_context) override; uint64_t ApproximateOffsetOf(const Slice& key); diff --git a/table/table_reader.h b/table/table_reader.h index 22f5a859e..2f6360ad1 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -18,6 +18,7 @@ class Slice; class Arena; struct ReadOptions; struct TableProperties; +class GetContext; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -55,23 +56,17 @@ class TableReader { // Report an approximation of how much memory has been used.
virtual size_t ApproximateMemoryUsage() const = 0; - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. May not make such a call if filter policy says that key - // is not present. + // Calls get_context->SaveValue() repeatedly, starting with + // the entry found after a call to Seek(key), until it returns false. + // May not make such a call if filter policy says that key is not present. // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. + // get_context->MarkKeyMayExist needs to be called when it is configured to be + // memory only and the key is not found in the block cache. // // readOptions is the options for the read // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + virtual Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context) = 0; }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index aa791f4c4..52fa20ec0 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,6 +22,7 @@ int main() { #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" +#include "table/get_context.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -48,11 +49,6 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, - const Slice& v) { - return false; -} - uint64_t Now(Env* env, bool measured_by_nanosecond) { return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); } @@ -131,7 +127,6 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string result; HistogramImpl hist; - void* arg = nullptr; for (int it = 0; it < num_iter; it++) { for (int i = 0; i < num_keys1; i++) { for (int j = 0; j < num_keys2; j++) { @@ -147,8 +142,13 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string key = MakeKey(r1, r2, through_db); uint64_t start_time = Now(env, measured_by_nanosecond); if (!through_db) { - s = table_reader->Get(read_options, key, arg, DummySaveValue, - nullptr); + std::string value; + MergeContext merge_context; + GetContext get_context(ioptions.comparator, ioptions.merge_operator, + ioptions.info_log, ioptions.statistics, + GetContext::kNotFound, Slice(key), &value, + nullptr, &merge_context); + s = table_reader->Get(read_options, key, &get_context); } else { s = db->Get(read_options, key, &result); } diff --git a/table/table_test.cc b/table/table_test.cc index 776490871..1b032db53 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -37,6 +37,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/get_context.h" #include "util/random.h" #include "util/statistics.h" @@ -1485,8 +1486,11 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { } { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), nullptr, + nullptr, nullptr); // a hack just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr); + reader->Get(ReadOptions(), "non-exist-key", &get_context); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 431eb3ba7..70ddb27cf 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -7,13 +7,13 @@ #include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" #include "db/version_set.h" -#include "db/merge_context.h" +#include "table/get_context.h" namespace rocksdb { extern void MarkKeyMayExist(void* arg); extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v); + const Slice& v, bool hit_and_return); CompactedDBImpl::CompactedDBImpl( const DBOptions& options, const std::string& dbname) @@ -44,25 +44,12 @@ size_t CompactedDBImpl::FindFile(const Slice& key) { Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, std::string* value) { - const FdWithKeyRange& f = files_.files[FindFile(key)]; - - bool value_found; - MergeContext merge_context; - Version::Saver saver; - saver.state = Version::kNotFound; - saver.ucmp = user_comparator_; - saver.user_key = key; - saver.value_found = &value_found; - saver.value = value; - saver.merge_operator = nullptr; - saver.merge_context = &merge_context; - saver.logger = info_log_; - saver.statistics = statistics_; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, key, value, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); - f.fd.table_reader->Get(options, lkey.internal_key(), - reinterpret_cast<void*>(&saver), SaveValue, - MarkKeyMayExist); - if (saver.state == Version::kFound) { + files_.files[FindFile(key)].fd.table_reader->Get( + options, lkey.internal_key(), &get_context); + if
(get_context.State() == GetContext::kFound) { return Status::OK(); } return Status::NotFound(); @@ -84,26 +71,15 @@ std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options, } std::vector<Status> statuses(keys.size(), Status::NotFound()); values->resize(keys.size()); - bool value_found; - MergeContext merge_context; - Version::Saver saver; - saver.ucmp = user_comparator_; - saver.value_found = &value_found; - saver.merge_operator = nullptr; - saver.merge_context = &merge_context; - saver.logger = info_log_; - saver.statistics = statistics_; int idx = 0; for (auto* r : reader_list) { if (r != nullptr) { - saver.state = Version::kNotFound; - saver.user_key = keys[idx]; - saver.value = &(*values)[idx]; + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[idx], &(*values)[idx], + nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), - reinterpret_cast<void*>(&saver), SaveValue, - MarkKeyMayExist); - if (saver.state == Version::kFound) { + r->Get(options, lkey.internal_key(), &get_context); + if (get_context.State() == GetContext::kFound) { statuses[idx] = Status::OK(); } } @@ -128,8 +104,6 @@ Status CompactedDBImpl::Init(const Options& options) { } version_ = cfd_->GetSuperVersion()->current; user_comparator_ = cfd_->user_comparator(); - statistics_ = cfd_->ioptions()->statistics; - info_log_ = cfd_->ioptions()->info_log; // L0 should not have files if (version_->file_levels_[0].num_files > 1) { return Status::NotSupported("L0 contains more than 1 file"); } diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h index ef3effced..49aca53b1 100644 --- a/utilities/compacted_db/compacted_db_impl.h +++ b/utilities/compacted_db/compacted_db_impl.h @@ -88,9 +88,6 @@ class CompactedDBImpl : public DBImpl { const Comparator* user_comparator_; FileLevel files_; - Statistics* statistics_; - Logger* info_log_; - // No copying allowed CompactedDBImpl(const CompactedDBImpl&); void operator=(const CompactedDBImpl&); From fd5d80d55e9bcb8c0dd4b6200937e0727fd4b819 Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Mon, 29 Sep 2014 12:45:04 -0700 Subject: [PATCH 07/16] CompactedDB: log using the correct info_log Summary: info_log from the supplied Options can be nullptr. Use the one from db_impl instead.
Also call flush after that, since no more logging will happen and the LOG can otherwise contain partial output Test Plan: verified with db_bench Reviewers: igor, yhchiang, sdong Reviewed By: sdong Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24183 --- db/db_impl_readonly.cc | 3 ++- utilities/compacted_db/compacted_db_impl.cc | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 98e2bfeb0..9faebd8c2 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -14,7 +14,8 @@ namespace rocksdb { DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { - Log(db_options_.info_log, "Opening the db in read only mode"); + Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode"); + LogFlush(db_options_.info_log); } DBImplReadOnly::~DBImplReadOnly() { diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc index 70ddb27cf..775033e2a 100644 --- a/utilities/compacted_db/compacted_db_impl.cc +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -144,8 +144,10 @@ Status CompactedDBImpl::Open(const Options& options, std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname)); Status s = db->Init(options); if (s.ok()) { + Log(INFO_LEVEL, db->db_options_.info_log, + "Opened the db as fully compacted mode"); + LogFlush(db->db_options_.info_log); *dbptr = db.release(); - Log(options.info_log, "Opened the db as fully compacted mode"); } return s; } From 827e31c746b64668f9ea67e07d9df706fd69668b Mon Sep 17 00:00:00 2001 From: erik Date: Mon, 29 Sep 2014 14:52:16 -0700 Subject: [PATCH 08/16] Make test use a compatible type in the size checks. --- db/db_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/db_test.cc b/db/db_test.cc index ab290d108..c09cc74df 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1349,8 +1349,8 @@ TEST(DBTest, CompactedDB) { std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"), Slice("ggg"), Slice("iii"), Slice("kkk")}), &values); - ASSERT_EQ(status_list.size(), 6); - ASSERT_EQ(values.size(), 6); + ASSERT_EQ(status_list.size(), static_cast<size_t>(6)); + ASSERT_EQ(values.size(), static_cast<size_t>(6)); ASSERT_OK(status_list[0]); ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); ASSERT_TRUE(status_list[1].IsNotFound()); From 56ebd408776620246de2db83f9fa73bf60652144 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:10:37 -0700 Subject: [PATCH 09/16] Fix arc lint (should fix #238) Summary: See https://secure.phabricator.com/D9114 Test Plan: arc lint Reviewers: yhchiang Reviewed By: yhchiang Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D24195 --- linters/lint_engine/FacebookFbcodeLintEngine.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index cb9cf9bdb..3d28de7dd 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,7 +36,7 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; - $pep8_options = $this->getPEP8WithTextOptions().',E302'; + $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; $python_linter = new ArcanistPEP8Linter(); $python_linter->setConfig(array('options' => $pep8_options)); From 747523d241c472de64f4689e85a7e01f19534b15 Mon Sep 17 00:00:00 2001 From: Mark
Callaghan Date: Mon, 29 Sep 2014 09:50:41 -0700 Subject: [PATCH 10/16] Print per column family metrics in db_bench Summary: see above Test Plan: make check, ran db_bench and looked at output - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Differential Revision: https://reviews.facebook.net/D24189 --- db/db_bench.cc | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 85e840a7f..3dec1f321 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -636,6 +636,14 @@ static void AppendWithSpace(std::string* str, Slice msg) { str->append(msg.data(), msg.size()); } +struct DBWithColumnFamilies { + std::vector<ColumnFamilyHandle*> cfh; + DB* db; + DBWithColumnFamilies() : db(nullptr) { + cfh.clear(); + } +}; + class Stats { private: int id_; @@ -699,7 +707,7 @@ class Stats { void SetId(int id) { id_ = id; } void SetExcludeFromMerge() { exclude_from_merge_ = true; } - void FinishedOps(DB* db, int64_t num_ops) { + void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) { if (FLAGS_histogram) { double now = FLAGS_env->NowMicros(); double micros = now - last_op_finish_; @@ -739,8 +747,17 @@ class Stats { if (FLAGS_stats_per_interval) { std::string stats; - if (db && db->GetProperty("rocksdb.stats", &stats)) + + if (db_with_cfh && db_with_cfh->cfh.size()) { + for (size_t i = 0; i < db_with_cfh->cfh.size(); ++i) { + if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", + &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + } + + } else if (db && db->GetProperty("rocksdb.stats", &stats)) { + fprintf(stderr, "%s\n", stats.c_str()); + } } fflush(stderr); @@ -859,13 +876,6 @@ class Benchmark { std::shared_ptr<Cache> compressed_cache_; std::shared_ptr<const FilterPolicy> filter_policy_; const SliceTransform* prefix_extractor_; - struct DBWithColumnFamilies { - std::vector<ColumnFamilyHandle*> cfh; - DB* db; - DBWithColumnFamilies() : db(nullptr) { - cfh.clear(); - } - }; DBWithColumnFamilies db_; std::vector<DBWithColumnFamilies> multi_dbs_; int64_t num_; @@ -1480,7 +1490,7 @@ class Benchmark { uint32_t crc = 0; while (bytes < 500 * 1048576) { crc = crc32c::Value(data.data(), size); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1499,7 +1509,7 @@ class Benchmark { unsigned int xxh32 = 0; while (bytes < 500 * 1048576) { xxh32 = XXH32(data.data(), size, 0); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1520,7 +1530,7 @@ class Benchmark { ptr = ap.Acquire_Load(); } count++; - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (ptr == nullptr) exit(1); // Disable unused variable warning.
} @@ -1561,7 +1571,7 @@ class Benchmark { } produced += compressed.size(); bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -1642,7 +1652,7 @@ class Benchmark { } delete[] uncompressed; bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -2022,7 +2032,8 @@ class Benchmark { bytes += value_size_ + key_size_; } s = db_with_cfh->db->Write(write_options_, &batch); - thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, + entries_per_batch_); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); @@ -2047,7 +2058,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; @@ -2070,7 +2081,7 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; @@ -2105,7 +2116,7 @@ class Benchmark { ++nonexist; } } - thread->stats.FinishedOps(db, 100); + thread->stats.FinishedOps(nullptr, db, 100); } while (!duration.Done(100)); char msg[100]; @@ -2147,7 +2158,7 @@ class Benchmark { if (s.ok()) { found++; } - thread->stats.FinishedOps(db_with_cfh->db, 1); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); } char msg[100]; @@ -2189,7 +2200,7 @@ class Benchmark { ++found; } } - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); } for (auto& k : keys) { delete k.data(); @@ -2208,7 +2219,7 @@ class Benchmark { DB* db = SelectDB(thread); Iterator* iter = db->NewIterator(options); delete iter; - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } } @@ -2272,7 +2283,7 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); } delete single_iter; for (auto iter : multi_iters) { @@ -2312,7 +2323,7 @@ class Benchmark { batch.Delete(key); } auto s = db->Write(write_options_, &batch); - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "del error: %s\n", s.ToString().c_str()); exit(1); @@ -2372,7 +2383,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); ++num_writes; if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { @@ -2532,7 +2543,7 @@ class Benchmark { deletes_done++; } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2590,7 +2601,7 @@ class Benchmark { put_weight--; writes_done++; } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ @@ -2624,7 +2635,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + 
thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2671,7 +2682,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; @@ -2707,7 +2718,7 @@ class Benchmark { fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } // Print some statistics @@ -2768,7 +2779,7 @@ class Benchmark { } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; From b2d64a4861beef3cbf0c7270cbf5256ce6a658b4 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:48:27 -0700 Subject: [PATCH 11/16] Fix linters, second try --- linters/lint_engine/FacebookFbcodeLintEngine.php | 5 ----- 1 file changed, 5 deletions(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 3d28de7dd..33d2f0de3 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -39,13 +39,8 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; $python_linter = new ArcanistPEP8Linter(); - $python_linter->setConfig(array('options' => $pep8_options)); $linters[] = $python_linter; - $python_2space_linter = new ArcanistPEP8Linter(); - $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); - $linters[] = $python_2space_linter; - // Currently we can't run cpplint in commit hook mode, because it // depends on having access to the working directory. if (!$this->getCommitHookMode()) { From 99fb613e542db5fcf91dba976e98136014aee7b7 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 29 Sep 2014 15:52:12 -0700 Subject: [PATCH 12/16] remove 2 space linter --- linters/lint_engine/FacebookFbcodeLintEngine.php | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index 33d2f0de3..6765c33d1 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,8 +36,6 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; - $pep8_options = '--ignore=E101,E501,W291,W292,W293,E302'; - $python_linter = new ArcanistPEP8Linter(); $linters[] = $python_linter; @@ -114,11 +112,7 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $dir = dirname($dir); } while ($dir != '/' && $dir != '.'); - if ($space_count == 4) { - $cur_path_linter = $python_linter; - } else { - $cur_path_linter = $python_2space_linter; - } + $cur_path_linter = $python_linter; $cur_path_linter->addPath($path); $cur_path_linter->addData($path, $this->loadData($path)); From f0f795549745c54ebc8cef85fdb3809f481a1ab1 Mon Sep 17 00:00:00 2001 From: "mike@arpaia.co" Date: Mon, 29 Sep 2014 16:05:25 -0700 Subject: [PATCH 13/16] Fixing compile errors on OS X Summary: Building master on OS X has some compile errors due to implicit type conversions that generate warnings, which RocksDB's build settings raise as errors. Test Plan: It compiles!
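For reference, a minimal standalone sketch of the portable-format idiom the diff below relies on (illustrative only, not part of the patch):

```cpp
#include <cinttypes>  // PRIi64
#include <cstdint>
#include <cstdio>

int main() {
  int64_t entries_per_batch = 100;
  // "%ld" matches int64_t only on platforms where long is 64 bits; the
  // PRIi64 macro expands to the correct conversion specifier everywhere,
  // so no implicit-conversion warning is generated.
  fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch);
  return 0;
}
```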
Reviewers: ljin, igor Reviewed By: ljin Differential Revision: https://reviews.facebook.net/D24135 --- db/db_bench.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 3dec1f321..bbd807c2c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -1278,7 +1278,8 @@ class Benchmark { method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { entries_per_batch_ = FLAGS_batch_size; - fprintf(stderr, "entries_per_batch_ = %ld\n", entries_per_batch_); + fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", + entries_per_batch_); method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { ++key_size_; From ee1f3ccb061da17c5d8904b734205faa360d36a6 Mon Sep 17 00:00:00 2001 From: "mike@arpaia.co" Date: Mon, 29 Sep 2014 16:09:46 -0700 Subject: [PATCH 14/16] Package generation for Ubuntu and CentOS Summary: I put together a script to assist in the generation of debs and rpms. I've tested that this works on ubuntu via vagrant. I've included the Vagrantfile here, but I can remove it if it's not useful. The package.sh script should work on any ubuntu or centos machine, I just added a bit of logic in there to allow a base Ubuntu or Centos machine to be able to build RocksDB from scratch. Example output on Ubuntu 14.04: ``` root@vagrant-ubuntu-trusty-64:/vagrant# ./tools/package.sh [+] g++-4.7 is already installed. skipping. [+] libgflags-dev is already installed. skipping. [+] ruby-all-dev is already installed. skipping. [+] fpm is already installed. skipping. Created package {:path=>"rocksdb_3.5_amd64.deb"} root@vagrant-ubuntu-trusty-64:/vagrant# dpkg --info rocksdb_3.5_amd64.deb new debian package, version 2.0. size 17392022 bytes: control archive=1518 bytes. 275 bytes, 11 lines control 2911 bytes, 38 lines md5sums Package: rocksdb Version: 3.5 License: BSD Vendor: Facebook Architecture: amd64 Maintainer: rocksdb@fb.com Installed-Size: 83358 Section: default Priority: extra Homepage: http://rocksdb.org/ Description: RocksDB is an embeddable persistent key-value store for fast storage. ``` Example output on CentOS 6.5: ``` [root@localhost vagrant]# rpm -qip rocksdb-3.5-1.x86_64.rpm Name : rocksdb Relocations: /usr Version : 3.5 Vendor: Facebook Release : 1 Build Date: Mon 29 Sep 2014 01:26:11 AM UTC Install Date: (not installed) Build Host: localhost Group : default Source RPM: rocksdb-3.5-1.src.rpm Size : 96231106 License: BSD Signature : (none) Packager : rocksdb@fb.com URL : http://rocksdb.org/ Summary : RocksDB is an embeddable persistent key-value store for fast storage. Description : RocksDB is an embeddable persistent key-value store for fast storage. ``` Test Plan: How this gets used is really up to the RocksDB core team. If you want to actually get this into mainline, you might have to change `make install` such that it installs the RocksDB shared object file as well, which would require you to link against gflags (maybe?) and that would require some potential modifications to the script here (basically add a depends on that package). Currently, this will install the headers and a pre-compiled statically linked object file. If that's what you want out of life, then this requires no modifications.
Reviewers: ljin, yhchiang, igor Reviewed By: igor Differential Revision: https://reviews.facebook.net/D24141 --- .gitignore | 2 + Makefile | 14 ++++- Vagrantfile | 16 +++++ build_tools/make_package.sh | 116 ++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 Vagrantfile create mode 100755 build_tools/make_package.sh diff --git a/.gitignore b/.gitignore index 99a7d61d6..cbb817f61 100644 --- a/.gitignore +++ b/.gitignore @@ -28,8 +28,10 @@ util/build_version.cc build_tools/VALGRIND_LOGS/ coverage/COVERAGE_REPORT .gdbhistory +package/ .phutil_module_cache tags java/*.log java/include/org_rocksdb_*.h unity.cc +.vagrant/ diff --git a/Makefile b/Makefile index 75da74b08..4deb8fc5f 100644 --- a/Makefile +++ b/Makefile @@ -164,6 +164,9 @@ endif LIBRARY = ${LIBNAME}.a MEMENVLIBRARY = libmemenv.a +ROCKSDB_MAJOR = 3 +ROCKSDB_MINOR = 4 + default: all #----------------------------------------------- @@ -178,8 +181,8 @@ SHARED3 = $(SHARED1) SHARED = $(SHARED1) else # Update db.h if you change these. -SHARED_MAJOR = 3 -SHARED_MINOR = 4 +SHARED_MAJOR = $(ROCKSDB_MAJOR) +SHARED_MINOR = $(ROCKSDB_MINOR) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) @@ -195,7 +198,7 @@ $(SHARED3): endif # PLATFORM_SHARED_EXT -.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install uninstall @@ -276,6 +279,9 @@ tags: format: build_tools/format-diff.sh +package: + bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -627,8 +633,10 @@ ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) +ifneq ($(MAKECMDGOALS),package) -include $(DEPFILES) endif endif endif endif +endif diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 000000000..cdee5db53 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,16 @@ +Vagrant.configure("2") do |config| + + config.vm.provider "virtualbox" do |v| + v.memory = 4096 + v.cpus = 2 + end + + config.vm.define "ubuntu14" do |box| + box.vm.box = "ubuntu/trusty64" + end + + config.vm.define "centos65" do |box| + box.vm.box = "chef/centos-6.5" + end + +end diff --git a/build_tools/make_package.sh b/build_tools/make_package.sh new file mode 100755 index 000000000..2ca28023d --- /dev/null +++ b/build_tools/make_package.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash + +set -e + +function log() { + echo "[+] $1" +} + +function fatal() { + echo "[!] $1" + exit 1 +} + +function platform() { + local __resultvar=$1 + if [[ -f "/etc/yum.conf" ]]; then + eval $__resultvar="centos" + elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then + eval $__resultvar="ubuntu" + else + fatal "Unknown operating system" + fi +} +platform OS + +function package() { + if [[ $OS = "ubuntu" ]]; then + if dpkg --get-selections | grep --quiet $1; then + log "$1 is already installed. skipping." + else + apt-get install $@ -y + fi + elif [[ $OS = "centos" ]]; then + if rpm -qa | grep --quiet $1; then + log "$1 is already installed. skipping."
+ else + yum install $@ -y + fi + fi +} + +function detect_fpm_output() { + if [[ $OS = "ubuntu" ]]; then + export FPM_OUTPUT=deb + elif [[ $OS = "centos" ]]; then + export FPM_OUTPUT=rpm + fi +} +detect_fpm_output + +function gem_install() { + if gem list | grep --quiet $1; then + log "$1 is already installed. skipping." + else + gem install $@ + fi +} + +function main() { + if [[ $# -ne 1 ]]; then + fatal "Usage: $0 <rocksdb_version>" + else + log "using rocksdb version: $1" + fi + + if [[ -d /vagrant ]]; then + if [[ $OS = "ubuntu" ]]; then + package g++-4.7 + export CXX=g++-4.7 + + # the deb would depend on libgflags2, but the static lib is the only thing + # installed by make install + package libgflags-dev + + package ruby-all-dev + elif [[ $OS = "centos" ]]; then + pushd /etc/yum.repos.d + if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then + wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo + fi + package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6 + package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6 + export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc + export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp + export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++ + export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin + popd + if ! rpm -qa | grep --quiet gflags; then + rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm + fi + + package ruby + package ruby-devel + package rubygems + package rpm-build + fi + fi + gem_install fpm + + make static_lib + make install INSTALL_PATH=package + fpm \ + -s dir \ + -t $FPM_OUTPUT \ + -n rocksdb \ + -v $1 \ + --prefix /usr \ + --url http://rocksdb.org/ \ + -m rocksdb@fb.com \ + --license BSD \ + --vendor Facebook \ + --description "RocksDB is an embeddable persistent key-value store for fast storage." \ + package +} + +main $@ From ffe3d490d406ffe516fc3d24186c85511a9a43dd Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 29 Sep 2014 17:25:02 -0700 Subject: [PATCH 15/16] Add an instruction about SSE in INSTALL.md Summary: As title. Test Plan: Not needed Reviewers: MarkCallaghan, ljin, yhchiang, igor Reviewed By: igor Subscribers: rven, leveldb Differential Revision: https://reviews.facebook.net/D24231 --- INSTALL.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 8cf66e6ab..607450f85 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -15,6 +15,10 @@ There are few options when compiling RocksDB: * `make all` will compile our static library, and all our tools and unit tests. Our tools depend on gflags. You will need to have gflags installed to run `make all`. +* If the Intel SSE instruction set is supported, set USE_SSE=" -msse -msse4.2 " to make sure +SSE4.2 is used to speed up CRC32 when calculating data checksums. + + ## Dependencies * You can link RocksDB with following compression libraries: From 1f963305a8f3384da3215c37ed7a264c5c99417c Mon Sep 17 00:00:00 2001 From: Mark Callaghan Date: Mon, 29 Sep 2014 17:31:26 -0700 Subject: [PATCH 16/16] Print MB per second compaction throughput separately for reads and writes Summary: This log line used to have a single column (MB/sec) that combined reads and writes. This change splits it into separate read and write rates; for real workloads the rd and wr rates might not match because keys can be dropped.
2014/09/29-17:31:01.213162 7f929fbff700 (Original Log Time 2014/09/29-17:31:01.180025) [default] compacted to: files[2 5 0 0 0 0 0], MB/sec: 14.0 rd, 14.0 wr, level 1, files in(4, 0) out(5) MB in(8.5, 0.0) out(8.5), read-write-amplify(2.0) write-amplify(1.0) OK Test Plan: make check, grepped LOG - begin *PUBLIC* platform impact section - Bugzilla: # - end platform impact - Reviewers: igor Differential Revision: https://reviews.facebook.net/D24237 --- db/db_impl.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index bd9b222b3..7f5a382c0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -3340,12 +3340,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, Version::LevelSummaryStorage tmp; LogToBuffer( log_buffer, - "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double)stats.micros, + (stats.bytes_readn + stats.bytes_readnp1) / + static_cast(stats.micros), + stats.bytes_written / static_cast(stats.micros), compact->compaction->output_level(), stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
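A quick standalone sketch of the split-throughput arithmetic above (illustrative only, not part of the patch; the constants are chosen to reproduce the 14.0 rd / 14.0 wr example log line, and the field names mirror the patch):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // 8.5 MB read from level n, nothing from level n+1, 8.5 MB written,
  // over roughly 0.64 seconds of compaction time.
  uint64_t bytes_readn = 8912896;    // 8.5 * 1048576
  uint64_t bytes_readnp1 = 0;
  uint64_t bytes_written = 8912896;  // 8.5 * 1048576
  uint64_t micros = 636000;

  // Bytes per microsecond; the log labels this MB/sec (the 2^20-byte MB and
  // the 10^6-microsecond second roughly cancel). Doubles avoid the integer
  // overflow the earlier universal-compaction fix also addressed.
  double rd = (bytes_readn + bytes_readnp1) / static_cast<double>(micros);
  double wr = bytes_written / static_cast<double>(micros);
  fprintf(stderr, "MB/sec: %.1f rd, %.1f wr\n", rd, wr);  // 14.0 rd, 14.0 wr
  return 0;
}
```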