diff --git a/Makefile b/Makefile index c7cac9249..564f1c117 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,12 @@ INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- + +ifneq ($(MAKECMDGOALS),dbg) +OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +else OPT += -fno-omit-frame-pointer -momit-leaf-frame-pointer +endif #----------------------------------------------- # detect what platform we're building on @@ -134,10 +139,13 @@ $(SHARED3): $(LIBOBJECTS) endif # PLATFORM_SHARED_EXT +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ + release tags valgrind_check whitebox_crash_test format shared_lib all \ + dbg + all: $(LIBRARY) $(PROGRAMS) -.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test format shared_lib +dbg: $(PROGRAMS) # Will also generate shared libraries. release: @@ -151,7 +159,7 @@ coverage: # Delete intermediate files find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; -check: all $(PROGRAMS) $(TESTS) $(TOOLS) +check: $(PROGRAMS) $(TESTS) $(TOOLS) for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done python tools/ldb_test.py @@ -347,8 +355,8 @@ $(MEMENVLIBRARY) : $(MEMENVOBJECTS) rm -f $@ $(AR) -rs $@ $(MEMENVOBJECTS) -memenv_test : helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) - $(CXX) helpers/memenv/memenv_test.o $(MEMENVLIBRARY) $(LIBRARY) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) diff --git a/build_tools/regression_build_test.sh 
b/build_tools/regression_build_test.sh index d38b67c3c..58766f5df 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -117,6 +117,27 @@ make release --sync=0 \ --threads=16 > ${STAT_FILE}.readrandom +# measure readrandom with 6GB block cache and tailing iterator +./db_bench \ + --benchmarks=readrandom \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --bloom_bits=10 \ + --num=$NUM \ + --reads=$((NUM / 5)) \ + --cache_size=6442450944 \ + --cache_numshardbits=6 \ + --table_cache_numshardbits=4 \ + --open_files=55000 \ + --disable_seek_compaction=1 \ + --use_tailing_iterator=1 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=16 > ${STAT_FILE}.readrandomtailing + # measure readrandom with 100MB block cache ./db_bench \ --benchmarks=readrandom \ @@ -300,6 +321,7 @@ function send_benchmark_to_ods { send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom +send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom diff --git a/db/db_bench.cc b/db/db_bench.cc index bdf842375..19938e0c1 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -447,6 +447,9 @@ static auto FLAGS_compaction_fadvice_e = DEFINE_bool(use_multiget, false, "Use multiget to access a series of keys instead of get"); +DEFINE_bool(use_tailing_iterator, false, + "Use tailing iterator to access a series of keys instead of get"); + DEFINE_int64(keys_per_multiget, 90, "If use_multiget is true, determines number" " of keys to group per call Arbitrary default is good because it" " 
agrees with readwritepercent"); @@ -1729,6 +1732,21 @@ class Benchmark { thread->stats.FinishedSingleOp(db_); keys_left -= num_keys; } + } else if (FLAGS_use_tailing_iterator) { // use tailing iterator for gets + options.tailing = true; + Iterator* iter = db_->NewIterator(options); + while (!duration.Done(1)) { + const long long k = thread->rand.Next() % FLAGS_num; + unique_ptr key = GenerateKeyFromInt(k); + + iter->Seek(key.get()); + if (iter->Valid() && iter->key().compare(Slice(key.get())) == 0) { + ++found; + } + + thread->stats.FinishedSingleOp(db_); + } + delete iter; } else { // Regular case. Do one "get" at a time Get Iterator* iter = db_->NewIterator(options); std::string value; diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index e9a41aedd..ab3af26c1 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -21,7 +21,7 @@ // types built in: // - SkipListRep: This is the default; it is backed by a skip list. // - HashSkipListRep: The memtable rep that is best used for keys that are -// structured like "prefix:suffix" where iteration withing a prefix is +// structured like "prefix:suffix" where iteration within a prefix is // common and iteration across different prefixes is rare. It is backed by // a hash map where each bucket is a skip list. // - VectorRep: This is backed by an unordered std::vector. On iteration, the @@ -85,7 +85,7 @@ class MemTableRep { // Initialize an iterator over the specified collection. // The returned iterator is not valid. // explicit Iterator(const MemTableRep* collection); - virtual ~Iterator() { }; + virtual ~Iterator() {} // Returns true iff the iterator is positioned at a valid node. 
virtual bool Valid() const = 0; @@ -143,7 +143,7 @@ class MemTableRep { // new MemTableRep objects class MemTableRepFactory { public: - virtual ~MemTableRepFactory() { }; + virtual ~MemTableRepFactory() {} virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, Arena*) = 0; virtual const char* Name() const = 0; @@ -159,7 +159,8 @@ class MemTableRepFactory { // bytes reserved for usage. class VectorRepFactory : public MemTableRepFactory { const size_t count_; -public: + + public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, Arena*) override; @@ -170,9 +171,9 @@ public: // This uses a skip list to store keys. It is the default. class SkipListFactory : public MemTableRepFactory { -public: - virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, - Arena*) override; + public: + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "SkipListFactory"; } @@ -196,4 +197,4 @@ extern MemTableRepFactory* NewHashSkipListRepFactory( extern MemTableRepFactory* NewHashLinkListRepFactory( const SliceTransform* transform, size_t bucket_count = 50000); -} +} // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index 8ff2480a3..839e89afe 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -349,7 +349,6 @@ inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, output->resize(output->size() - _stream.avail_out); BZ2_bzCompressEnd(&_stream); return true; - return output; #endif return false; } diff --git a/table/format.cc b/table/format.cc index 77a55237e..561d1689a 100644 --- a/table/format.cc +++ b/table/format.cc @@ -9,6 +9,8 @@ #include "table/format.h" +#include + #include "port/port.h" #include "rocksdb/env.h" #include "table/block.h" @@ -43,8 +45,8 @@ void Footer::EncodeTo(std::string* dst) const { metaindex_handle_.EncodeTo(dst); 
index_handle_.EncodeTo(dst); dst->resize(2 * BlockHandle::kMaxEncodedLength); // Padding - PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber & 0xffffffffu)); - PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32)); + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() & 0xffffffffu)); + PutFixed32(dst, static_cast<uint32_t>(table_magic_number() >> 32)); assert(dst->size() == original_size + kEncodedLength); } @@ -52,13 +54,21 @@ Status Footer::DecodeFrom(Slice* input) { assert(input != nullptr); assert(input->size() >= kEncodedLength); - const char* magic_ptr = input->data() + kEncodedLength - 8; + const char* magic_ptr = + input->data() + kEncodedLength - kMagicNumberLengthByte; const uint32_t magic_lo = DecodeFixed32(magic_ptr); const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) | (static_cast<uint64_t>(magic_lo))); - if (magic != kTableMagicNumber) { - return Status::InvalidArgument("not an sstable (bad magic number)"); + if (HasInitializedTableMagicNumber()) { + if (magic != table_magic_number()) { + char buffer[80]; + snprintf(buffer, sizeof(buffer) - 1, + "not an sstable (bad magic number --- %llx)", static_cast<unsigned long long>(magic)); + return Status::InvalidArgument(buffer); + } + } else { + set_table_magic_number(magic); } Status result = metaindex_handle_.DecodeFrom(input); @@ -221,7 +231,7 @@ Status UncompressBlockContents(const char* data, size_t n, default: return Status::Corruption("bad block type"); } - result->compression_type = kNoCompression; // not compressed any more + result->compression_type = kNoCompression; // not compressed any more return Status::OK(); } diff --git a/table/format.h b/table/format.h index 207527fcb..64fa3fbe8 100644 --- a/table/format.h +++ b/table/format.h @@ -21,6 +21,9 @@ class Block; class RandomAccessFile; struct ReadOptions; +// the length of the magic number in bytes. +const int kMagicNumberLengthByte = 8; + // BlockHandle is a pointer to the extent of a file that stores a data // block or a meta block. 
class BlockHandle { @@ -63,12 +66,16 @@ class BlockHandle { // end of every table file. class Footer { public: + // Constructs a footer without specifying its table magic number. + // In such case, the table magic number of such footer should be + // initialized via @ReadFooterFromFile(). + Footer() : Footer(kInvalidTableMagicNumber) {} + // @table_magic_number serves two purposes: // 1. Identify different types of the tables. // 2. Help us to identify if a given file is a valid sst. - Footer(uint64_t table_magic_number) : - kTableMagicNumber(table_magic_number) { - } + explicit Footer(uint64_t table_magic_number) + : table_magic_number_(table_magic_number) {} // The block handle for the metaindex block of the table const BlockHandle& metaindex_handle() const { return metaindex_handle_; } @@ -78,24 +85,52 @@ class Footer { const BlockHandle& index_handle() const { return index_handle_; } + void set_index_handle(const BlockHandle& h) { index_handle_ = h; } + uint64_t table_magic_number() const { return table_magic_number_; } + void EncodeTo(std::string* dst) const; + + // Set the current footer based on the input slice. If table_magic_number_ + // is not set (i.e., HasInitializedTableMagicNumber() is false), then this + // function will also initialize table_magic_number_. Otherwise, this + // function will verify whether the magic number specified in the input + // slice matches table_magic_number_ and update the current footer only + // when the test passes. Status DecodeFrom(Slice* input); // Encoded length of a Footer. Note that the serialization of a // Footer will always occupy exactly this many bytes. It consists // of two block handles and a magic number. enum { - kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8 + kEncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8 }; + static const uint64_t kInvalidTableMagicNumber = 0; + private: + // Set the table_magic_number only when it was not previously + // initialized. Return true on success. 
+ bool set_table_magic_number(uint64_t magic_number) { + if (!HasInitializedTableMagicNumber()) { + table_magic_number_ = magic_number; + return true; + } + return false; + } + + // return true if @table_magic_number_ is set to a value different + // from @kInvalidTableMagicNumber. + bool HasInitializedTableMagicNumber() const { + return (table_magic_number_ != kInvalidTableMagicNumber); + } + BlockHandle metaindex_handle_; BlockHandle index_handle_; - const uint64_t kTableMagicNumber; + uint64_t table_magic_number_; }; // Read the footer from file diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index a4d98bb22..fac84a01c 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -2,12 +2,13 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. - #include "table/meta_blocks.h" #include +#include #include "rocksdb/table.h" +#include "rocksdb/table_properties.h" #include "table/block.h" #include "table/format.h" #include "util/coding.h" @@ -104,9 +105,8 @@ bool NotifyCollectTableCollectorsOnAdd( Status s = collector->Add(key, value); all_succeeded = all_succeeded && s.ok(); if (!s.ok()) { - LogPropertiesCollectionError( - info_log, "Add", /* method */ collector->Name() - ); + LogPropertiesCollectionError(info_log, "Add" /* method */, + collector->Name()); } } return all_succeeded; @@ -123,9 +123,8 @@ bool NotifyCollectTableCollectorsOnFinish( all_succeeded = all_succeeded && s.ok(); if (!s.ok()) { - LogPropertiesCollectionError( - info_log, "Finish", /* method */ collector->Name() - ); + LogPropertiesCollectionError(info_log, "Finish" /* method */, + collector->Name()); } else { builder->Add(user_collected_properties); } @@ -151,14 +150,8 @@ Status ReadProperties( BlockContents block_contents; ReadOptions read_options; read_options.verify_checksums = false; - Status s = 
ReadBlockContents( - file, - read_options, - handle, - &block_contents, - env, - false - ); + Status s = ReadBlockContents(file, read_options, handle, &block_contents, env, + false); if (!s.ok()) { return s; @@ -166,22 +159,20 @@ Status ReadProperties( Block properties_block(block_contents); std::unique_ptr iter( - properties_block.NewIterator(BytewiseComparator()) - ); + properties_block.NewIterator(BytewiseComparator())); // All pre-defined properties of type uint64_t std::unordered_map predefined_uint64_properties = { - { TablePropertiesNames::kDataSize, &table_properties->data_size }, - { TablePropertiesNames::kIndexSize, &table_properties->index_size }, - { TablePropertiesNames::kFilterSize, &table_properties->filter_size }, - { TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size }, - { TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size }, - { TablePropertiesNames::kNumDataBlocks, - &table_properties->num_data_blocks }, - { TablePropertiesNames::kNumEntries, &table_properties->num_entries }, - { TablePropertiesNames::kFormatVersion, &table_properties->format_version }, - { TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len }, - }; + {TablePropertiesNames::kDataSize, &table_properties->data_size}, + {TablePropertiesNames::kIndexSize, &table_properties->index_size}, + {TablePropertiesNames::kFilterSize, &table_properties->filter_size}, + {TablePropertiesNames::kRawKeySize, &table_properties->raw_key_size}, + {TablePropertiesNames::kRawValueSize, &table_properties->raw_value_size}, + {TablePropertiesNames::kNumDataBlocks, + &table_properties->num_data_blocks}, + {TablePropertiesNames::kNumEntries, &table_properties->num_entries}, + {TablePropertiesNames::kFormatVersion, &table_properties->format_version}, + {TablePropertiesNames::kFixedKeyLen, &table_properties->fixed_key_len}}; std::string last_key; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -192,10 +183,8 @@ Status ReadProperties( auto key = 
iter->key().ToString(); // properties block is strictly sorted with no duplicate key. - assert( - last_key.empty() || - BytewiseComparator()->Compare(key, last_key) > 0 - ); + assert(last_key.empty() || + BytewiseComparator()->Compare(key, last_key) > 0); last_key = key; auto raw_val = iter->value(); @@ -218,8 +207,7 @@ Status ReadProperties( } else { // handle user-collected properties table_properties->user_collected_properties.insert( - std::make_pair(key, raw_val.ToString()) - ); + {key, raw_val.ToString()}); } } @@ -244,21 +232,14 @@ Status ReadTableProperties( BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents( - file, - read_options, - metaindex_handle, - &metaindex_contents, - env, - false - ); + s = ReadBlockContents(file, read_options, metaindex_handle, + &metaindex_contents, env, false); if (!s.ok()) { return s; } Block metaindex_block(metaindex_contents); std::unique_ptr<Iterator> meta_iter( - metaindex_block.NewIterator(BytewiseComparator()) - ); + metaindex_block.NewIterator(BytewiseComparator())); // -- Read property block meta_iter->Seek(kPropertiesBlock); @@ -266,21 +247,43 @@ Status ReadTableProperties( if (meta_iter->Valid() && meta_iter->key() == kPropertiesBlock && meta_iter->status().ok()) { - s = ReadProperties( - meta_iter->value(), - file, - env, - info_log, - properties - ); + s = ReadProperties(meta_iter->value(), file, env, info_log, properties); } else { s = Status::Corruption( - "Unable to read the property block from the plain table" - ); + "Unable to read the property block from the plain table"); } return s; } +Status ReadTableMagicNumber(const std::string& file_path, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number) { + unique_ptr<RandomAccessFile> file; + Status s = options.env->NewRandomAccessFile(file_path, &file, env_options); + if (!s.ok()) { + return s; + } + + uint64_t file_size = 0; + // Propagate a failed size query instead of validating an unset file_size. + s = options.env->GetFileSize(file_path, &file_size); + if (!s.ok()) { + return s; + } + if 
(file_size < Footer::kEncodedLength) { + return Status::InvalidArgument("file is too short to be an sstable"); + } + + Footer footer; + s = ReadFooterFromFile(file.get(), file_size, &footer); + if (!s.ok()) { + return s; + } + + *table_magic_number = footer.table_magic_number(); + return Status::OK(); +} } // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 9f236eff6..8994b01f3 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -8,6 +8,7 @@ #include #include +#include "db/builder.h" #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" @@ -118,4 +119,10 @@ Status ReadTableProperties( Logger* info_log, TableProperties* properties); +// Read the magic number of the specified file directly. The magic number +// of a valid sst table occupies the last 8 bytes of the file. +Status ReadTableMagicNumber(const std::string& file_path, + const Options& options, + const EnvOptions& env_options, + uint64_t* table_magic_number); } // namespace rocksdb diff --git a/table/table_properties.cc b/table/table_properties.cc index 414b15681..2da1a975a 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -40,50 +40,31 @@ std::string TableProperties::ToString( result.reserve(1024); // Basic Info - AppendProperty( - result, "# data blocks", num_data_blocks, prop_delim, kv_delim - ); + AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, + kv_delim); AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); - AppendProperty( - result, - "raw average key size", - num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, - prop_delim, - kv_delim - ); - AppendProperty( - result, "raw value size", raw_value_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "raw average value size", - num_entries != 0 ? 
1.0 * raw_value_size / num_entries : 0.0, - prop_delim, - kv_delim - ); + AppendProperty(result, "raw average key size", + num_entries != 0 ? 1.0 * raw_key_size / num_entries : 0.0, + prop_delim, kv_delim); + AppendProperty(result, "raw value size", raw_value_size, prop_delim, + kv_delim); + AppendProperty(result, "raw average value size", + num_entries != 0 ? 1.0 * raw_value_size / num_entries : 0.0, + prop_delim, kv_delim); AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); - AppendProperty( - result, "filter block size", filter_size, prop_delim, kv_delim - ); - AppendProperty( - result, - "(estimated) table size", - data_size + index_size + filter_size, - prop_delim, - kv_delim - ); + AppendProperty(result, "filter block size", filter_size, prop_delim, + kv_delim); + AppendProperty(result, "(estimated) table size", + data_size + index_size + filter_size, prop_delim, kv_delim); AppendProperty( - result, - "filter policy name", + result, "filter policy name", filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, - prop_delim, - kv_delim - ); + prop_delim, kv_delim); return result; } diff --git a/table/table_test.cc b/table/table_test.cc index e473b8007..bac5a54ed 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -6,6 +6,9 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include #include #include #include @@ -25,6 +28,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" #include "table/block.h" +#include "table/meta_blocks.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" @@ -946,10 +950,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { block_builder.Add(item.first, item.second); } Slice content = block_builder.Finish(); - ASSERT_EQ( - content.size() + kBlockTrailerSize, - props.data_size - ); + ASSERT_EQ(content.size() + kBlockTrailerSize, props.data_size); } TEST(BlockBasedTableTest, FilterPolicyNameProperties) { @@ -958,9 +959,7 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { std::vector keys; KVMap kvmap; Options options; - std::unique_ptr filter_policy( - NewBloomFilterPolicy(10) - ); + std::unique_ptr filter_policy(NewBloomFilterPolicy(10)); options.filter_policy = filter_policy.get(); c.Finish(options, GetPlainInternalComparator(options.comparator), &keys, @@ -1032,10 +1031,8 @@ TEST(BlockBasedTableTest, NumBlockStat) { KVMap kvmap; c.Finish(options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); - ASSERT_EQ( - kvmap.size(), - c.table_reader()->GetTableProperties().num_data_blocks - ); + ASSERT_EQ(kvmap.size(), + c.table_reader()->GetTableProperties().num_data_blocks); } class BlockCacheProperties { @@ -1050,32 +1047,26 @@ class BlockCacheProperties { } // Check if the fetched props matches the expected ones. 
- void AssertEqual( - long index_block_cache_miss, - long index_block_cache_hit, - long data_block_cache_miss, - long data_block_cache_hit) const { + void AssertEqual(int64_t index_block_cache_miss, + int64_t index_block_cache_hit, int64_t data_block_cache_miss, + int64_t data_block_cache_hit) const { ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); - ASSERT_EQ( - index_block_cache_miss + data_block_cache_miss, - this->block_cache_miss - ); - ASSERT_EQ( - index_block_cache_hit + data_block_cache_hit, - this->block_cache_hit - ); + ASSERT_EQ(index_block_cache_miss + data_block_cache_miss, + this->block_cache_miss); + ASSERT_EQ(index_block_cache_hit + data_block_cache_hit, + this->block_cache_hit); } private: - long block_cache_miss = 0; - long block_cache_hit = 0; - long index_block_cache_miss = 0; - long index_block_cache_hit = 0; - long data_block_cache_miss = 0; - long data_block_cache_hit = 0; + int64_t block_cache_miss = 0; + int64_t block_cache_hit = 0; + int64_t index_block_cache_miss = 0; + int64_t index_block_cache_hit = 0; + int64_t data_block_cache_miss = 0; + int64_t data_block_cache_hit = 0; }; TEST(BlockBasedTableTest, BlockCacheTest) { @@ -1105,12 +1096,8 @@ TEST(BlockBasedTableTest, BlockCacheTest) { { BlockCacheProperties props(options.statistics.get()); // index will be added to block cache. - props.AssertEqual( - 1, // index block miss - 0, - 0, - 0 - ); + props.AssertEqual(1, // index block miss + 0, 0, 0); } // Only index block will be accessed @@ -1120,24 +1107,16 @@ TEST(BlockBasedTableTest, BlockCacheTest) { // NOTE: to help better highlight the "detla" of each ticker, I use // + to indicate the increment of changed // value; other numbers remain the same. 
- props.AssertEqual( - 1, - 0 + 1, // index block hit - 0, - 0 - ); + props.AssertEqual(1, 0 + 1, // index block hit + 0, 0); } // Only data block will be accessed { iter->SeekToFirst(); BlockCacheProperties props(options.statistics.get()); - props.AssertEqual( - 1, - 1, - 0 + 1, // data block miss - 0 - ); + props.AssertEqual(1, 1, 0 + 1, // data block miss + 0); } // Data block will be in cache @@ -1145,12 +1124,8 @@ TEST(BlockBasedTableTest, BlockCacheTest) { iter.reset(c.NewIterator()); iter->SeekToFirst(); BlockCacheProperties props(options.statistics.get()); - props.AssertEqual( - 1, - 1 + 1, // index block hit - 1, - 0 + 1 // data block hit - ); + props.AssertEqual(1, 1 + 1, /* index block hit */ + 1, 0 + 1 /* data block hit */); } // release the iterator so that the block cache can reset correctly. iter.reset(); @@ -1176,12 +1151,8 @@ TEST(BlockBasedTableTest, BlockCacheTest) { c.Reopen(options); { BlockCacheProperties props(options.statistics.get()); - props.AssertEqual( - 1, // index block miss - 0, - 0, - 0 - ); + props.AssertEqual(1, // index block miss + 0, 0, 0); } @@ -1191,12 +1162,9 @@ TEST(BlockBasedTableTest, BlockCacheTest) { // is only 1, index block will be purged after data block is inserted. iter.reset(c.NewIterator()); BlockCacheProperties props(options.statistics.get()); - props.AssertEqual( - 1 + 1, // index block miss - 0, - 0, // data block miss - 0 - ); + props.AssertEqual(1 + 1, // index block miss + 0, 0, // data block miss + 0); } { @@ -1204,12 +1172,8 @@ TEST(BlockBasedTableTest, BlockCacheTest) { // block's cache miss. 
iter->SeekToFirst(); BlockCacheProperties props(options.statistics.get()); - props.AssertEqual( - 2, - 0, - 0 + 1, // data block miss - 0 - ); + props.AssertEqual(2, 0, 0 + 1, // data block miss + 0); } } @@ -1316,7 +1280,6 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); - } static void DoCompressionTest(CompressionType comp) { @@ -1360,11 +1323,9 @@ TEST(GeneralTableTest, ApproximateOffsetOfCompressed) { valid++; } - for(int i =0; i < valid; i++) - { + for (int i = 0; i < valid; i++) { DoCompressionTest(compression_state[i]); } - } TEST(Harness, Randomized) { @@ -1375,8 +1336,8 @@ TEST(Harness, Randomized) { for (int num_entries = 0; num_entries < 2000; num_entries += (num_entries < 50 ? 1 : 200)) { if ((num_entries % 10) == 0) { - fprintf(stderr, "case %d of %d: num_entries = %d\n", - (i + 1), int(args.size()), num_entries); + fprintf(stderr, "case %d of %d: num_entries = %d\n", (i + 1), + static_cast(args.size()), num_entries); } for (int e = 0; e < num_entries; e++) { std::string v; diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 79b361841..3b82571bf 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -14,10 +14,14 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" #include "table/block.h" #include "table/block_builder.h" +#include "table/meta_blocks.h" #include "table/format.h" #include "util/ldb_cmd.h" #include "util/random.h" @@ -44,6 +48,9 @@ class SstFileReader { private: Status NewTableReader(const std::string& file_path); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + 
uint64_t file_size); std::string file_name_; uint64_t read_num_; @@ -54,9 +61,9 @@ class SstFileReader { Status init_result_; unique_ptr table_reader_; unique_ptr file_; - // table_options_ and internal_comparator_ will also be used in + // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) - Options table_options_; + Options options_; InternalKeyComparator internal_comparator_; }; @@ -70,21 +77,68 @@ SstFileReader::SstFileReader(const std::string& file_path, init_result_ = NewTableReader(file_name_); } +extern uint64_t kBlockBasedTableMagicNumber; +extern uint64_t kPlainTableMagicNumber; + Status SstFileReader::NewTableReader(const std::string& file_path) { - Status s = table_options_.env->NewRandomAccessFile(file_path, &file_, - soptions_); + uint64_t magic_number; + Status s = + ReadTableMagicNumber(file_path, options_, soptions_, &magic_number); + if (!s.ok()) { + return s; + } + if (magic_number == kPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + } + options_.comparator = &internal_comparator_; + + s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); if (!s.ok()) { return s; } uint64_t file_size; - table_options_.env->GetFileSize(file_path, &file_size); - unique_ptr table_factory; - s = table_options_.table_factory->NewTableReader( - table_options_, soptions_, internal_comparator_, std::move(file_), - file_size, &table_reader_); + options_.env->GetFileSize(file_path, &file_size); + s = SetTableOptionsByMagicNumber(magic_number, file_.get(), file_size); + if (!s.ok()) { + return s; + } + + s = options_.table_factory->NewTableReader( + options_, soptions_, internal_comparator_, std::move(file_), file_size, + &table_reader_); return s; } +Status SstFileReader::SetTableOptionsByMagicNumber(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size) { + TableProperties table_properties; + Status s = rocksdb::ReadTableProperties(file, file_size, 
table_magic_number, + options_.env, options_.info_log.get(), + &table_properties); + if (!s.ok()) { + return s; + } + + if (table_magic_number == kBlockBasedTableMagicNumber) { + options_.table_factory = std::make_shared<BlockBasedTableFactory>(); + fprintf(stdout, "Sst file format: block-based\n"); + } else if (table_magic_number == kPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + options_.table_factory = std::make_shared<PlainTableFactory>( + table_properties.fixed_key_len, 2, 0.8); + options_.prefix_extractor = NewNoopTransform(); + fprintf(stdout, "Sst file format: plain table\n"); + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %llx", static_cast<unsigned long long>(table_magic_number)); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + Status SstFileReader::ReadSequential(bool print_kv, uint64_t read_num, bool has_from, diff --git a/util/env_posix.cc b/util/env_posix.cc index b53cd0103..1ccb32084 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1047,16 +1047,16 @@ class PosixEnv : public Env { unique_ptr<RandomRWFile>* result, const EnvOptions& options) { result->reset(); + // no support for mmap yet + if (options.use_mmap_writes || options.use_mmap_reads) { + return Status::NotSupported("No support for mmap read/write yet"); + } Status s; const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); if (fd < 0) { s = IOError(fname, errno); } else { SetFD_CLOEXEC(fd, &options); - // no support for mmap yet - if (options.use_mmap_writes || options.use_mmap_reads) { - return Status::NotSupported("No support for mmap read/write yet"); - } result->reset(new PosixRandomRWFile(fname, fd, options)); } return s;