diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
index fdab81502..23d11aba7 100644
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@@ -23,6 +23,7 @@
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "table/meta_blocks.h"
+#include "table/bloom_block.h"
 #include "table/plain_table_factory.h"
 #include "table/plain_table_reader.h"
 #include "util/hash.h"
@@ -70,10 +71,11 @@ class PlainTableDBTest {
     plain_table_options.huge_page_tlb_size = 0;
     plain_table_options.encoding_type = kPrefix;
     plain_table_options.full_scan_mode = false;
+    plain_table_options.store_index_in_file = false;
 
     options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
-    options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 3));
     options.prefix_extractor.reset(NewFixedPrefixTransform(8));
     options.allow_mmap_reads = true;
 
     return options;
@@ -186,6 +188,8 @@ TEST(PlainTableDBTest, Empty) {
   ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
 }
 
+extern const uint64_t kPlainTableMagicNumber;
+
 class TestPlainTableReader : public PlainTableReader {
  public:
   TestPlainTableReader(const EnvOptions& storage_options,
@@ -195,7 +199,8 @@ class TestPlainTableReader : public PlainTableReader {
                        size_t index_sparseness,
                        const TableProperties* table_properties,
                        unique_ptr<RandomAccessFile>&& file,
-                       const Options& options, bool* expect_bloom_not_match)
+                       const Options& options, bool* expect_bloom_not_match,
+                       bool store_index_in_file)
       : PlainTableReader(options, std::move(file), storage_options, icomparator,
                          encoding_type, file_size, table_properties),
         expect_bloom_not_match_(expect_bloom_not_match) {
@@ -206,6 +211,19 @@
                               bloom_bits_per_key, hash_table_ratio,
                               index_sparseness, 2 * 1024 * 1024);
     ASSERT_TRUE(s.ok());
+
+    TableProperties* props = const_cast<TableProperties*>(table_properties);
+    if (store_index_in_file) {
+      auto bloom_version_ptr = props->user_collected_properties.find(
+          PlainTablePropertyNames::kBloomVersion);
+      ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+      ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
+      if (options.bloom_locality > 0) {
+        auto num_blocks_ptr = props->user_collected_properties.find(
+            PlainTablePropertyNames::kNumBloomBlocks);
+        ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+      }
+    }
   }
   virtual ~TestPlainTableReader() {}
 
@@ -213,7 +231,11 @@
  private:
   virtual bool MatchBloom(uint32_t hash) const override {
     bool ret = PlainTableReader::MatchBloom(hash);
-    ASSERT_TRUE(!*expect_bloom_not_match_ || !ret);
+    if (*expect_bloom_not_match_) {
+      ASSERT_TRUE(!ret);
+    } else {
+      ASSERT_TRUE(ret);
+    }
     return ret;
   }
   bool* expect_bloom_not_match_;
@@ -228,6 +250,7 @@ class TestPlainTableFactory : public PlainTableFactory {
         bloom_bits_per_key_(options.bloom_bits_per_key),
         hash_table_ratio_(options.hash_table_ratio),
         index_sparseness_(options.index_sparseness),
+        store_index_in_file_(options.store_index_in_file),
         expect_bloom_not_match_(expect_bloom_not_match) {}
 
   Status NewTableReader(const Options& options, const EnvOptions& soptions,
@@ -239,6 +262,20 @@
                              options.env, options.info_log.get(), &props);
     ASSERT_TRUE(s.ok());
 
+    if (store_index_in_file_) {
+      BlockHandle bloom_block_handle;
+      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+                        options.env,
BloomBlockBuilder::kBloomBlock, + &bloom_block_handle); + ASSERT_TRUE(s.ok()); + + BlockHandle index_block_handle; + s = FindMetaBlock( + file.get(), file_size, kPlainTableMagicNumber, options.env, + PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); + ASSERT_TRUE(s.ok()); + } + auto& user_props = props->user_collected_properties; auto encoding_type_prop = user_props.find(PlainTablePropertyNames::kEncodingType); @@ -249,7 +286,8 @@ class TestPlainTableFactory : public PlainTableFactory { std::unique_ptr new_reader(new TestPlainTableReader( soptions, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), options, expect_bloom_not_match_)); + std::move(file), options, expect_bloom_not_match_, + store_index_in_file_)); *table = std::move(new_reader); return s; @@ -259,6 +297,7 @@ class TestPlainTableFactory : public PlainTableFactory { int bloom_bits_per_key_; double hash_table_ratio_; size_t index_sparseness_; + bool store_index_in_file_; bool* expect_bloom_not_match_; }; @@ -268,59 +307,75 @@ TEST(PlainTableDBTest, Flush) { for (EncodingType encoding_type : {kPlain, kPrefix}) { for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (int total_order = 0; total_order <= 1; total_order++) { - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. - // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.prefix_extractor.reset(); + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + if (!bloom_bits && store_index_in_file) { + continue; + } - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 2; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. 
+ // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor.reset(); - options.table_factory.reset( - NewPlainTableFactory(plain_table_options)); - } else { - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0.75; - plain_table_options.index_sparseness = 16; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = false; + plain_table_options.store_index_in_file = store_index_in_file; - options.table_factory.reset( - NewPlainTableFactory(plain_table_options)); + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = false; + plain_table_options.store_index_in_file = store_index_in_file; + + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } + DestroyAndReopen(&options); + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + dbfull()->TEST_FlushMemTable(); + + TablePropertiesCollection ptc; + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_EQ(1U, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + + if (!store_index_in_file) { + ASSERT_EQ(total_order ? "4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); + } } - DestroyAndReopen(&options); - - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - - TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); - ASSERT_EQ(1U, ptc.size()); - auto row = ptc.begin(); - auto tp = row->second; - ASSERT_EQ(total_order ? 
"4" : "12", (tp->user_collected_properties).at( - "plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); - - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); } } - } } } @@ -330,7 +385,15 @@ TEST(PlainTableDBTest, Flush2) { for (EncodingType encoding_type : {kPlain, kPrefix}) { for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { for (int total_order = 0; total_order <= 1; total_order++) { - if (encoding_type == kPrefix && total_order == 1) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + if (encoding_type == kPrefix && total_order) { + continue; + } + if (!bloom_bits && store_index_in_file) { + continue; + } + if (total_order && store_index_in_file) { continue; } bool expect_bloom_not_match = false; @@ -338,30 +401,23 @@ TEST(PlainTableDBTest, Flush2) { options.create_if_missing = true; // Set only one bucket to force bucket conflict. // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; if (total_order) { options.prefix_extractor = nullptr; - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; plain_table_options.hash_table_ratio = 0; plain_table_options.index_sparseness = 2; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - - options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options)); } else { - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; plain_table_options.hash_table_ratio = 0.75; plain_table_options.index_sparseness = 16; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - - options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options)); } + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.store_index_in_file = store_index_in_file; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options)); + DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); @@ -389,7 +445,6 @@ TEST(PlainTableDBTest, Flush2) { // Neither key nor value should exist. expect_bloom_not_match = true; ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); - // Key doesn't exist any more but prefix exists. 
       if (total_order) {
         ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
@@ -398,6 +453,7 @@
           expect_bloom_not_match = false;
         }
       }
+      }
     }
   }
 }
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index f2dc0063d..96c67f956 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -119,6 +119,8 @@ enum EncodingType : char {
 struct PlainTablePropertyNames {
   static const std::string kPrefixExtractorName;
   static const std::string kEncodingType;
+  static const std::string kBloomVersion;
+  static const std::string kNumBloomBlocks;
 };
 
 const uint32_t kPlainTableVariableLength = 0;
@@ -166,6 +168,11 @@ EncodingType encoding_type = kPlain;
   // @full_scan_mode: mode for reading the whole file one record by one without
   // using the index.
   bool full_scan_mode = false;
+
+  // @store_index_in_file: compute plain table index and bloom filter during
+  //                       file building and store it in file. When reading the
+  //                       file, the index will be mmapped instead of recomputed.
+  bool store_index_in_file = false;
 };
 
 // -- Plain Table with prefix-only seek
diff --git a/table/bloom_block.cc b/table/bloom_block.cc
new file mode 100644
index 000000000..c44ab66ca
--- /dev/null
+++ b/table/bloom_block.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "table/bloom_block.h"
+
+#include <string>
+#include "rocksdb/slice.h"
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+
+void BloomBlockBuilder::AddKeysHashes(const std::vector<uint32_t> keys_hashes) {
+  for (auto hash : keys_hashes) {
+    bloom_.AddHash(hash);
+  }
+}
+
+Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); }
+
+const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock";
+}  // namespace rocksdb
diff --git a/table/bloom_block.h b/table/bloom_block.h
new file mode 100644
index 000000000..d55453eda
--- /dev/null
+++ b/table/bloom_block.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <vector>
+#include <string>
+#include "util/dynamic_bloom.h"
+
+namespace rocksdb {
+class Logger;
+
+class BloomBlockBuilder {
+ public:
+  static const std::string kBloomBlock;
+
+  explicit BloomBlockBuilder(uint32_t num_probes = 6)
+      : bloom_(num_probes, nullptr) {}
+
+  void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality,
+                    size_t huge_page_tlb_size, Logger* logger) {
+    bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size,
+                        logger);
+  }
+
+  uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); }
+
+  void AddKeysHashes(const std::vector<uint32_t> keys_hashes);
+
+  Slice Finish();
+
+ private:
+  DynamicBloom bloom_;
+};
+
+};  // namespace rocksdb
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 7443eb731..407154015 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -273,4 +273,72 @@ Status FindMetaBlock(Iterator* meta_index_iter,
   }
 }
 
+Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle) {
+  Footer footer(table_magic_number);
+  auto s = ReadFooterFromFile(file, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+  Block metaindex_block(metaindex_contents);
+
+  std::unique_ptr<Iterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
+
+  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
+}
+
+Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockContents* contents) {
+  Footer footer(table_magic_number);
+  auto s = ReadFooterFromFile(file, file_size, &footer);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reading metaindex block
+  auto metaindex_handle = footer.metaindex_handle();
+  BlockContents metaindex_contents;
+  ReadOptions read_options;
+  read_options.verify_checksums = false;
+  s = ReadBlockContents(file, footer, read_options, metaindex_handle,
+                        &metaindex_contents, env, false);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Finding metablock
+  Block metaindex_block(metaindex_contents);
+
+  std::unique_ptr<Iterator> meta_iter;
+  meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator()));
+
+  BlockHandle block_handle;
+  s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Reading metablock
+  s = ReadBlockContents(file, footer, read_options, block_handle, contents, env,
+                        false);
+
+  return s;
+}
+
 }  // namespace rocksdb
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
index 6cfc0babd..798a18af0 100644
--- a/table/meta_blocks.h
+++ b/table/meta_blocks.h
@@ -15,6 +15,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_builder.h"
+#include "table/format.h"
 
 namespace rocksdb {
 
@@ -128,4 +129,18 @@ Status FindMetaBlock(Iterator* meta_index_iter,
                      const std::string& meta_block_name,
                      BlockHandle* block_handle);
 
+// Find the meta block
+Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+                     uint64_t table_magic_number, Env* env,
+                     const std::string& meta_block_name,
+                     BlockHandle* block_handle);
+
+// Read the specified meta block with name meta_block_name
+// from `file` and
initialize `contents` with contents of this block. +// Return Status::OK in case of success. +Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, + uint64_t table_magic_number, Env* env, + const std::string& meta_block_name, + BlockContents* contents); + } // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index e9c3f3624..4f3b62ad4 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_builder.h" +#include #include #include @@ -17,6 +18,8 @@ #include "table/plain_table_factory.h" #include "db/dbformat.h" #include "table/block_builder.h" +#include "table/bloom_block.h" +#include "table/plain_table_index.h" #include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" @@ -54,20 +57,36 @@ Status WriteBlock( extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; -PlainTableBuilder::PlainTableBuilder(const Options& options, WritableFile* file, - uint32_t user_key_len, - EncodingType encoding_type, - size_t index_sparseness) +PlainTableBuilder::PlainTableBuilder( + const Options& options, WritableFile* file, uint32_t user_key_len, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, + double hash_table_ratio, bool store_index_in_file) : options_(options), + bloom_block_(num_probes), file_(file), + bloom_bits_per_key_(bloom_bits_per_key), + huge_page_tlb_size_(huge_page_tlb_size), encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), - index_sparseness) { + index_sparseness), + store_index_in_file_(store_index_in_file), + prefix_extractor_(options.prefix_extractor.get()) { + // Build index block and save it in the file if hash_table_ratio > 0 + if (store_index_in_file_) { + assert(hash_table_ratio > 0 || IsTotalOrderMode()); + index_builder_.reset( + new PlainTableIndexBuilder(&arena_, options, index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); + assert(bloom_bits_per_key_ > 0); + properties_.user_collected_properties + [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use + } + properties_.fixed_key_len = user_key_len; // for plain table, we put all the data in a big chuck. properties_.num_data_blocks = 1; - // emphasize that currently plain table doesn't have persistent index or - // filter block. 
+ // Fill it later if store_index_in_file_ == true properties_.index_size = 0; properties_.filter_size = 0; // To support roll-back to previous version, now still use version 0 for @@ -100,9 +119,28 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { char meta_bytes_buf[6]; size_t meta_bytes_buf_size = 0; + ParsedInternalKey internal_key; + ParseInternalKey(key, &internal_key); + + // Store key hash + if (store_index_in_file_) { + if (options_.prefix_extractor.get() == nullptr) { + keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); + } else { + Slice prefix = + options_.prefix_extractor->Transform(internal_key.user_key); + keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); + } + } + + // Write value + auto prev_offset = offset_; // Write out the key encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, &meta_bytes_buf_size); + if (SaveIndexInFile()) { + index_builder_->AddKeyPrefix(GetPrefix(internal_key), prev_offset); + } // Write value length int value_size = value.size(); @@ -133,12 +171,51 @@ Status PlainTableBuilder::Finish() { properties_.data_size = offset_; - // Write the following blocks - // 1. [meta block: properties] - // 2. [metaindex block] - // 3. [footer] + // Write the following blocks + // 1. [meta block: bloom] - optional + // 2. [meta block: index] - optional + // 3. [meta block: properties] + // 4. [metaindex block] + // 5. [footer] + MetaIndexBuilder meta_index_builer; + if (store_index_in_file_ && (properties_.num_entries > 0)) { + bloom_block_.SetTotalBits( + &arena_, properties_.num_entries * bloom_bits_per_key_, + options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); + + PutVarint32(&properties_.user_collected_properties + [PlainTablePropertyNames::kNumBloomBlocks], + bloom_block_.GetNumBlocks()); + + bloom_block_.AddKeysHashes(keys_or_prefixes_hashes_); + BlockHandle bloom_block_handle; + auto finish_result = bloom_block_.Finish(); + + properties_.filter_size = finish_result.size(); + auto s = WriteBlock(finish_result, file_, &offset_, &bloom_block_handle); + + if (!s.ok()) { + return s; + } + + BlockHandle index_block_handle; + finish_result = index_builder_->Finish(); + + properties_.index_size = finish_result.size(); + s = WriteBlock(finish_result, file_, &offset_, &index_block_handle); + + if (!s.ok()) { + return s; + } + + meta_index_builer.Add(BloomBlockBuilder::kBloomBlock, bloom_block_handle); + meta_index_builer.Add(PlainTableIndexBuilder::kPlainTableIndexBlock, + index_block_handle); + } + + // Calculate bloom block size and index block size PropertyBlockBuilder property_block_builder; // -- Add basic properties property_block_builder.AddTableProperty(properties_); diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index a0bc1513a..2871d887e 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -13,6 +13,8 @@ #include "table/plain_table_key_coding.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" +#include "table/bloom_block.h" +#include "table/plain_table_index.h" namespace rocksdb { @@ -30,7 +32,10 @@ class PlainTableBuilder: public TableBuilder { // that the caller does not know which level the output file will reside. 
PlainTableBuilder(const Options& options, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, - size_t index_sparseness); + size_t index_sparseness, uint32_t bloom_bits_per_key, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + double hash_table_ratio = 0, + bool store_index_in_file = false); // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); @@ -62,18 +67,59 @@ class PlainTableBuilder: public TableBuilder { // Finish() call, returns the size of the final generated file. uint64_t FileSize() const override; + bool SaveIndexInFile() const { return store_index_in_file_; } + private: + Arena arena_; Options options_; std::vector> table_properties_collectors_; + + BloomBlockBuilder bloom_block_; + std::unique_ptr index_builder_; + WritableFile* file_; uint64_t offset_ = 0; + uint32_t bloom_bits_per_key_; + uint32_t huge_page_tlb_size_; Status status_; TableProperties properties_; PlainTableKeyEncoder encoder_; + bool store_index_in_file_; + + std::vector keys_or_prefixes_hashes_; bool closed_ = false; // Either Finish() or Abandon() has been called. + const SliceTransform* prefix_extractor_; + + Slice GetPrefix(const Slice& target) const { + assert(target.size() >= 8); // target is internal key + return GetPrefixFromUserKey(GetUserKey(target)); + } + + Slice GetPrefix(const ParsedInternalKey& target) const { + return GetPrefixFromUserKey(target.user_key); + } + + Slice GetUserKey(const Slice& key) const { + return Slice(key.data(), key.size() - 8); + } + + Slice GetPrefixFromUserKey(const Slice& user_key) const { + if (!IsTotalOrderMode()) { + return prefix_extractor_->Transform(user_key); + } else { + // Use empty slice as prefix if prefix_extractor is not set. + // In that case, + // it falls back to pure binary search and + // total iterator seek is supported. 
+ return Slice(); + } + } + + bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } + // No copying allowed PlainTableBuilder(const PlainTableBuilder&) = delete; void operator=(const PlainTableBuilder&) = delete; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index fd070aad6..bd9d91d1c 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -30,7 +30,9 @@ TableBuilder* PlainTableFactory::NewTableBuilder( const Options& options, const InternalKeyComparator& internal_comparator, WritableFile* file, CompressionType compression_type) const { return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, - index_sparseness_); + index_sparseness_, bloom_bits_per_key_, 6, + huge_page_tlb_size_, hash_table_ratio_, + store_index_in_file_); } extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options) { @@ -43,5 +45,11 @@ const std::string PlainTablePropertyNames::kPrefixExtractorName = const std::string PlainTablePropertyNames::kEncodingType = "rocksdb.plain.table.encoding.type"; +const std::string PlainTablePropertyNames::kBloomVersion = + "rocksdb.plain.table.bloom.version"; + +const std::string PlainTablePropertyNames::kNumBloomBlocks = + "rocksdb.plain.table.bloom.numblocks"; + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 0c646a9e0..ed54c4d10 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -151,7 +151,8 @@ class PlainTableFactory : public TableFactory { index_sparseness_(options.index_sparseness), huge_page_tlb_size_(options.huge_page_tlb_size), encoding_type_(options.encoding_type), - full_scan_mode_(options.full_scan_mode) {} + full_scan_mode_(options.full_scan_mode), + store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } Status NewTableReader(const Options& options, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, @@ -173,6 +174,7 @@ class PlainTableFactory : public TableFactory { size_t huge_page_tlb_size_; EncodingType encoding_type_; bool full_scan_mode_; + bool store_index_in_file_; }; } // namespace rocksdb diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc new file mode 100644 index 000000000..b3fc288ff --- /dev/null +++ b/table/plain_table_index.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/plain_table_index.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { +inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { + assert(num_buckets > 0); + return hash % num_buckets; +} +} + +void PlainTableIndex::InitFromRawData(Slice data) { + assert(GetVarint32(&data, &index_size_)); + assert(index_size_ > 0); + assert(GetVarint32(&data, &num_prefixes_)); + sub_index_size_ = data.size() - index_size_ * kOffsetLen; + + char* index_data_begin = const_cast(data.data()); + index_ = reinterpret_cast(index_data_begin); + sub_index_ = reinterpret_cast(index_ + index_size_); +} + +PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( + uint32_t prefix_hash, uint32_t* bucket_value) const { + int bucket = GetBucketIdFromHash(prefix_hash, index_size_); + *bucket_value = index_[bucket]; + if ((*bucket_value & kSubIndexMask) == kSubIndexMask) { + *bucket_value ^= kSubIndexMask; + return kSubindex; + } + if (*bucket_value >= kMaxFileSize) { + return kNoPrefixForBucket; + } else { + // point directly to the file + return kDirectToFile; + } +} + +void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash, + uint32_t offset) { + if (num_records_in_current_group_ == kNumRecordsPerGroup) { + current_group_ = AllocateNewGroup(); + num_records_in_current_group_ = 0; + } + auto& new_record = current_group_[num_records_in_current_group_++]; + new_record.hash = hash; + new_record.offset = offset; + new_record.next = nullptr; +} + +void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, + uint64_t key_offset) { + if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { + ++num_prefixes_; + if (!is_first_record_) { + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + } + num_keys_per_prefix_ = 0; + prev_key_prefix_ = key_prefix_slice.ToString(); + prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice); + due_index_ = true; + } + + if (due_index_) { + // Add an index key for every kIndexIntervalForSamePrefixKeys keys + record_list_.AddRecord(prev_key_prefix_hash_, key_offset); + due_index_ = false; + } + + num_keys_per_prefix_++; + if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) { + due_index_ = true; + } + is_first_record_ = false; +} + +Slice PlainTableIndexBuilder::Finish() { + AllocateIndex(); + std::vector hash_to_offsets(index_size_, nullptr); + std::vector entries_per_bucket(index_size_, 0); + BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); + + keys_per_prefix_hist_.Add(num_keys_per_prefix_); + Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + keys_per_prefix_hist_.ToString().c_str()); + + // From the temp data structure, populate indexes. + return FillIndexes(hash_to_offsets, entries_per_bucket); +} + +void PlainTableIndexBuilder::AllocateIndex() { + if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) { + // Fall back to pure binary search if the user fails to specify a prefix + // extractor. 
+ index_size_ = 1; + } else { + double hash_table_size_multipier = 1.0 / hash_table_ratio_; + index_size_ = num_prefixes_ * hash_table_size_multipier + 1; + assert(index_size_ > 0); + } +} + +void PlainTableIndexBuilder::BucketizeIndexes( + std::vector* hash_to_offsets, + std::vector* entries_per_bucket) { + bool first = true; + uint32_t prev_hash = 0; + size_t num_records = record_list_.GetNumRecords(); + for (size_t i = 0; i < num_records; i++) { + IndexRecord* index_record = record_list_.At(i); + uint32_t cur_hash = index_record->hash; + if (first || prev_hash != cur_hash) { + prev_hash = cur_hash; + first = false; + } + uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); + IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; + index_record->next = prev_bucket_head; + (*hash_to_offsets)[bucket] = index_record; + (*entries_per_bucket)[bucket]++; + } + + sub_index_size_ = 0; + for (auto entry_count : *entries_per_bucket) { + if (entry_count <= 1) { + continue; + } + // Only buckets with more than 1 entry will have subindex. + sub_index_size_ += VarintLength(entry_count); + // total bytes needed to store these entries' in-file offsets. + sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen; + } +} + +Slice PlainTableIndexBuilder::FillIndexes( + const std::vector& hash_to_offsets, + const std::vector& entries_per_bucket) { + Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + sub_index_size_); + auto total_allocate_size = GetTotalSize(); + char* allocated = arena_->AllocateAligned( + total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + + auto temp_ptr = EncodeVarint32(allocated, index_size_); + uint32_t* index = + reinterpret_cast(EncodeVarint32(temp_ptr, num_prefixes_)); + char* sub_index = reinterpret_cast(index + index_size_); + + size_t sub_index_offset = 0; + for (uint32_t i = 0; i < index_size_; i++) { + uint32_t num_keys_for_bucket = entries_per_bucket[i]; + switch (num_keys_for_bucket) { + case 0: + // No key for bucket + index[i] = PlainTableIndex::kMaxFileSize; + break; + case 1: + // point directly to the file offset + index[i] = hash_to_offsets[i]->offset; + break; + default: + // point to second level indexes. + index[i] = sub_index_offset | PlainTableIndex::kSubIndexMask; + char* prev_ptr = &sub_index[sub_index_offset]; + char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); + sub_index_offset += (cur_ptr - prev_ptr); + char* sub_index_pos = &sub_index[sub_index_offset]; + IndexRecord* record = hash_to_offsets[i]; + int j; + for (j = num_keys_for_bucket - 1; j >= 0 && record; + j--, record = record->next) { + EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); + } + assert(j == -1 && record == nullptr); + sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket; + assert(sub_index_offset <= sub_index_size_); + break; + } + } + assert(sub_index_offset == sub_index_size_); + + Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + index_size_, sub_index_size_); + return Slice(allocated, GetTotalSize()); +} + +const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = + "PlainTableIndexBlock"; +}; // namespace rocksdb diff --git a/table/plain_table_index.h b/table/plain_table_index.h new file mode 100644 index 000000000..347bb0f05 --- /dev/null +++ b/table/plain_table_index.h @@ -0,0 +1,221 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/options.h" +#include "util/murmurhash.h" +#include "util/hash.h" +#include "util/arena.h" +#include "util/histogram.h" + +namespace rocksdb { + +// PlainTableIndex contains buckets size of index_size_, each is a +// 32-bit integer. The lower 31 bits contain an offset value (explained below) +// and the first bit of the integer indicates type of the offset. +// +// +--------------+------------------------------------------------------+ +// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) + +// +--------------+------------------------------------------------------+ +// +// Explanation for the "flag bit": +// +// 0 indicates that the bucket contains only one prefix (no conflict when +// hashing this prefix), whose first row starts from this offset of the +// file. +// 1 indicates that the bucket contains more than one prefixes, or there +// are too many rows for one prefix so we need a binary search for it. In +// this case, the offset indicates the offset of sub_index_ holding the +// binary search indexes of keys for those rows. Those binary search indexes +// are organized in this way: +// +// The first 4 bytes, indicate how many indexes (N) are stored after it. After +// it, there are N 32-bit integers, each points of an offset of the file, +// which +// points to starting of a row. Those offsets need to be guaranteed to be in +// ascending order so the keys they are pointing to are also in ascending +// order +// to make sure we can use them to do binary searches. Below is visual +// presentation of a bucket. +// +// +// number_of_records: varint32 +// record 1 file offset: fixedint32 +// record 2 file offset: fixedint32 +// .... +// record N file offset: fixedint32 +// +class PlainTableIndex { + public: + enum IndexSearchResult { + kNoPrefixForBucket = 0, + kDirectToFile = 1, + kSubindex = 2 + }; + + explicit PlainTableIndex(Slice data) { InitFromRawData(data); } + + PlainTableIndex() + : index_size_(0), + sub_index_size_(0), + num_prefixes_(0), + index_(nullptr), + sub_index_(nullptr) {} + + IndexSearchResult GetOffset(uint32_t prefix_hash, + uint32_t* bucket_value) const; + + void InitFromRawData(Slice data); + + const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, + uint32_t* upper_bound) const { + const char* index_ptr = &sub_index_[offset]; + return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound); + } + + uint32_t GetIndexSize() const { return index_size_; } + + uint32_t GetSubIndexSize() const { return sub_index_size_; } + + uint32_t GetNumPrefixes() const { return num_prefixes_; } + + static const uint64_t kMaxFileSize = (1u << 31) - 1; + static const uint32_t kSubIndexMask = 0x80000000; + static const size_t kOffsetLen = sizeof(uint32_t); + + private: + uint32_t index_size_; + size_t sub_index_size_; + uint32_t num_prefixes_; + + uint32_t* index_; + char* sub_index_; +}; + +// PlainTableIndexBuilder is used to create plain table index. +// After calling Finish(), it returns Slice, which is usually +// used either to initialize PlainTableIndex or +// to save index to sst file. 
+// For more details about the index, please refer to: +// https://github.com/facebook/rocksdb/wiki/PlainTable-Format +// #wiki-in-memory-index-format +class PlainTableIndexBuilder { + public: + PlainTableIndexBuilder(Arena* arena, const Options& options, + uint32_t index_sparseness, double hash_table_ratio, + double huge_page_tlb_size) + : arena_(arena), + options_(options), + record_list_(kRecordsPerGroup), + is_first_record_(true), + due_index_(false), + num_prefixes_(0), + num_keys_per_prefix_(0), + prev_key_prefix_hash_(0), + index_sparseness_(index_sparseness), + prefix_extractor_(options.prefix_extractor.get()), + hash_table_ratio_(hash_table_ratio), + huge_page_tlb_size_(huge_page_tlb_size) {} + + void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset); + + Slice Finish(); + + uint32_t GetTotalSize() const { + return VarintLength(index_size_) + VarintLength(num_prefixes_) + + PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_; + } + + static const std::string kPlainTableIndexBlock; + + private: + struct IndexRecord { + uint32_t hash; // hash of the prefix + uint32_t offset; // offset of a row + IndexRecord* next; + }; + + // Helper class to track all the index records + class IndexRecordList { + public: + explicit IndexRecordList(size_t num_records_per_group) + : kNumRecordsPerGroup(num_records_per_group), + current_group_(nullptr), + num_records_in_current_group_(num_records_per_group) {} + + ~IndexRecordList() { + for (size_t i = 0; i < groups_.size(); i++) { + delete[] groups_[i]; + } + } + + void AddRecord(murmur_t hash, uint32_t offset); + + size_t GetNumRecords() const { + return (groups_.size() - 1) * kNumRecordsPerGroup + + num_records_in_current_group_; + } + IndexRecord* At(size_t index) { + return &(groups_[index / kNumRecordsPerGroup] + [index % kNumRecordsPerGroup]); + } + + private: + IndexRecord* AllocateNewGroup() { + IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; + groups_.push_back(result); + return result; + } + + // Each group in `groups_` contains fix-sized records (determined by + // kNumRecordsPerGroup). Which can help us minimize the cost if resizing + // occurs. + const size_t kNumRecordsPerGroup; + IndexRecord* current_group_; + // List of arrays allocated + std::vector groups_; + size_t num_records_in_current_group_; + }; + + void AllocateIndex(); + + // Internal helper function to bucket index record list to hash buckets. + void BucketizeIndexes(std::vector* hash_to_offsets, + std::vector* entries_per_bucket); + + // Internal helper class to fill the indexes and bloom filters to internal + // data structures. + Slice FillIndexes(const std::vector& hash_to_offsets, + const std::vector& entries_per_bucket); + + Arena* arena_; + Options options_; + HistogramImpl keys_per_prefix_hist_; + IndexRecordList record_list_; + bool is_first_record_; + bool due_index_; + uint32_t num_prefixes_; + uint32_t num_keys_per_prefix_; + + uint32_t prev_key_prefix_hash_; + uint32_t index_sparseness_; + uint32_t index_size_; + size_t sub_index_size_; + + const SliceTransform* prefix_extractor_; + double hash_table_ratio_; + double huge_page_tlb_size_; + + std::string prev_key_prefix_; + + static const size_t kRecordsPerGroup = 256; +}; + +}; // namespace rocksdb diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index a2aec8a6f..e20446f1e 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -3,6 +3,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE + #include "table/plain_table_reader.h" #include @@ -18,6 +19,7 @@ #include "rocksdb/statistics.h" #include "table/block.h" +#include "table/bloom_block.h" #include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" @@ -39,15 +41,6 @@ namespace rocksdb { namespace { -inline uint32_t GetSliceHash(const Slice& s) { - return Hash(s.data(), s.size(), 397) ; -} - -inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { - assert(num_buckets >= 0); - return hash % num_buckets; -} - // Safely getting a uint32_t element from a char array, where, starting from // `base`, every 4 bytes are considered as an fixed 32 bit integer. inline uint32_t GetFixed32Element(const char* base, size_t offset) { @@ -103,6 +96,7 @@ PlainTableReader::PlainTableReader(const Options& options, const TableProperties* table_properties) : internal_comparator_(icomparator), encoding_type_(encoding_type), + full_scan_mode_(false), data_end_offset_(table_properties->data_size), user_key_len_(table_properties->fixed_key_len), prefix_extractor_(options.prefix_extractor.get()), @@ -126,8 +120,7 @@ Status PlainTableReader::Open(const Options& options, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { assert(options.allow_mmap_reads); - - if (file_size > kMaxFileSize) { + if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } @@ -173,7 +166,6 @@ Status PlainTableReader::Open(const Options& options, return s; } - // -- Populate Index if (!full_scan_mode) { s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, index_sparseness, huge_page_tlb_size); @@ -183,7 +175,7 @@ Status PlainTableReader::Open(const Options& options, } else { // Flag to indicate it is a full scan mode so that none of the indexes // can be used. - new_reader->index_size_ = kFullScanModeFlag; + new_reader->full_scan_mode_ = true; } *table_reader = std::move(new_reader); @@ -203,79 +195,15 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options, } } -struct PlainTableReader::IndexRecord { - uint32_t hash; // hash of the prefix - uint32_t offset; // offset of a row - IndexRecord* next; -}; - -// Helper class to track all the index records -class PlainTableReader::IndexRecordList { - public: - explicit IndexRecordList(size_t num_records_per_group) - : kNumRecordsPerGroup(num_records_per_group), - current_group_(nullptr), - num_records_in_current_group_(num_records_per_group) {} - - ~IndexRecordList() { - for (size_t i = 0; i < groups_.size(); i++) { - delete[] groups_[i]; - } - } - - void AddRecord(murmur_t hash, uint32_t offset) { - if (num_records_in_current_group_ == kNumRecordsPerGroup) { - current_group_ = AllocateNewGroup(); - num_records_in_current_group_ = 0; - } - auto& new_record = current_group_[num_records_in_current_group_++]; - new_record.hash = hash; - new_record.offset = offset; - new_record.next = nullptr; - } - - size_t GetNumRecords() const { - return (groups_.size() - 1) * kNumRecordsPerGroup + - num_records_in_current_group_; - } - IndexRecord* At(size_t index) { - return &(groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); - } - - private: - IndexRecord* AllocateNewGroup() { - IndexRecord* result = new IndexRecord[kNumRecordsPerGroup]; - groups_.push_back(result); - return result; - } - - // Each group in `groups_` contains fix-sized records (determined by - // kNumRecordsPerGroup). 
Which can help us minimize the cost if resizing - // occurs. - const size_t kNumRecordsPerGroup; - IndexRecord* current_group_; - // List of arrays allocated - std::vector groups_; - size_t num_records_in_current_group_; -}; - -Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, - int* num_prefixes, - int bloom_bits_per_key, - size_t index_sparseness) { +Status PlainTableReader::PopulateIndexRecordList( + PlainTableIndexBuilder* index_builder, vector* prefix_hashes) { Slice prev_key_prefix_slice; - uint32_t prev_key_prefix_hash = 0; uint32_t pos = data_start_offset_; - int num_keys_per_prefix = 0; - bool is_first_record = true; - HistogramImpl keys_per_prefix_hist; - // Need map to be ordered to make sure sub indexes generated - // are in order. - *num_prefixes = 0; + bool is_first_record = true; + Slice key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, options_.prefix_extractor.get()); - bool due_index = false; while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -285,152 +213,53 @@ Status PlainTableReader::PopulateIndexRecordList(IndexRecordList* record_list, if (!s.ok()) { return s; } + + key_prefix_slice = GetPrefix(key); if (enable_bloom_) { - // total order mode and bloom filter is enabled. bloom_.AddHash(GetSliceHash(key.user_key)); - } - Slice key_prefix_slice = GetPrefix(key); - - if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { - ++(*num_prefixes); - if (!is_first_record) { - keys_per_prefix_hist.Add(num_keys_per_prefix); + } else { + if (is_first_record || prev_key_prefix_slice != key_prefix_slice) { + if (!is_first_record) { + prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice)); + } + prev_key_prefix_slice = key_prefix_slice; } - num_keys_per_prefix = 0; - prev_key_prefix_slice = key_prefix_slice; - prev_key_prefix_hash = GetSliceHash(key_prefix_slice); - due_index = true; } - if (due_index) { - if (!seekable) { - return Status::Corruption("Key for a prefix is not seekable"); - } - // Add an index key for every kIndexIntervalForSamePrefixKeys keys - record_list->AddRecord(prev_key_prefix_hash, key_offset); - due_index = false; + index_builder->AddKeyPrefix(GetPrefix(key), key_offset); + + if (!seekable && is_first_record) { + return Status::Corruption("Key for a prefix is not seekable"); } - num_keys_per_prefix++; - if (index_sparseness == 0 || num_keys_per_prefix % index_sparseness == 0) { - due_index = true; - } is_first_record = false; } - keys_per_prefix_hist.Add(num_keys_per_prefix); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", - keys_per_prefix_hist.ToString().c_str()); - + prefix_hashes->push_back(GetSliceHash(key_prefix_slice)); + index_.InitFromRawData(index_builder->Finish()); return Status::OK(); } -void PlainTableReader::AllocateIndexAndBloom(int num_prefixes, - int bloom_bits_per_key, - double hash_table_ratio, - size_t huge_page_tlb_size) { - if (prefix_extractor_ != nullptr) { +void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, + int num_prefixes, + size_t huge_page_tlb_size, + vector* prefix_hashes) { + if (!IsTotalOrderMode()) { uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, huge_page_tlb_size, options_.info_log.get()); + FillBloom(prefix_hashes); } } - - if (prefix_extractor_ == nullptr || hash_table_ratio <= 0) { - // Fall back to pure binary search if the user fails to 
specify a prefix - // extractor. - index_size_ = 1; - } else { - double hash_table_size_multipier = 1.0 / hash_table_ratio; - index_size_ = num_prefixes * hash_table_size_multipier + 1; - } } -size_t PlainTableReader::BucketizeIndexesAndFillBloom( - IndexRecordList* record_list, std::vector* hash_to_offsets, - std::vector* entries_per_bucket) { - bool first = true; - uint32_t prev_hash = 0; - size_t num_records = record_list->GetNumRecords(); - for (size_t i = 0; i < num_records; i++) { - IndexRecord* index_record = record_list->At(i); - uint32_t cur_hash = index_record->hash; - if (first || prev_hash != cur_hash) { - prev_hash = cur_hash; - first = false; - if (enable_bloom_ && !IsTotalOrderMode()) { - bloom_.AddHash(cur_hash); - } - } - uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_); - IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket]; - index_record->next = prev_bucket_head; - (*hash_to_offsets)[bucket] = index_record; - (*entries_per_bucket)[bucket]++; +void PlainTableReader::FillBloom(vector* prefix_hashes) { + assert(bloom_.IsInitialized()); + for (auto prefix_hash : *prefix_hashes) { + bloom_.AddHash(prefix_hash); } - size_t sub_index_size = 0; - for (auto entry_count : *entries_per_bucket) { - if (entry_count <= 1) { - continue; - } - // Only buckets with more than 1 entry will have subindex. - sub_index_size += VarintLength(entry_count); - // total bytes needed to store these entries' in-file offsets. - sub_index_size += entry_count * kOffsetLen; - } - return sub_index_size; -} - -void PlainTableReader::FillIndexes( - const size_t kSubIndexSize, - const std::vector& hash_to_offsets, - const std::vector& entries_per_bucket, - size_t huge_page_tlb_size) { - Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", - kSubIndexSize); - auto total_allocate_size = sizeof(uint32_t) * index_size_ + kSubIndexSize; - char* allocated = arena_.AllocateAligned( - total_allocate_size, huge_page_tlb_size, options_.info_log.get()); - index_ = reinterpret_cast(allocated); - sub_index_ = allocated + sizeof(uint32_t) * index_size_; - - size_t sub_index_offset = 0; - for (int i = 0; i < index_size_; i++) { - uint32_t num_keys_for_bucket = entries_per_bucket[i]; - switch (num_keys_for_bucket) { - case 0: - // No key for bucket - index_[i] = data_end_offset_; - break; - case 1: - // point directly to the file offset - index_[i] = hash_to_offsets[i]->offset; - break; - default: - // point to second level indexes. - index_[i] = sub_index_offset | kSubIndexMask; - char* prev_ptr = &sub_index_[sub_index_offset]; - char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); - sub_index_offset += (cur_ptr - prev_ptr); - char* sub_index_pos = &sub_index_[sub_index_offset]; - IndexRecord* record = hash_to_offsets[i]; - int j; - for (j = num_keys_for_bucket - 1; j >= 0 && record; - j--, record = record->next) { - EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset); - } - assert(j == -1 && record == nullptr); - sub_index_offset += kOffsetLen * num_keys_for_bucket; - assert(sub_index_offset <= kSubIndexSize); - break; - } - } - assert(sub_index_offset == kSubIndexSize); - - Log(options_.info_log, "hash table size: %d, suffix_map length %zu", - index_size_, kSubIndexSize); } Status PlainTableReader::MmapDataFile() { @@ -445,59 +274,111 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, size_t huge_page_tlb_size) { assert(props != nullptr); table_properties_.reset(props); - // options.prefix_extractor is requried for a hash-based look-up. 
- if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { + BlockContents bloom_block_contents; + auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, + options_.env, BloomBlockBuilder::kBloomBlock, + &bloom_block_contents); + bool index_in_file = s.ok(); + + BlockContents index_block_contents; + s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, + options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); + + index_in_file &= s.ok(); + + Slice* bloom_block; + if (index_in_file) { + bloom_block = &bloom_block_contents.data; + } else { + bloom_block = nullptr; + } + + // index_in_file == true only if there are kBloomBlock and + // kPlainTableIndexBlock + // in file + + Slice* index_block; + if (index_in_file) { + index_block = &index_block_contents.data; + } else { + index_block = nullptr; + } + + if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { // options.prefix_extractor is requried for a hash-based look-up. return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } - IndexRecordList record_list(kRecordsPerGroup); // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows // for a prefix (starting from the first one), generate a record of (hash, // offset) and append it to IndexRecordList, which is a data structure created // to store them. - int num_prefixes; - // Allocate bloom filter here for total order mode. - if (IsTotalOrderMode()) { - uint32_t num_bloom_bits = - table_properties_->num_entries * bloom_bits_per_key; - if (num_bloom_bits > 0) { - enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + if (!index_in_file) { + // Allocate bloom filter here for total order mode. + if (IsTotalOrderMode()) { + uint32_t num_bloom_bits = + table_properties_->num_entries * bloom_bits_per_key; + if (num_bloom_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, + huge_page_tlb_size, options_.info_log.get()); + } } + } else { + enable_bloom_ = true; + auto num_blocks_property = props->user_collected_properties.find( + PlainTablePropertyNames::kNumBloomBlocks); + + uint32_t num_blocks = 0; + if (num_blocks_property != props->user_collected_properties.end()) { + Slice temp_slice(num_blocks_property->second); + if (!GetVarint32(&temp_slice, &num_blocks)) { + num_blocks = 0; + } + } + // cast away const qualifier, because bloom_ won't be changed + bloom_.SetRawData( + const_cast( + reinterpret_cast(bloom_block->data())), + bloom_block->size() * 8, num_blocks); } - Status s = PopulateIndexRecordList(&record_list, &num_prefixes, - bloom_bits_per_key, index_sparseness); - if (!s.ok()) { - return s; - } - // Calculated hash table and bloom filter size and allocate memory for indexes - // and bloom filter based on the number of prefixes. - AllocateIndexAndBloom(num_prefixes, bloom_bits_per_key, hash_table_ratio, - huge_page_tlb_size); + PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, + hash_table_ratio, huge_page_tlb_size); - // Bucketize all the index records to a temp data structure, in which for - // each bucket, we generate a linked list of IndexRecord, in reversed order. 
-  std::vector hash_to_offsets(index_size_, nullptr);
-  std::vector entries_per_bucket(index_size_, 0);
-  size_t sub_index_size_needed = BucketizeIndexesAndFillBloom(
-      &record_list, &hash_to_offsets, &entries_per_bucket);
-  // From the temp data structure, populate indexes.
-  FillIndexes(sub_index_size_needed, hash_to_offsets, entries_per_bucket,
-              huge_page_tlb_size);
+  std::vector prefix_hashes;
+  if (!index_in_file) {
+    Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
+    if (!s.ok()) {
+      return s;
+    }
+  } else {
+    index_.InitFromRawData(*index_block);
+  }
+
+  if (!index_in_file) {
+    // Calculated bloom filter size and allocate memory for
+    // bloom filter based on the number of prefixes, then fill it.
+    AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
+                         huge_page_tlb_size, &prefix_hashes);
+  }
 
   // Fill two table properties.
-  // TODO(sdong): after we have the feature of storing index in file, this
-  // properties need to be populated to index_size instead.
-  props->user_collected_properties["plain_table_hash_table_size"] =
-      std::to_string(index_size_ * 4U);
-  props->user_collected_properties["plain_table_sub_index_size"] =
-      std::to_string(sub_index_size_needed);
+  if (!index_in_file) {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(index_.GetSubIndexSize());
+  } else {
+    props->user_collected_properties["plain_table_hash_table_size"] =
+        std::to_string(0);
+    props->user_collected_properties["plain_table_sub_index_size"] =
+        std::to_string(0);
+  }
 
   return Status::OK();
 }
@@ -506,24 +387,21 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
                                    uint32_t prefix_hash, bool& prefix_matched,
                                    uint32_t* offset) const {
   prefix_matched = false;
-  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
-  uint32_t bucket_value = index_[bucket];
-  if (bucket_value == data_end_offset_) {
+  uint32_t prefix_index_offset;
+  auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
+  if (res == PlainTableIndex::kNoPrefixForBucket) {
     *offset = data_end_offset_;
     return Status::OK();
-  } else if ((bucket_value & kSubIndexMask) == 0) {
-    // point directly to the file
-    *offset = bucket_value;
+  } else if (res == PlainTableIndex::kDirectToFile) {
+    *offset = prefix_index_offset;
     return Status::OK();
   }
 
   // point to sub-index, need to do a binary search
+  uint32_t upper_bound;
+  const char* base_ptr =
+      index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
   uint32_t low = 0;
-  uint64_t prefix_index_offset = bucket_value ^ kSubIndexMask;
-
-  const char* index_ptr = &sub_index_[prefix_index_offset];
-  uint32_t upper_bound = 0;
-  const char* base_ptr = GetVarint32Ptr(index_ptr, index_ptr + 4, &upper_bound);
   uint32_t high = upper_bound;
   ParsedInternalKey mid_key;
   ParsedInternalKey parsed_target;
@@ -593,9 +471,6 @@ bool PlainTableReader::MatchBloom(uint32_t hash) const {
   return !enable_bloom_ || bloom_.MayContainHash(hash);
 }
 
-Slice PlainTableReader::GetPrefix(const ParsedInternalKey& target) const {
-  return GetPrefixFromUserKey(target.user_key);
-}
 
 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
                               ParsedInternalKey* parsed_key,
@@ -650,8 +525,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   Slice prefix_slice;
   uint32_t prefix_hash;
   if (IsTotalOrderMode()) {
-    if (index_size_ == kFullScanModeFlag) {
-      // Full Scan Mode
+    if (full_scan_mode_) {
       status_ =
           Status::InvalidArgument("Get() is not allowed in full scan mode.");
     }
@@ -682,7 +556,6 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   if (!ParseInternalKey(target, &parsed_target)) {
     return Status::Corruption(Slice());
   }
-  Slice found_value;
 
   PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
                                options_.prefix_extractor.get());
@@ -747,13 +620,12 @@ void PlainTableIterator::Seek(const Slice& target) {
   // If the user doesn't set prefix seek option and we are not able to do a
   // total Seek(). assert failure.
   if (!use_prefix_seek_) {
-    if (table_->index_size_ == PlainTableReader::kFullScanModeFlag) {
-      // Full Scan Mode.
+    if (table_->full_scan_mode_) {
       status_ =
           Status::InvalidArgument("Seek() is not allowed in full scan mode.");
       offset_ = next_offset_ = table_->data_end_offset_;
       return;
-    } else if (table_->index_size_ > 1) {
+    } else if (table_->GetIndexSize() > 1) {
       assert(false);
       status_ = Status::NotSupported(
           "PlainTable cannot issue non-prefix seek unless in total order "
diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h
index 6759966c1..cb1e32f4d 100644
--- a/table/plain_table_reader.h
+++ b/table/plain_table_reader.h
@@ -19,12 +19,14 @@
 #include "rocksdb/table_properties.h"
 #include "table/table_reader.h"
 #include "table/plain_table_factory.h"
+#include "table/plain_table_index.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
 
 namespace rocksdb {
 
 class Block;
+class BlockContents;
 class BlockHandle;
 class Footer;
 struct Options;
@@ -37,6 +39,7 @@ class PlainTableKeyDecoder;
 
 using std::unique_ptr;
 using std::unordered_map;
+using std::vector;
 
 extern const uint32_t kPlainTableVariableLength;
 // Based on following output file format shown in plain_table_factory.h
@@ -68,6 +71,7 @@ class PlainTableReader: public TableReader {
 
   uint64_t ApproximateOffsetOf(const Slice& key);
 
+  uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
   void SetupForCompaction();
 
   std::shared_ptr GetTableProperties() const {
@@ -93,65 +97,23 @@ class PlainTableReader: public TableReader {
   // props: the table properties object that need to be stored. Ownership of
   // the object will be passed.
   //
-  // index_ contains buckets size of index_size_, each is a
-  // 32-bit integer. The lower 31 bits contain an offset value (explained below)
-  // and the first bit of the integer indicates type of the offset.
-  //
-  // +--------------+------------------------------------------------------+
-  // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
-  // +--------------+------------------------------------------------------+
-  //
-  // Explanation for the "flag bit":
-  //
-  // 0 indicates that the bucket contains only one prefix (no conflict when
-  // hashing this prefix), whose first row starts from this offset of the
-  // file.
-  // 1 indicates that the bucket contains more than one prefixes, or there
-  // are too many rows for one prefix so we need a binary search for it. In
-  // this case, the offset indicates the offset of sub_index_ holding the
-  // binary search indexes of keys for those rows. Those binary search indexes
-  // are organized in this way:
-  //
-  // The first 4 bytes, indicate how many indexes (N) are stored after it. After
-  // it, there are N 32-bit integers, each points of an offset of the file,
-  // which
-  // points to starting of a row. Those offsets need to be guaranteed to be in
-  // ascending order so the keys they are pointing to are also in ascending
-  // order
-  // to make sure we can use them to do binary searches. Below is visual
-  // presentation of a bucket.
-  //
-  //
-  // number_of_records: varint32
-  // record 1 file offset: fixedint32
-  // record 2 file offset: fixedint32
-  // ....
-  // record N file offset: fixedint32
-  //
+
   Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
                        double hash_table_ratio, size_t index_sparseness,
                        size_t huge_page_tlb_size);
 
+  Status MmapDataFile();
  private:
-  struct IndexRecord;
-  class IndexRecordList;
-
-  // Plain table maintains an index and a sub index.
-  // index is implemented by a hash table.
-  // subindex is a big of memory array.
-  // For more details about the in-memory index, please refer to:
-  // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
-  // #wiki-in-memory-index-format
-  uint32_t* index_;
-  int index_size_ = 0;
-  char* sub_index_;
 
   const InternalKeyComparator internal_comparator_;
   EncodingType encoding_type_;
   // represents plain table's current status.
   Status status_;
 
   Slice file_data_;
+  PlainTableIndex index_;
+  bool full_scan_mode_;
+
   // data_start_offset_ and data_end_offset_ defines the range of the
   // sst file that stores data.
   const uint32_t data_start_offset_ = 0;
@@ -160,11 +122,6 @@ class PlainTableReader: public TableReader {
   const SliceTransform* prefix_extractor_;
 
   static const size_t kNumInternalBytes = 8;
-  static const uint32_t kSubIndexMask = 0x80000000;
-  static const size_t kOffsetLen = sizeof(uint32_t);
-  static const uint64_t kMaxFileSize = 1u << 31;
-  static const size_t kRecordsPerGroup = 256;
-  static const int kFullScanModeFlag = -1;
 
   // Bloom filter is used to rule out non-existent key
   bool enable_bloom_;
@@ -184,6 +141,31 @@ class PlainTableReader: public TableReader {
     return user_key_len_ + kNumInternalBytes;
   }
 
+  Slice GetPrefix(const Slice& target) const {
+    assert(target.size() >= 8);  // target is internal key
+    return GetPrefixFromUserKey(GetUserKey(target));
+  }
+
+  Slice GetPrefix(const ParsedInternalKey& target) const {
+    return GetPrefixFromUserKey(target.user_key);
+  }
+
+  Slice GetUserKey(const Slice& key) const {
+    return Slice(key.data(), key.size() - 8);
+  }
+
+  Slice GetPrefixFromUserKey(const Slice& user_key) const {
+    if (!IsTotalOrderMode()) {
+      return prefix_extractor_->Transform(user_key);
+    } else {
+      // Use empty slice as prefix if prefix_extractor is not set.
+      // In that case,
+      // it falls back to pure binary search and
+      // total iterator seek is supported.
+      return Slice();
+    }
+  }
+
   friend class TableCache;
   friend class PlainTableIterator;
 
@@ -191,33 +173,15 @@ class PlainTableReader: public TableReader {
   // the rows, which contains index records as a list.
   // If bloom_ is not null, all the keys' full-key hash will be added to the
   // bloom filter.
-  Status PopulateIndexRecordList(IndexRecordList* record_list,
-                                 int* num_prefixes, int bloom_bits_per_key,
-                                 size_t index_sparseness);
+  Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
+                                 vector* prefix_hashes);
 
-  // Internal helper function to allocate memory for indexes and bloom filters
-  void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
-                             double hash_table_ratio,
-                             size_t huge_page_tlb_size);
+  // Internal helper function to allocate memory for bloom filter and fill it
+  void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes,
+                            size_t huge_page_tlb_size,
+                            vector* prefix_hashes);
 
-  // Internal helper function to bucket index record list to hash buckets.
-  // bucket_header is a vector of size hash_table_size_, with each entry
-  // containing a linklist of IndexRecord hashed to the same bucket, in reverse
-  // order.
-  // of offsets for the hash, in reversed order.
-  // entries_per_bucket is sized of index_size_. The value is how many index
-  // records are there in bucket_headers for the same bucket.
-  size_t BucketizeIndexesAndFillBloom(
-      IndexRecordList* record_list, std::vector* bucket_headers,
-      std::vector* entries_per_bucket);
-
-  // Internal helper class to fill the indexes and bloom filters to internal
-  // data structures. bucket_headers and entries_per_bucket are bucketized
-  // indexes and counts generated by BucketizeIndexesAndFillBloom().
-  void FillIndexes(const size_t kSubIndexSize,
-                   const std::vector& bucket_headers,
-                   const std::vector& entries_per_bucket,
-                   size_t huge_page_tlb_size);
+  void FillBloom(vector* prefix_hashes);
 
   // Read the key and value at `offset` to parameters for keys, the and
   // `seekable`.
@@ -237,28 +201,6 @@ class PlainTableReader: public TableReader {
                    uint32_t prefix_hash, bool& prefix_matched,
                    uint32_t* offset) const;
 
-  Slice GetUserKey(const Slice& key) const {
-    return Slice(key.data(), key.size() - 8);
-  }
-
-  Slice GetPrefix(const Slice& target) const {
-    assert(target.size() >= 8);  // target is internal key
-    return GetPrefixFromUserKey(GetUserKey(target));
-  }
-
-  inline Slice GetPrefix(const ParsedInternalKey& target) const;
-
-  Slice GetPrefixFromUserKey(const Slice& user_key) const {
-    if (!IsTotalOrderMode()) {
-      return prefix_extractor_->Transform(user_key);
-    } else {
-      // Use empty slice as prefix if prefix_extractor is not set. In that case,
-      // it falls back to pure binary search and total iterator seek is
-      // supported.
-      return Slice();
-    }
-  }
-
   bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
 
   // No copying allowed
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index cbe895ace..4463faf51 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -48,6 +48,13 @@ DynamicBloom::DynamicBloom(uint32_t num_probes,
       kNumProbes(num_probes),
      hash_func_(hash_func == nullptr ? &BloomHash : hash_func) {}
 
+void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                              uint32_t num_blocks) {
+  data_ = raw_data;
+  kTotalBits = total_bits;
+  kNumBlocks = num_blocks;
+}
+
 void DynamicBloom::SetTotalBits(Arena* arena,
                                 uint32_t total_bits, uint32_t locality,
                                 size_t huge_page_tlb_size,
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index ba0016ddb..927710d24 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -5,6 +5,10 @@
 
 #pragma once
 
+#include 
+
+#include "rocksdb/slice.h"
+
 #include 
 #include 
@@ -57,6 +61,19 @@ class DynamicBloom {
 
   void Prefetch(uint32_t h);
 
+  uint32_t GetNumBlocks() const { return kNumBlocks; }
+
+  Slice GetRawData() const {
+    return Slice(reinterpret_cast(data_), GetTotalBits() / 8);
+  }
+
+  void SetRawData(unsigned char* raw_data, uint32_t total_bits,
+                  uint32_t num_blocks = 0);
+
+  uint32_t GetTotalBits() const { return kTotalBits; }
+
+  bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; }
+
  private:
   uint32_t kTotalBits;
   uint32_t kNumBlocks;
@@ -81,7 +98,7 @@ inline void DynamicBloom::Prefetch(uint32_t h) {
 }
 
 inline bool DynamicBloom::MayContainHash(uint32_t h) const {
-  assert(kNumBlocks > 0 || kTotalBits > 0);
+  assert(IsInitialized());
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
   if (kNumBlocks != 0) {
     uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
@@ -98,10 +115,6 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
       h += delta;
     }
   } else {
-    if (kTotalBits == 0) {
-      // Not initialized.
-      return true;
-    }
     for (uint32_t i = 0; i < kNumProbes; ++i) {
       const uint32_t bitpos = h % kTotalBits;
       if (((data_[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
@@ -114,7 +127,7 @@ inline bool DynamicBloom::MayContainHash(uint32_t h) const {
 }
 
 inline void DynamicBloom::AddHash(uint32_t h) {
-  assert(kNumBlocks > 0 || kTotalBits > 0);
+  assert(IsInitialized());
   const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
   if (kNumBlocks != 0) {
     uint32_t b = ((h >> 11 | (h << 21)) % kNumBlocks) * (CACHE_LINE_SIZE * 8);
diff --git a/util/hash.h b/util/hash.h
index c9eb659ab..2e7d30271 100644
--- a/util/hash.h
+++ b/util/hash.h
@@ -17,4 +17,7 @@ namespace rocksdb {
 
 extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
 
+inline uint32_t GetSliceHash(const Slice& s) {
+  return Hash(s.data(), s.size(), 397);
+}
 }
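
Usage note (editor's addition, not part of the patch): the sketch below shows how the new DynamicBloom::SetRawData()/GetRawData() accessors and GetSliceHash() from util/hash.h can be composed to carry a bloom filter through its raw byte representation, which appears to be what the store_index_in_file path needs when the bloom block is written into the SST file and later handed back to the reader. Everything specific here is an illustrative assumption: the BloomRawDataRoundTrip name, the 1024-bit size, the six probes, the sample keys, and backing the writer-side filter with a caller-owned std::vector; the patch itself locates the stored block via FindMetaBlock() rather than a local buffer, and it assumes the two-argument DynamicBloom(num_probes, hash_func) constructor whose definition appears in the dynamic_bloom.cc hunk.

```cpp
// Minimal sketch under the assumptions stated above; builds against the
// internal RocksDB headers touched by this patch.
#include <vector>

#include "rocksdb/slice.h"
#include "util/dynamic_bloom.h"
#include "util/hash.h"

namespace rocksdb {

void BloomRawDataRoundTrip() {  // hypothetical helper, not in the patch
  const uint32_t kSketchTotalBits = 1024;  // assumed size, multiple of 8
  std::vector<unsigned char> buffer(kSketchTotalBits / 8, 0);

  // Back a filter with a caller-owned buffer. num_blocks == 0 selects the
  // non-cache-line-local probing path in AddHash()/MayContainHash().
  DynamicBloom writer_side(6 /* num_probes */, nullptr /* default hash */);
  writer_side.SetRawData(buffer.data(), kSketchTotalBits, 0);

  for (const char* key : {"0000000000000foo", "0000000000000bar"}) {
    writer_side.AddHash(GetSliceHash(Slice(key)));
  }

  // GetRawData() exposes exactly kTotalBits / 8 bytes -- the bytes a table
  // builder could persist as a bloom meta block.
  Slice raw = writer_side.GetRawData();

  // "Reader" side: rehydrate a fresh filter from the same bytes and probe it.
  DynamicBloom reader_side(6, nullptr);
  reader_side.SetRawData(
      reinterpret_cast<unsigned char*>(const_cast<char*>(raw.data())),
      kSketchTotalBits, 0);
  bool maybe_present =
      reader_side.MayContainHash(GetSliceHash(Slice("0000000000000foo")));
  (void)maybe_present;  // true here; a bloom filter never misses added keys
}

}  // namespace rocksdb
```

Since SetRawData() only repoints data_ rather than copying, a reader can presumably aim the filter directly at the mmapped bloom block of an open table file, which is what keeps loading a stored index and filter cheap.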