diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index dc362b9c2..226451464 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -449,6 +449,70 @@ TEST_P(ComparatorDBTest, TwoStrComparator) { } } +TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { + { + // different length + Slice s("abcxy"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxyz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // not last byte different + Slice s("abc1xyz"); + Slice t("abc2xyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // same string + Slice s("abcxyz"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxy"); + Slice t("abcxz"); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xac"; + const char t_array[] = "\x50\x8a\xad"; + Slice s(s_array); + Slice t(t_array); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff"; + const char t_array[] = "\x50\x8b\x00"; + Slice s(s_array, 3); + Slice t(t_array, 3); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x00"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x01"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } +} + TEST_P(ComparatorDBTest, FindShortestSeparator) { std::string s1 = "abc1xyz"; std::string s2 = "abc3xy"; diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index d4b034c53..4a97f93d7 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1097,6 +1097,413 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); } +int CountIter(std::unique_ptr& iter, const Slice& key) { + int count = 0; + for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); + iter->Next()) { + count++; + } + return count; +} + +// use iterate_upper_bound to hint compatiability of existing bloom filters. +// The BF is considered compatible if 1) upper bound and seek key transform +// into the same string, or 2) the transformed seek key is of the same length +// as the upper bound and two keys are adjacent according to the comparator. +TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(4)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("abcdxxx0", "val1")); + ASSERT_OK(Put("abcdxxx1", "val2")); + ASSERT_OK(Put("abcdxxx2", "val3")); + ASSERT_OK(Put("abcdxxx3", "val4")); + dbfull()->Flush(FlushOptions()); + { + // prefix_extractor has not changed, BF will always be read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + } + { + Slice upper_bound("abcdzzzz"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.5")); + { + // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); + // should check bloom filter since upper bound meets requirement + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx01, abcey) is not valid bound since upper bound is too long for + // the BF in SST (capped:4) + Slice upper_bound("abcey"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); + // should skip bloom filter since upper bound is too long + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx02, abcdy) is a valid bound since the prefix is the same + Slice upper_bound("abcdy"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); + // should check bloom filter since upper bound matches transformed seek + // key + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the + // same prefix, 2) the prefixes are not consecutive + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); + // should skip bloom filter since mismatch is found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); + { + // [abc, abd) is not a valid bound since the upper bound is too short + // for BF (capped:4) + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); + { + // set back to capped:4 and verify BF is always read + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + iteration++; + } +} + +// Create multiple SST files each with a different prefix_extractor config, +// verify iterators can read all SST files using the latest config. +TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + { + // BF is cappped:3 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so filter is checked once and found no match. + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + { + // BF is fixed:2 now + std::unique_ptr iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 4 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 5 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 3); + // iter was created after the first setoptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 4); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 7 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 8 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 9 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 10 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? + iteration++; + } +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + { + std::unique_ptr iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + dbfull()->DestroyColumnFamilyHandle(handles_[2]); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + iteration++; + } +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behaves as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foo5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + iteration++; + } +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_test.cc b/db/db_test.cc index afff08673..346010bcd 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -4607,180 +4607,6 @@ TEST_F(DBTest, FileCreationRandomFailure) { } #ifndef ROCKSDB_LITE -int CountIter(Iterator* iter, const Slice& key) { - int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { - count++; - } - return count; -} - -// Create multiple SST files each with a different prefix_extractor config, -// verify iterators can read all SST files using the latest config. -TEST_F(DBTest, DynamicBloomFilterMultipleSST) { - Options options; - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - ReadOptions read_options; - read_options.prefix_same_as_start = true; - - // first SST with fixed:1 BF - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foq1", "bar1")); - ASSERT_OK(Put("fpa", "0")); - dbfull()->Flush(FlushOptions()); - Iterator* iter_old = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_old, "foo"), 4); - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - Iterator* iter = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter, "foo"), 2); - - // second SST with capped:3 BF - ASSERT_OK(Put("foo3", "bar3")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Put("foq5", "bar5")); - ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); - // BF is cappped:3 now - Iterator* iter_tmp = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); - delete iter_tmp; - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); - // third SST with fixed:2 BF - ASSERT_OK(Put("foo6", "bar6")); - ASSERT_OK(Put("foo7", "bar7")); - ASSERT_OK(Put("foq8", "bar8")); - ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); - // BF is fixed:2 now - iter_tmp = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); - delete iter_tmp; - - // TODO(Zhongyi): verify existing iterator cannot see newly inserted keys - ASSERT_EQ(CountIter(iter_old, "foo"), 4); - ASSERT_EQ(CountIter(iter, "foo"), 2); - delete iter; - delete iter_old; - - // keys in all three SSTs are visible to iterator - Iterator* iter_all = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_all, "foo"), 9); - delete iter_all; - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - iter_all = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_all, "foo"), 6); - delete iter_all; - // TODO(Zhongyi): add test for cases where certain SST are skipped - // Also verify BF related counters like BLOOM_FILTER_USEFUL -} - -// Create a new column family in a running DB, change prefix_extractor -// dynamically, verify the iterator created on the new column family behaves -// as expected -TEST_F(DBTest, DynamicBloomFilterNewColumnFamily) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, options); - ReadOptions read_options; - read_options.prefix_same_as_start = true; - // create a new CF and set prefix_extractor dynamically - options.prefix_extractor.reset(NewCappedPrefixTransform(3)); - CreateColumnFamilies({"ramen_dojo"}, options); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - ASSERT_OK(Put(2, "foo3", "bar3")); - ASSERT_OK(Put(2, "foo4", "bar4")); - ASSERT_OK(Put(2, "foo5", "bar5")); - ASSERT_OK(Put(2, "foq6", "bar6")); - ASSERT_OK(Put(2, "fpq7", "bar7")); - dbfull()->Flush(FlushOptions()); - Iterator* iter = db_->NewIterator(read_options, handles_[2]); - ASSERT_EQ(CountIter(iter, "foo"), 3); - delete iter; - ASSERT_OK( - dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); - ASSERT_EQ(0, - strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), - "rocksdb.FixedPrefix.2")); - iter = db_->NewIterator(read_options, handles_[2]); - ASSERT_EQ(CountIter(iter, "foo"), 4); - delete iter; -} - -// Verify it's possible to change prefix_extractor at runtime and iterators -// behaves as expected -TEST_F(DBTest, DynamicBloomFilterOptions) { - Options options; - options.create_if_missing = true; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - options.disable_auto_compactions = true; - // Enable prefix bloom for SST files - BlockBasedTableOptions table_options; - table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - ASSERT_OK(Put("foo2", "bar2")); - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo1", "bar1")); - ASSERT_OK(Put("fpa", "0")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("foo3", "bar3")); - ASSERT_OK(Put("foo4", "bar4")); - ASSERT_OK(Put("foo5", "bar5")); - ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); - ASSERT_OK(Put("foo6", "bar6")); - ASSERT_OK(Put("foo7", "bar7")); - ASSERT_OK(Put("foo8", "bar8")); - ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); - - ReadOptions read_options; - read_options.prefix_same_as_start = true; - Iterator* iter = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter, "foo"), 12); - delete iter; - Iterator* iter_old = db_->NewIterator(read_options); - ASSERT_EQ(CountIter(iter_old, "foo"), 12); - - ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); - ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), - "rocksdb.CappedPrefix.3")); - iter = db_->NewIterator(read_options); - // "fp*" should be skipped - ASSERT_EQ(CountIter(iter, "foo"), 9); - delete iter; - - // iterator created before should not be affected and see all keys - ASSERT_EQ(CountIter(iter_old, "foo"), 12); - delete iter_old; -} TEST_F(DBTest, DynamicMiscOptions) { // Test max_sequential_skip_in_iterations diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 274ad3ad4..eeecdfa02 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -212,6 +212,10 @@ class SamePrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const override { return dst == prefix_; } + + virtual bool FullLengthEnabled(size_t* /*len*/) const override { + return false; + } }; } // namespace diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 6479a28bc..6914f812b 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -68,6 +68,12 @@ class Comparator { // if it is a wrapped comparator, may return the root one. // return itself it is not wrapped. virtual const Comparator* GetRootComparator() const { return this; } + + // given two keys, determine if t is the successor of s + virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/, + const Slice& /*t*/) const { + return false; + } }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 74d7fef0a..d77136d05 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1015,7 +1015,7 @@ struct ReadOptions { // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is // not a valid entry. If iterator_extractor is not null, the Seek target - // and iterator_upper_bound need to have the same prefix. + // and iterate_upper_bound need to have the same prefix. // This is because ordering is not guaranteed outside of prefix domain. // // Default: nullptr diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 39999cc62..5a461b776 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -60,6 +60,11 @@ class SliceTransform { // This is currently not used and remains here for backward compatibility. virtual bool InRange(const Slice& /*dst*/) const { return false; } + // Some SliceTransform will have a full length which can be used to + // determine if two keys are consecuitive. Can be disabled by always + // returning 0 + virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. // // This function is not used by RocksDB, but for users. If users pass diff --git a/options/options_helper.cc b/options/options_helper.cc index eba08cb85..40f468b43 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -379,6 +379,7 @@ bool ParseStructOptions( } return true; } +} // anonymouse namespace bool ParseSliceTransformHelper( const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, @@ -434,7 +435,6 @@ bool ParseSliceTransform( // SliceTransforms here. return false; } -} // anonymouse namespace bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, const std::string& value) { diff --git a/options/options_helper.h b/options/options_helper.h index ab91109be..2ec22077a 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -130,6 +130,10 @@ Status GetColumnFamilyOptionsFromMapInternal( std::vector* unsupported_options_names = nullptr, bool ignore_unknown_options = false); +bool ParseSliceTransform( + const std::string& value, + std::shared_ptr* slice_transform); + extern Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index f7b6cc4d7..cb327ad5e 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -249,8 +249,8 @@ class PartitionIndexReader : public IndexReader, public Cleanable { index_block_->NewIterator(icomparator_, icomparator_->user_comparator(), nullptr, true, nullptr, index_key_includes_seq_), - false, - /* prefix_extractor */ nullptr, kIsIndex, index_key_includes_seq_); + false, true, /* prefix_extractor */ nullptr, kIsIndex, + index_key_includes_seq_); } // TODO(myabandeh): Update TwoLevelIterator to be able to make use of // on-stack BlockIter while the state is on heap. Currentlly it assumes @@ -824,6 +824,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, ROCKS_LOG_ERROR(rep->ioptions.info_log, "Cannot find Properties block from file."); } +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + ParseSliceTransform(rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE // Read the compression dictionary meta block bool found_compression_dict; @@ -898,6 +904,9 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->ioptions.info_log); } + bool need_upper_bound_check = + PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); + // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); @@ -932,14 +941,12 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Hack: Call NewIndexIterator() to implicitly add index to the // block_cache CachableEntry index_entry; - bool prefix_extractor_changed = false; // check prefix_extractor match only if hash based index is used - if (rep->index_type == BlockBasedTableOptions::kHashSearch) { - prefix_extractor_changed = PrefixExtractorChanged( - rep->table_properties.get(), prefix_extractor); - } + bool disable_prefix_seek = + rep->index_type == BlockBasedTableOptions::kHashSearch && + need_upper_bound_check; unique_ptr iter(new_table->NewIndexIterator( - ReadOptions(), prefix_extractor_changed, nullptr, &index_entry)); + ReadOptions(), disable_prefix_seek, nullptr, &index_entry)); s = iter->status(); if (s.ok()) { // This is the first call to NewIndexIterator() since we're in Open(). @@ -958,9 +965,11 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, } if (s.ok() && prefetch_filter) { // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = new_table->GetFilter(prefix_extractor); + auto filter_entry = + new_table->GetFilter(rep->table_prefix_extractor.get()); if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies(pin_all, prefix_extractor); + filter_entry.value->CacheDependencies( + pin_all, rep->table_prefix_extractor.get()); } // if pin_filter is true then save it in rep_->filter_entry; it will be // released in the destructor only, hence it will be pinned in the @@ -990,14 +999,14 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // Set filter block if (rep->filter_policy) { const bool is_a_filter_partition = true; - auto filter = - new_table->ReadFilter(prefetch_buffer.get(), rep->filter_handle, - !is_a_filter_partition, prefix_extractor); + auto filter = new_table->ReadFilter( + prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition, + rep->table_prefix_extractor.get()); rep->filter.reset(filter); // Refer to the comment above about paritioned indexes always being // cached if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { - filter->CacheDependencies(pin_all, prefix_extractor); + filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); } } } else { @@ -1784,20 +1793,28 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( // Otherwise, this method guarantees no I/O will be incurred. // // REQUIRES: this method shouldn't be called while the DB lock is held. -bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, - const SliceTransform* prefix_extractor) { +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check) { if (!rep_->filter_policy) { return true; } - assert(prefix_extractor != nullptr); + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } auto user_key = ExtractUserKey(internal_key); if (!prefix_extractor->InDomain(user_key)) { return true; } - assert(rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0); - auto prefix = prefix_extractor->Transform(user_key); bool may_match = true; Status s; @@ -1805,12 +1822,23 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, // First, try check with full filter auto filter_entry = GetFilter(prefix_extractor); FilterBlockReader* filter = filter_entry.value; + bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; - may_match = filter->PrefixMayMatch(prefix, prefix_extractor, kNotValid, - false, const_ikey_ptr); + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check); } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return true; + } + auto prefix = prefix_extractor->Transform(user_key); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1823,9 +1851,9 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, // Then, try find it within each block // we already know prefix_extractor and prefix_extractor_name must match // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - bool prefix_extractor_changed = false; unique_ptr iiter( - NewIndexIterator(no_io_read_options, prefix_extractor_changed)); + NewIndexIterator(no_io_read_options, + /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -1866,10 +1894,12 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, } } - Statistics* statistics = rep_->ioptions.statistics; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } } // if rep_->filter_entry is not set, we should call Release(); otherwise @@ -1878,12 +1908,11 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key, if (!rep_->filter_entry.IsSet()) { filter_entry.Release(rep_->table_options.block_cache.get()); } - return may_match; } void BlockBasedTableIterator::Seek(const Slice& target) { - if (!CheckPrefixMayMatch(target, prefix_extractor_)) { + if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; } @@ -1911,7 +1940,7 @@ void BlockBasedTableIterator::Seek(const Slice& target) { } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { - if (!CheckPrefixMayMatch(target, prefix_extractor_)) { + if (!CheckPrefixMayMatch(target)) { ResetDataIter(); return; } @@ -2101,7 +2130,7 @@ void BlockBasedTableIterator::FindKeyBackward() { InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, bool for_compaction) { - bool prefix_extractor_changed = + bool need_upper_bound_check = PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); const bool kIsNotIndex = false; if (arena == nullptr) { @@ -2109,21 +2138,21 @@ InternalIterator* BlockBasedTable::NewIterator( this, read_options, rep_->internal_comparator, NewIndexIterator( read_options, - prefix_extractor_changed && + need_upper_bound_check && rep_->index_type == BlockBasedTableOptions::kHashSearch), !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, - for_compaction); + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, for_compaction); } else { auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator)); return new (mem) BlockBasedTableIterator( this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, prefix_extractor_changed), + NewIndexIterator(read_options, need_upper_bound_check), !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr && !prefix_extractor_changed, - prefix_extractor, kIsNotIndex, true /*key_includes_seq*/, - for_compaction); + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, for_compaction); } } @@ -2210,14 +2239,14 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, BlockIter iiter_on_stack; // if prefix_extractor found in block differs from options, disable // BlockPrefixIndex. Only do this check when index_type is kHashSearch. - bool prefix_extractor_changed = false; + bool need_upper_bound_check = false; if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - prefix_extractor_changed = PrefixExtractorChanged( + need_upper_bound_check = PrefixExtractorChanged( rep_->table_properties.get(), prefix_extractor); } - auto iiter = NewIndexIterator(read_options, prefix_extractor_changed, - &iiter_on_stack, /* index_entry */ nullptr, - get_context); + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + /* index_entry */ nullptr, get_context); std::unique_ptr iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); @@ -2479,7 +2508,7 @@ Status BlockBasedTable::CreateIndexReader( // kHashSearch requires non-empty prefix_extractor but bypass checking // prefix_extractor here since we have no access to MutableCFOptions. - // Add prefix_extractor_changed flag in BlockBasedTable::NewIndexIterator. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. // If prefix_extractor does not match prefix_extractor_name from table // properties, turn off Hash Index by setting total_order_seek to true diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 53ba5e197..821e672a6 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -95,7 +95,9 @@ class BlockBasedTable : public TableReader { bool skip_filters = false, int level = -1); bool PrefixMayMatch(const Slice& internal_key, - const SliceTransform* prefix_extractor = nullptr); + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must @@ -277,7 +279,7 @@ class BlockBasedTable : public TableReader { // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier InternalIterator* NewIndexIterator( - const ReadOptions& read_options, bool prefix_extractor_changed = false, + const ReadOptions& read_options, bool need_upper_bound_check = false, BlockIter* input_iter = nullptr, CachableEntry* index_entry = nullptr, GetContext* get_context = nullptr); @@ -481,6 +483,7 @@ struct BlockBasedTable::Rep { // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. unique_ptr internal_prefix_transform; + std::shared_ptr table_prefix_extractor; // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is // true or in all levels when pin_top_level_index_and_filter is set in @@ -515,6 +518,7 @@ class BlockBasedTableIterator : public InternalIterator { const ReadOptions& read_options, const InternalKeyComparator& icomp, InternalIterator* index_iter, bool check_filter, + bool need_upper_bound_check, const SliceTransform* prefix_extractor, bool is_index, bool key_includes_seq = true, bool for_compaction = false) @@ -525,10 +529,11 @@ class BlockBasedTableIterator : public InternalIterator { pinned_iters_mgr_(nullptr), block_iter_points_to_real_block_(false), check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), is_index_(is_index), key_includes_seq_(key_includes_seq), - for_compaction_(for_compaction), - prefix_extractor_(prefix_extractor) {} + for_compaction_(for_compaction) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -575,9 +580,10 @@ class BlockBasedTableIterator : public InternalIterator { block_iter_points_to_real_block_; } - bool CheckPrefixMayMatch(const Slice& ikey, - const SliceTransform* prefix_extractor = nullptr) { - if (check_filter_ && !table_->PrefixMayMatch(ikey, prefix_extractor)) { + bool CheckPrefixMayMatch(const Slice& ikey) { + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -621,6 +627,9 @@ class BlockBasedTableIterator : public InternalIterator { bool block_iter_points_to_real_block_; bool is_out_of_bound_ = false; bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + const SliceTransform* prefix_extractor_; // If the blocks over which we iterate are index blocks bool is_index_; // If the keys in the blocks over which we iterate include 8 byte sequence @@ -629,7 +638,6 @@ class BlockBasedTableIterator : public InternalIterator { bool for_compaction_; // TODO use block offset instead std::string prev_index_value_; - const SliceTransform* prefix_extractor_; static const size_t kInitReadaheadSize = 8 * 1024; // Found that 256 KB readahead size provides the best performance, based on diff --git a/table/filter_block.h b/table/filter_block.h index 8f3666e1b..a93049547 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -123,6 +123,17 @@ class FilterBlockReader { virtual void CacheDependencies(bool /*pin*/, const SliceTransform* /*prefix_extractor*/) {} + virtual bool RangeMayExist( + const Slice* /*iterate_upper_bound*/, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, + bool* filter_checked, bool /*need_upper_bound_check*/) { + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } + protected: bool whole_key_filtering_; diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 150caa6d1..fec42c407 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -98,6 +98,10 @@ FullFilterBlockReader::FullFilterBlockReader( contents_(contents) { assert(filter_bits_reader != nullptr); filter_bits_reader_.reset(filter_bits_reader); + if (prefix_extractor_ != nullptr) { + full_length_enabled_ = + prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + } } FullFilterBlockReader::FullFilterBlockReader( @@ -153,4 +157,57 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { size_t FullFilterBlockReader::ApproximateMemoryUsage() const { return contents_.size(); } + +bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, + const Slice& user_key, const SliceTransform* prefix_extractor, + const Comparator* comparator, const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } +} + +bool FullFilterBlockReader::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) { + // Try to reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + if (iterate_upper_bound != nullptr && prefix_extractor_) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = + prefix_extractor_->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (!comparator->Equal(prefix, upper_bound_xform)) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + } // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h index efa93da12..aed4cfd7d 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -108,18 +108,27 @@ class FullFilterBlockReader : public FilterBlockReader { uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual size_t ApproximateMemoryUsage() const override; - + virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check) override; private: const SliceTransform* prefix_extractor_; Slice contents_; std::unique_ptr filter_bits_reader_; BlockContents block_contents_; std::unique_ptr filter_data_; + bool full_length_enabled_; + size_t prefix_extractor_full_length_; // No copying allowed FullFilterBlockReader(const FullFilterBlockReader&); bool MayMatch(const Slice& entry); void operator=(const FullFilterBlockReader&); + bool IsFilterCompatible(const Slice* iterate_upper_bound, + const Slice& prefix, const Comparator* comparator); + }; } // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index bada1a2c4..2054d036d 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -692,6 +692,9 @@ class FixedOrLessPrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const override { return (dst.size() <= prefix_len_); } + virtual bool FullLengthEnabled(size_t* /*len*/) const override { + return false; + } }; class HarnessTest : public testing::Test { diff --git a/util/comparator.cc b/util/comparator.cc index 007f449a8..2c427a7ac 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -98,6 +98,32 @@ class BytewiseComparatorImpl : public Comparator { } // *key is a run of 0xffs. Leave it alone. } + + virtual bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast(s[diff_ind]); + uint8_t byte_t = static_cast(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast(s[i]); + byte_t = static_cast(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } }; class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { diff --git a/util/slice.cc b/util/slice.cc index d344fbacf..ed8c48169 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -47,6 +47,11 @@ class FixedPrefixTransform : public SliceTransform { return (dst.size() == prefix_len_); } + virtual bool FullLengthEnabled(size_t* len) const override { + *len = prefix_len_; + return true; + } + virtual bool SameResultWhenAppended(const Slice& prefix) const override { return InDomain(prefix); } @@ -80,6 +85,11 @@ class CappedPrefixTransform : public SliceTransform { return (dst.size() <= cap_len_); } + virtual bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + virtual bool SameResultWhenAppended(const Slice& prefix) const override { return prefix.size() >= cap_len_; }