From 7891af8b53be9a1a2181b46767561fad8e862a2e Mon Sep 17 00:00:00 2001 From: Nikhil Benesch Date: Tue, 17 Oct 2017 22:09:01 -0700 Subject: [PATCH] expose a hook to skip tables during iteration Summary: As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true. This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand. We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup in from hours to seconds. FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a`std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin. ```diff --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1546,6 +1546,13 @@ struct ReadOptions { // Default: false bool ignore_range_deletions; + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. + // Default: empty (every table will be scanned) + std::function table_filter; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; ``` /cc danhhz Closes https://github.com/facebook/rocksdb/pull/2265 Differential Revision: D5054262 Pulled By: yiwu-arbug fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476 --- db/db_iterator_test.cc | 65 +++++++++++++++++++++++++++++++++++++++ db/table_cache.cc | 7 ++++- include/rocksdb/options.h | 8 +++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index e7a3534e3..61a38ec2b 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -2010,6 +2010,71 @@ TEST_F(DBIteratorTest, CreationFailure) { delete iter; } +TEST_F(DBIteratorTest, TableFilter) { + ASSERT_OK(Put("a", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Put("c", "3")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("d", "4")); + ASSERT_OK(Put("e", "5")); + ASSERT_OK(Put("f", "6")); + dbfull()->Flush(FlushOptions()); + + // Ensure the table_filter callback is called once for each table. + { + std::set unseen {1, 2, 3}; + ReadOptions opts; + opts.table_filter = [&](const TableProperties& props) { + auto it = unseen.find(props.num_entries); + if (it == unseen.end()) { + ADD_FAILURE() << "saw table properties with an unexpected " << props.num_entries << " entries"; + } else { + unseen.erase(it); + } + return true; + }; + auto iter = db_->NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(unseen.empty()); + delete iter; + } + + // Ensure returning false in the table_filter hides the keys from that table + // during iteration. + { + ReadOptions opts; + opts.table_filter = [](const TableProperties& props) { + return props.num_entries != 2; + }; + auto iter = db_->NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + delete iter; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/table_cache.cc b/db/table_cache.cc index b4d5cc1bb..25c3befa4 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -225,7 +225,12 @@ InternalIterator* TableCache::NewIterator( } InternalIterator* result = nullptr; if (s.ok()) { - result = table_reader->NewIterator(options, arena, skip_filters); + if (options.table_filter && + !options.table_filter(*table_reader->GetTableProperties())) { + result = NewEmptyInternalIterator(arena); + } else { + result = table_reader->NewIterator(options, arena, skip_filters); + } if (create_new_table_reader) { assert(handle == nullptr); result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr); diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 6c4e906ab..3ec83a65c 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1063,6 +1063,14 @@ struct ReadOptions { // Default: false bool ignore_range_deletions; + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. This option only affects Iterators and has + // no impact on point lookups. + // Default: empty (every table will be scanned) + std::function table_filter; + ReadOptions(); ReadOptions(bool cksum, bool cache); };