expose a hook to skip tables during iteration
Summary: As discussed on the mailing list (["Skipping entire SSTs while iterating"](https://groups.google.com/forum/#!topic/rocksdb/ujHCJVLrHlU)), this patch adds a `table_filter` to `ReadOptions` that allows specifying a callback to be executed during iteration before each table in the database is scanned. The callback is passed the table's properties; the table is scanned iff the callback returns true. This can be used in conjunction with a `TablePropertiesCollector` to dramatically speed up scans by skipping tables that are known to contain irrelevant data for the scan at hand. We're using this [downstream in CockroachDB](https://github.com/cockroachdb/cockroach/blob/master/pkg/storage/engine/db.cc#L2009-L2022) already. With this feature, under ideal conditions, we can reduce the time of an incremental backup from hours to seconds. FYI, the first commit in this PR fixes a segfault that I unfortunately have not figured out how to reproduce outside of CockroachDB. I'm hoping you accept it on the grounds that it is not correct to return 8-byte aligned memory from a call to `malloc` on some 64-bit platforms; one correct approach is to infer the necessary alignment from `std::max_align_t`, as done here. As noted in the first commit message, the bug is tickled by having a `std::function` in `struct ReadOptions`. That is, the following patch alone is enough to cause RocksDB to segfault when run from CockroachDB on Darwin. ```diff --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1546,6 +1546,13 @@ struct ReadOptions { // Default: false bool ignore_range_deletions; + // A callback to determine whether relevant keys for this scan exist in a + // given table based on the table's properties. The callback is passed the + // properties of each table during iteration. If the callback returns false, + // the table will not be scanned. 
+ // Default: empty (every table will be scanned) + std::function<bool(const TableProperties&)> table_filter; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; ``` /cc danhhz Closes https://github.com/facebook/rocksdb/pull/2265 Differential Revision: D5054262 Pulled By: yiwu-arbug fbshipit-source-id: dd6b28f2bba6cb8466250d8c5c542d3c92785476
This commit is contained in:
parent
eaaef91178
commit
7891af8b53
@ -2010,6 +2010,71 @@ TEST_F(DBIteratorTest, CreationFailure) {
|
||||
delete iter;
|
||||
}
|
||||
|
||||
// Exercises ReadOptions::table_filter through a full forward scan:
//   1. the callback is handed the properties of every table that was flushed;
//   2. keys from a table for which the callback returns false are not
//      returned by the iterator.
TEST_F(DBIteratorTest, TableFilter) {
  // Flush after 1, 2 and 3 puts so that each resulting table can be told
  // apart by its num_entries property. The flush status is checked because a
  // failed flush would leave a different table layout and make the
  // assertions below fail in a misleading way.
  ASSERT_OK(Put("a", "1"));
  ASSERT_OK(dbfull()->Flush(FlushOptions()));
  ASSERT_OK(Put("b", "2"));
  ASSERT_OK(Put("c", "3"));
  ASSERT_OK(dbfull()->Flush(FlushOptions()));
  ASSERT_OK(Put("d", "4"));
  ASSERT_OK(Put("e", "5"));
  ASSERT_OK(Put("f", "6"));
  ASSERT_OK(dbfull()->Flush(FlushOptions()));

  // Ensure the table_filter callback is called once for each table.
  {
    // Entry counts of the tables the callback has not been shown yet.
    std::set<uint64_t> unseen{1, 2, 3};
    ReadOptions opts;
    opts.table_filter = [&](const TableProperties& props) {
      auto it = unseen.find(props.num_entries);
      if (it == unseen.end()) {
        // Either an unknown table or a table reported a second time.
        ADD_FAILURE() << "saw table properties with an unexpected "
                      << props.num_entries << " entries";
      } else {
        unseen.erase(it);
      }
      // Never filter anything out in this pass; all keys must be visible.
      return true;
    };
    auto iter = db_->NewIterator(opts);
    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->1");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "b->2");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "c->3");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "d->4");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "e->5");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "f->6");
    iter->Next();
    ASSERT_FALSE(iter->Valid());
    // Every table must have been offered to the callback by now.
    ASSERT_TRUE(unseen.empty());
    delete iter;
  }

  // Ensure returning false in the table_filter hides the keys from that table
  // during iteration.
  {
    ReadOptions opts;
    // Reject only the two-entry table, i.e. the one holding "b" and "c".
    opts.table_filter = [](const TableProperties& props) {
      return props.num_entries != 2;
    };
    auto iter = db_->NewIterator(opts);
    iter->SeekToFirst();
    ASSERT_EQ(IterStatus(iter), "a->1");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "d->4");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "e->5");
    iter->Next();
    ASSERT_EQ(IterStatus(iter), "f->6");
    iter->Next();
    ASSERT_FALSE(iter->Valid());
    delete iter;
  }
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
@ -225,7 +225,12 @@ InternalIterator* TableCache::NewIterator(
|
||||
}
|
||||
InternalIterator* result = nullptr;
|
||||
if (s.ok()) {
|
||||
if (options.table_filter &&
|
||||
!options.table_filter(*table_reader->GetTableProperties())) {
|
||||
result = NewEmptyInternalIterator(arena);
|
||||
} else {
|
||||
result = table_reader->NewIterator(options, arena, skip_filters);
|
||||
}
|
||||
if (create_new_table_reader) {
|
||||
assert(handle == nullptr);
|
||||
result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr);
|
||||
|
@ -1063,6 +1063,14 @@ struct ReadOptions {
|
||||
// Default: false
|
||||
bool ignore_range_deletions;
|
||||
|
||||
// A callback to determine whether relevant keys for this scan exist in a
|
||||
// given table based on the table's properties. The callback is passed the
|
||||
// properties of each table during iteration. If the callback returns false,
|
||||
// the table will not be scanned. This option only affects Iterators and has
|
||||
// no impact on point lookups.
|
||||
// Default: empty (every table will be scanned)
|
||||
std::function<bool(const TableProperties&)> table_filter;
|
||||
|
||||
ReadOptions();
|
||||
ReadOptions(bool cksum, bool cache);
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user