From 2a986919d600eba8aa0d827d5c78443f33c28310 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Fri, 5 Jul 2013 18:49:18 -0700 Subject: [PATCH] Make rocksdb-deletes faster using bloom filter Summary: Wrote a new function in db_impl.c-CheckKeyMayExist that calls Get but with a new parameter turned on which makes Get return false only if bloom filters can guarantee that key is not in database. Delete calls this function and if the option- deletes_use_filter is turned on and CheckKeyMayExist returns false, the delete will be dropped saving: 1. Put of delete type 2. Space in the db,and 3. Compaction time Test Plan: make all check; will run db_stress and db_bench and enhance unit-test once the basic design gets approved Reviewers: dhruba, haobo, vamsi Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11607 --- db/db_bench.cc | 7 +++++++ db/db_impl.cc | 26 ++++++++++++++++++++--- db/db_impl.h | 10 +++++++++ db/db_test.cc | 40 +++++++++++++++++++++++++++++++++++- db/memtable.cc | 6 +++++- db/memtable.h | 5 +++-- db/memtablelist.cc | 4 ++-- db/memtablelist.h | 2 +- db/table_cache.cc | 6 ++++-- db/table_cache.h | 4 +++- db/version_set.cc | 19 +++++++++++++++-- db/version_set.h | 3 ++- include/leveldb/db.h | 5 +++++ include/leveldb/options.h | 9 ++++++++ include/leveldb/statistics.h | 7 +++++-- table/table.cc | 9 +++++++- table/table.h | 4 +++- tools/db_stress.cc | 11 +++++++++- util/options.cc | 5 ++++- utilities/ttl/db_ttl.cc | 4 ++++ utilities/ttl/db_ttl.h | 2 ++ 21 files changed, 166 insertions(+), 22 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index bfba23b4c..253a64bcf 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -327,6 +327,9 @@ static auto FLAGS_use_adaptive_mutex = static auto FLAGS_bytes_per_sync = leveldb::Options().bytes_per_sync; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + namespace leveldb { // Helper for quickly generating random data. @@ -1111,6 +1114,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; if (FLAGS_max_bytes_for_level_multiplier_additional.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional.size() != (unsigned int)FLAGS_num_levels) { @@ -2216,6 +2220,9 @@ int main(int argc, char** argv) { FLAGS_keys_per_multiget = n; } else if (sscanf(argv[i], "--bytes_per_sync=%ld%c", &l, &junk) == 1) { FLAGS_bytes_per_sync = l; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n ==1 )) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 7fda3ef7e..1bdd61d39 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1998,6 +1998,16 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { + return GetImpl(options, key, value); +} + +// If no_IO is true, then returns Status::NotFound if key is not in memtable, +// immutable-memtable and bloom-filters can guarantee that key is not in db, +// "value" is garbage string if no_IO is true +Status DBImpl::GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO) { Status s; StopWatch sw(env_, options_.statistics, DB_GET); @@ -2026,12 +2036,12 @@ Status DBImpl::Get(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // value will contain the current merge operand in the latter case. LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, options_)) { + if (mem->Get(lkey, value, &s, options_, no_IO)) { // Done - } else if (imm.Get(lkey, value, &s, options_)) { + } else if (imm.Get(lkey, value, &s, options_, no_IO)) { // Done } else { - current->Get(options, lkey, value, &s, &stats, options_); + current->Get(options, lkey, value, &s, &stats, options_, no_IO); have_stat_update = true; } mutex_.Lock(); @@ -2121,6 +2131,12 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, return statList; } +bool DBImpl::KeyMayExist(const Slice& key) { + std::string value; + const Status s = GetImpl(ReadOptions(), key, &value, true); + return !s.IsNotFound(); +} + Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); @@ -2156,6 +2172,10 @@ Status DBImpl::Merge(const WriteOptions& o, const Slice& key, } Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { + if (options_.deletes_check_filter_first && !KeyMayExist(key)) { + RecordTick(options_.statistics, NUMBER_FILTERED_DELETES); + return Status::OK(); + } return DB::Delete(options, key); } diff --git a/db/db_impl.h b/db/db_impl.h index fb6879020..5f09035f2 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -48,6 +48,10 @@ class DBImpl : public DB { virtual std::vector MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values); + + // Returns false if key can't exist- based on memtable, immutable-memtable and + // bloom-filters; true otherwise. No IO is performed + virtual bool KeyMayExist(const Slice& key); virtual Iterator* NewIterator(const ReadOptions&); virtual const Snapshot* GetSnapshot(); virtual void ReleaseSnapshot(const Snapshot* snapshot); @@ -379,6 +383,12 @@ class DBImpl : public DB { SequenceNumber in, std::vector& snapshots, SequenceNumber* prev_snapshot); + + // Function that Get and KeyMayExist call with no_IO true or false + Status GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO = false); }; // Sanitize db options. The caller should delete result.info_log if diff --git a/db/db_test.cc b/db/db_test.cc index 7ce4ac419..a88b78e3c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -218,6 +218,7 @@ class DBTest { kManifestFileSize, kCompactOnFlush, kPerfOptions, + kDeletesFilterFirst, kEnd }; int option_config_; @@ -289,6 +290,9 @@ class DBTest { options.rate_limit_delay_milliseconds = 2; // TODO -- test more options break; + case kDeletesFilterFirst: + options.deletes_check_filter_first = true; + break; default: break; } @@ -768,6 +772,37 @@ TEST(DBTest, GetEncountersEmptyLevel) { } while (ChangeOptions()); } +// KeyMayExist-API returns false if memtable(s) and in-memory bloom-filters can +// guarantee that the key doesn't exist in the db, else true. This can lead to +// a few false positives, but not false negatives. To make test deterministic, +// use a much larger number of bits per key-20 than bits in the key, so +// that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + Reopen(&options); + + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + ASSERT_TRUE(db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + ASSERT_TRUE(db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "a")); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "c")); + ASSERT_TRUE(!db_->KeyMayExist("c")); + } while (ChangeOptions()); +} + TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1403,7 +1438,7 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter(int argv) : argv_(argv) {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, @@ -2970,6 +3005,9 @@ class ModelDB: public DB { Status::NotSupported("Not implemented.")); return s; } + virtual bool KeyMayExist(const Slice& key) { + return true; // Not Supported directly + } virtual Iterator* NewIterator(const ReadOptions& options) { if (options.snapshot == nullptr) { KVMap* saved = new KVMap; diff --git a/db/memtable.cc b/db/memtable.cc index c6f8f26c2..cfd2bed04 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -119,7 +119,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options) { + const Options& options, const bool check_presence_only) { Slice memkey = key.memtable_key(); Table::Iterator iter(&table_); iter.Seek(memkey.data()); @@ -164,6 +164,10 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, return true; } case kTypeMerge: { + if (check_presence_only) { + *s = Status::OK(); + return true; + } Slice v = GetLengthPrefixedSlice(key_ptr + key_length); if (merge_in_progress) { merge_operator->Merge(key.user_key(), &v, operand, diff --git a/db/memtable.h b/db/memtable.h index 6b3c68bf6..def3a5d3d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -63,12 +63,13 @@ class MemTable { // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. // If memtable contains Merge operation as the most recent entry for a key, - // and the merge process does not stop (not reaching a value or delete), + // and if check_presence_only is set, return true with Status::OK, + // else if the merge process does not stop (not reaching a value or delete), // store the current merged result in value and MergeInProgress in s. // return false // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } diff --git a/db/memtablelist.cc b/db/memtablelist.cc index eb427eb16..ac89d1043 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -194,10 +194,10 @@ size_t MemTableList::ApproximateMemoryUsage() { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options ) { + const Options& options, const bool check_presence_only) { for (list::iterator it = memlist_.begin(); it != memlist_.end(); ++it) { - if ((*it)->Get(key, value, s, options)) { + if ((*it)->Get(key, value, s, options, check_presence_only)) { return true; } } diff --git a/db/memtablelist.h b/db/memtablelist.h index 31831deac..40419e56f 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -71,7 +71,7 @@ class MemTableList { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the list of underlying memtables. void GetMemTables(std::vector* list); diff --git a/db/table_cache.cc b/db/table_cache.cc index 700db74db..4cc105afe 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -112,14 +112,16 @@ Status TableCache::Get(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* tableIO) { + bool* tableIO, + void (*mark_key_may_exist)(void*), + const bool no_IO) { Cache::Handle* handle = nullptr; Status s = FindTable(storage_options_, file_number, file_size, &handle, tableIO); if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver); + s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_IO); cache_->Release(handle); } return s; diff --git a/db/table_cache.h b/db/table_cache.h index c3996a3cc..737e53c9e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -48,7 +48,9 @@ class TableCache { const Slice& k, void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* tableIO); + bool* tableIO, + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); // Evict any entry for the specified file number void Evict(uint64_t file_number); diff --git a/db/version_set.cc b/db/version_set.cc index 686e82cba..19dc022f4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -244,6 +244,16 @@ struct Saver { bool didIO; // did we do any disk io? }; } + +// Called from TableCache::Get when bloom-filters can't guarantee that key does +// not exist and Get is not permitted to do IO to read the data-block and be +// certain. +// Set the key as Found and let the caller know that key-may-exist +static void MarkKeyMayExist(void* arg) { + Saver* s = reinterpret_cast(arg); + s->state = kFound; +} + static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ Saver* s = reinterpret_cast(arg); ParsedInternalKey parsed_key; @@ -328,7 +338,8 @@ void Version::Get(const ReadOptions& options, std::string* value, Status *status, GetStats* stats, - const Options& db_options) { + const Options& db_options, + const bool no_IO) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); const Comparator* ucmp = vset_->icmp_.user_comparator(); @@ -337,6 +348,9 @@ void Version::Get(const ReadOptions& options, auto logger = db_options.info_log; assert(status->ok() || status->IsMergeInProgress()); + if (no_IO) { + assert(status->ok()); + } Saver saver; saver.state = status->ok()? kNotFound : kMerge; saver.ucmp = ucmp; @@ -404,7 +418,8 @@ void Version::Get(const ReadOptions& options, FileMetaData* f = files[i]; bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, - ikey, &saver, SaveValue, &tableIO); + ikey, &saver, SaveValue, &tableIO, + MarkKeyMayExist, no_IO); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; diff --git a/db/version_set.h b/db/version_set.h index ba924126f..2369bcc1e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -74,7 +74,8 @@ class Version { int seek_file_level; }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, GetStats* stats, const Options& db_option); + Status* status, GetStats* stats, const Options& db_option, + const bool no_IO = false); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 6d816edce..056920d9e 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -120,6 +120,11 @@ class DB { const std::vector& keys, std::vector* values) = 0; + // If the key definitely does not exist in the database, then this method + // returns false. Otherwise return true. This check is potentially + // lighter-weight than invoking DB::Get(). No IO is performed + virtual bool KeyMayExist(const Slice& key) = 0; + // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/include/leveldb/options.h b/include/leveldb/options.h index e0c21cb7e..3341b72a2 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -465,6 +465,15 @@ struct Options { // Default: 0 uint64_t bytes_per_sync; + // Use bloom-filter for deletes when this is true. + // db->Delete first calls KeyMayExist which checks memtable,immutable-memtable + // and bloom-filters to determine if the key does not exist in the database. + // If the key definitely does not exist, then the delete is a noop.KeyMayExist + // only incurs in-memory look up. This optimization avoids writing the delete + // to storage when appropriate. + // Default: false + bool deletes_check_filter_first; + }; // Options that control read operations diff --git a/include/leveldb/statistics.h b/include/leveldb/statistics.h index 155974659..4ce4f6d1b 100644 --- a/include/leveldb/statistics.h +++ b/include/leveldb/statistics.h @@ -58,7 +58,9 @@ enum Tickers { NUMBER_MULTIGET_KEYS_READ = 19, NUMBER_MULTIGET_BYTES_READ = 20, - TICKER_ENUM_MAX = 21 + NUMBER_FILTERED_DELETES = 21, + + TICKER_ENUM_MAX = 22 }; const std::vector> TickersNameMap = { @@ -82,7 +84,8 @@ const std::vector> TickersNameMap = { { NO_ITERATORS, "rocksdb.num.iterators" }, { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" } + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" } }; /** diff --git a/table/table.cc b/table/table.cc index 5aceebe0f..f7b664a4f 100644 --- a/table/table.cc +++ b/table/table.cc @@ -322,7 +322,9 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, - bool)) { + bool), + void (*mark_key_may_exist)(void*), + const bool no_IO) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -338,6 +340,11 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, // cross one data block, we should be fine. RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL); break; + } else if (no_IO) { + // Update Saver.state to Found because we are only looking for whether + // bloom-filter can guarantee the key is not there when "no_IO" + (*mark_key_may_exist)(arg); + done = true; } else { bool didIO = false; Iterator* block_iter = BlockReader(this, options, iiter->value(), diff --git a/table/table.h b/table/table.h index a657290b5..4674e262b 100644 --- a/table/table.h +++ b/table/table.h @@ -86,7 +86,9 @@ class Table { Status InternalGet( const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool)); + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); void ReadMeta(const Footer& footer); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 26a0e6775..8d27c1a68 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -180,6 +180,9 @@ static uint32_t FLAGS_log2_keys_per_lock = 2; // implies 2^2 keys per lock // Percentage of times we want to purge redundant keys in memory before flushing static uint32_t FLAGS_purge_redundant_percent = 50; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + // Level0 compaction start trigger static int FLAGS_level0_file_num_compaction_trigger = 0; @@ -898,8 +901,10 @@ class StressTest { fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen); fprintf(stdout, "Batches/snapshots : %d\n", FLAGS_test_batches_snapshots); - fprintf(stdout, "Purge redundant %% : %d\n", + fprintf(stdout, "Purge redundant %% : %d\n", FLAGS_purge_redundant_percent); + fprintf(stdout, "Deletes use filter : %d\n", + FLAGS_deletes_check_filter_first); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -955,6 +960,7 @@ class StressTest { options.delete_obsolete_files_period_micros = FLAGS_delete_obsolete_files_period_micros; options.max_manifest_file_size = 1024; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; static Random purge_percent(1000); // no benefit from non-determinism here if (purge_percent.Uniform(100) < FLAGS_purge_redundant_percent - 1) { options.purge_redundant_kvs_while_flush = false; @@ -1154,6 +1160,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--purge_redundant_percent=%d%c", &n, &junk) == 1 && (n >= 0 && n <= 100)) { FLAGS_purge_redundant_percent = n; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n == 1)) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/util/options.cc b/util/options.cc index 09121a0e5..1ac25845a 100644 --- a/util/options.cc +++ b/util/options.cc @@ -74,7 +74,8 @@ Options::Options() advise_random_on_open(true), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), - bytes_per_sync(0) { + bytes_per_sync(0), + deletes_check_filter_first(false) { } static const char* const access_hints[] = { @@ -208,6 +209,8 @@ Options::Dump(Logger* log) const use_adaptive_mutex); Log(log," Options.bytes_per_sync: %ld", bytes_per_sync); + Log(log," Options.deletes_check_filter_first: %d", + deletes_check_filter_first); } // Options::Dump // diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index d201208d3..eff675340 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -158,6 +158,10 @@ std::vector DBWithTTL::MultiGet(const ReadOptions& options, supported with TTL")); } +bool DBWithTTL::KeyMayExist(const Slice& key) { + return db_->KeyMayExist(key); +} + Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) { return db_->Delete(wopts, key); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index 2e01c1e3d..d24efbe48 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -33,6 +33,8 @@ class DBWithTTL : public DB, CompactionFilter { const std::vector& keys, std::vector* values); + virtual bool KeyMayExist(const Slice& key); + virtual Status Delete(const WriteOptions& wopts, const Slice& key); virtual Status Merge(const WriteOptions& options,