diff --git a/HISTORY.md b/HISTORY.md
index 66c5100ed..68cac534b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -6,6 +6,7 @@
 * Provided support for tracking per-sst user-defined timestamp information in MANIFEST.
 * Added new option "adaptive_readahead" in ReadOptions. For iterators, RocksDB does auto-readahead on noticing sequential reads and by enabling this option, readahead_size of current file (if reads are sequential) will be carried forward to next file instead of starting from the scratch at each level (except L0 level files). If reads are not sequential it will fall back to 8KB. This option is applicable only for RocksDB internal prefetch buffer and isn't supported with underlying file system prefetching.
 * Added the read count and read bytes related stats to Statistics for tiered storage hot, warm, and cold file reads.
+* Added an option to dynamically charge an updating estimated memory usage of block-based table building to the block cache, if a block cache is available. Currently this only covers charging the memory usage of constructing (new) Bloom Filters and Ribbon Filters to the block cache. To enable this feature, set `BlockBasedTableOptions::reserve_table_builder_memory = true`.
 ### Bug Fixes
 * Prevent a `CompactRange()` with `CompactRangeOptions::change_level == true` from possibly causing corruption to the LSM state (overlapping files within a level) when run in parallel with another manual compaction. Note that setting `force_consistency_checks == true` (the default) would cause the DB to enter read-only mode in this scenario and return `Status::Corruption`, rather than committing any corruption.
diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc
index 4416b7c2f..237fcfcd6 100644
--- a/cache/cache_entry_roles.cc
+++ b/cache/cache_entry_roles.cc
@@ -20,6 +20,7 @@ std::array<const char*, kNumCacheEntryRoles> kCacheEntryRoleToCamelString{{
     "OtherBlock",
     "WriteBuffer",
     "CompressionDictionaryBuildingBuffer",
+    "FilterConstruction",
     "Misc",
 }};
@@ -32,6 +33,7 @@ std::array<const char*, kNumCacheEntryRoles> kCacheEntryRoleToHyphenString{{
     "other-block",
     "write-buffer",
     "compression-dictionary-building-buffer",
+    "filter-construction",
     "misc",
 }};
diff --git a/cache/cache_entry_roles.h b/cache/cache_entry_roles.h
index 65f1c464b..d0e3da2e3 100644
--- a/cache/cache_entry_roles.h
+++ b/cache/cache_entry_roles.h
@@ -36,6 +36,9 @@ enum class CacheEntryRole {
   // BlockBasedTableBuilder reservations to account for
   // compression dictionary building buffer's memory usage
   kCompressionDictionaryBuildingBuffer,
+  // Filter reservations to account for
+  // (new) Bloom and Ribbon filter construction's memory usage
+  kFilterConstruction,
   // Default bucket, for miscellaneous cache entries. Do not use for
   // entries that could potentially add up to large usage.
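  // Note: kCacheEntryRoleToCamelString and kCacheEntryRoleToHyphenString in
  // cache_entry_roles.cc are parallel arrays indexed by this enum, so
  // "FilterConstruction" / "filter-construction" above are inserted at the
  // same relative position as kFilterConstruction here.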
   kMisc,
diff --git a/cache/cache_reservation_manager.cc b/cache/cache_reservation_manager.cc
index dd69da785..b35726711 100644
--- a/cache/cache_reservation_manager.cc
+++ b/cache/cache_reservation_manager.cc
@@ -95,6 +95,11 @@ template Status CacheReservationManager::MakeCacheReservation<CacheEntryRole::kMisc>(
     std::size_t incremental_memory_used,
     std::unique_ptr<CacheReservationHandle<CacheEntryRole::kMisc>>* handle);
+template Status CacheReservationManager::MakeCacheReservation<
+    CacheEntryRole::kFilterConstruction>(
+    std::size_t incremental_memory_used,
+    std::unique_ptr<
+        CacheReservationHandle<CacheEntryRole::kFilterConstruction>>* handle);
 template Status
 CacheReservationManager::IncreaseCacheReservation(
@@ -153,6 +158,14 @@ Slice CacheReservationManager::GetNextCacheKey() {
   return Slice(cache_key_, static_cast<std::size_t>(end - cache_key_));
 }
+template <CacheEntryRole R>
+Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole() {
+  return GetNoopDeleterForRole<R>();
+}
+
+template Cache::DeleterFn CacheReservationManager::TEST_GetNoopDeleterForRole<
+    CacheEntryRole::kFilterConstruction>();
+
 template <CacheEntryRole R>
 CacheReservationHandle<R>::CacheReservationHandle(
     std::size_t incremental_memory_used,
@@ -175,4 +188,5 @@ CacheReservationHandle<R>::~CacheReservationHandle() {
 // Explicitly instantiate templates for "CacheEntryRole" values we use.
 // This makes it possible to keep the template definitions in the .cc file.
 template class CacheReservationHandle<CacheEntryRole::kMisc>;
+template class CacheReservationHandle<CacheEntryRole::kFilterConstruction>;
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h
index faa516217..9db495a20 100644
--- a/cache/cache_reservation_manager.h
+++ b/cache/cache_reservation_manager.h
@@ -150,6 +150,12 @@ class CacheReservationManager
   static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
+  // For testing only - helps ensure that the NoopDeleterForRole accessed
+  // from CacheReservationManager and the one accessed from the test come
+  // from the same translation unit
+  template <CacheEntryRole R>
+  static Cache::DeleterFn TEST_GetNoopDeleterForRole();
+
  private:
   static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   // The key will be longer than keys for blocks in SST files so they won't
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index b856e0de9..5bc746a8e 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -10,9 +10,12 @@
 #include
 #include
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
 #include "db/db_test_util.h"
 #include "options/options_helper.h"
 #include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/perf_context.h"
 #include "table/block_based/filter_policy_internal.h"
 #include "test_util/testutil.h"
@@ -667,6 +670,439 @@ TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) {
   }
 }
+
+/*
+ * A cache wrapper that tracks the peaks and increments of filter
+ * construction cache reservation.
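+ * The wrapper keys off the deleter function: Insert() calls whose deleter is
+ * kNoopDeleterForFilterConstruction (obtained from CacheReservationManager)
+ * are counted as filter construction reservations; all other cache traffic
+ * is ignored.
+ *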
+ *          p0
+ *         / \    p1
+ *        /   \  /\
+ *       /     \/  \
+ *    a /       b   \
+ * peaks = {p0, p1}
+ * increments = {p0-a, p1-b}
+ */
+class FilterConstructResPeakTrackingCache : public CacheWrapper {
+ public:
+  explicit FilterConstructResPeakTrackingCache(std::shared_ptr<Cache> target)
+      : CacheWrapper(std::move(target)),
+        cur_cache_res_(0),
+        cache_res_peak_(0),
+        cache_res_increment_(0),
+        last_peak_tracked_(false),
+        cache_res_increments_sum_(0) {}
+
+  using Cache::Insert;
+  Status Insert(const Slice& key, void* value, size_t charge,
+                void (*deleter)(const Slice& key, void* value),
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override {
+    Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+    if (deleter == kNoopDeleterForFilterConstruction) {
+      if (last_peak_tracked_) {
+        cache_res_peak_ = 0;
+        cache_res_increment_ = 0;
+        last_peak_tracked_ = false;
+      }
+      cur_cache_res_ += charge;
+      cache_res_peak_ = std::max(cache_res_peak_, cur_cache_res_);
+      cache_res_increment_ += charge;
+    }
+    return s;
+  }
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool force_erase = false) override {
+    auto deleter = GetDeleter(handle);
+    if (deleter == kNoopDeleterForFilterConstruction) {
+      if (!last_peak_tracked_) {
+        cache_res_peaks_.push_back(cache_res_peak_);
+        cache_res_increments_sum_ += cache_res_increment_;
+        last_peak_tracked_ = true;
+      }
+      cur_cache_res_ -= GetCharge(handle);
+    }
+    bool is_successful = target_->Release(handle, force_erase);
+    return is_successful;
+  }
+
+  std::deque<std::size_t> GetReservedCachePeaks() { return cache_res_peaks_; }
+
+  std::size_t GetReservedCacheIncrementSum() {
+    return cache_res_increments_sum_;
+  }
+
+ private:
+  static const Cache::DeleterFn kNoopDeleterForFilterConstruction;
+
+  std::size_t cur_cache_res_;
+  std::size_t cache_res_peak_;
+  std::size_t cache_res_increment_;
+  bool last_peak_tracked_;
+  std::deque<std::size_t> cache_res_peaks_;
+  std::size_t cache_res_increments_sum_;
+};
+
+const Cache::DeleterFn
+    FilterConstructResPeakTrackingCache::kNoopDeleterForFilterConstruction =
+        CacheReservationManager::TEST_GetNoopDeleterForRole<
+            CacheEntryRole::kFilterConstruction>();
+
+// To align with the type of hash entry being reserved in the implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class DBFilterConstructionReserveMemoryTestWithParam
+    : public DBTestBase,
+      public testing::WithParamInterface<
+          std::tuple<bool, BloomFilterPolicy::Mode, bool>> {
+ public:
+  DBFilterConstructionReserveMemoryTestWithParam()
+      : DBTestBase("db_bloom_filter_tests",
+                   /*env_do_fsync=*/true),
+        num_key_(0),
+        reserve_table_builder_memory_(std::get<0>(GetParam())),
+        policy_(std::get<1>(GetParam())),
+        partition_filters_(std::get<2>(GetParam())) {
+    if (!reserve_table_builder_memory_ ||
+        policy_ == BloomFilterPolicy::Mode::kDeprecatedBlock ||
+        policy_ == BloomFilterPolicy::Mode::kLegacyBloom) {
+      // For these cases, we are only interested in whether filter
+      // construction cache reservation happens at all, not in its accuracy.
+      // Therefore we don't need many keys.
+      num_key_ = 5;
+    } else if (partition_filters_) {
+      // For the PartitionFilter case, since we set
+      // table_options.metadata_block_size big enough that each partition
+      // triggers at least 1 dummy entry reservation each for hash entries
+      // and final filter, we need a large number of keys to ensure we have
+      // at least two partitions.
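+      // (For reference: with the 256 KiB dummy entry size and 8-byte
+      // uint64_t hash entries, the expression below evaluates to
+      // 18 * 262144 / 8 = 589,824 keys.)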
+      num_key_ = 18 * CacheReservationManager::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else if (policy_ == BloomFilterPolicy::Mode::kFastLocalBloom) {
+      // For the Bloom Filter + FullFilter case, since we designed num_key_
+      // to make the hash entry cache reservation a multiple of dummy
+      // entries, the correct behavior of charging the final filter on top
+      // of it will trigger at least another dummy entry insertion.
+      // Therefore we can assert that behavior, and we don't need a large
+      // number of keys to verify that we indeed charge the final filter for
+      // cache reservation, even though the final filter is a lot smaller
+      // than the hash entries.
+      num_key_ = 1 * CacheReservationManager::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    } else {
+      // For the Ribbon Filter + FullFilter case, we need a large enough
+      // number of keys so that charging the final filter after releasing
+      // the hash entries reservation will trigger at least another dummy
+      // entry (or equivalently, cause another peak in cache reservation),
+      // as the banding reservation might not be a multiple of the dummy
+      // entry size.
+      num_key_ = 12 * CacheReservationManager::GetDummyEntrySize() /
+                 sizeof(FilterConstructionReserveMemoryHash);
+    }
+  }
+
+  BlockBasedTableOptions GetBlockBasedTableOptions() {
+    BlockBasedTableOptions table_options;
+
+    // We set the cache capacity big enough to prevent the cache from filling
+    // up, for convenience in calculation.
+    constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+    table_options.reserve_table_builder_memory = reserve_table_builder_memory_;
+    table_options.filter_policy.reset(new BloomFilterPolicy(10, policy_));
+    table_options.partition_filters = partition_filters_;
+    if (table_options.partition_filters) {
+      table_options.index_type =
+          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+      // We set table_options.metadata_block_size big enough so that each
+      // partition triggers at least 1 dummy entry insertion each for hash
+      // entries and final filter.
+      table_options.metadata_block_size = 409000;
+    }
+
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    cache_ = std::make_shared<FilterConstructResPeakTrackingCache>(
+        NewLRUCache(lo));
+    table_options.block_cache = cache_;
+
+    return table_options;
+  }
+
+  std::size_t GetNumKey() { return num_key_; }
+
+  bool ReserveTableBuilderMemory() { return reserve_table_builder_memory_; }
+
+  BloomFilterPolicy::Mode GetFilterPolicy() { return policy_; }
+
+  bool PartitionFilters() { return partition_filters_; }
+
+  std::shared_ptr<FilterConstructResPeakTrackingCache>
+  GetFilterConstructResPeakTrackingCache() {
+    return cache_;
+  }
+
+ private:
+  std::size_t num_key_;
+  bool reserve_table_builder_memory_;
+  BloomFilterPolicy::Mode policy_;
+  bool partition_filters_;
+  std::shared_ptr<FilterConstructResPeakTrackingCache> cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableOptions, DBFilterConstructionReserveMemoryTestWithParam,
+    ::testing::Values(
+        std::make_tuple(false, BloomFilterPolicy::Mode::kFastLocalBloom, false),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, false),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kFastLocalBloom, true),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon,
+                        false),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kStandard128Ribbon,
+                        true),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kDeprecatedBlock, false),
+        std::make_tuple(true, BloomFilterPolicy::Mode::kLegacyBloom, false)));
+
+// TODO: Speed up this test.
+// The current test inserts many keys (on the scale of the dummy entry size)
+// in order to make the small memory users (e.g., final filter, partitioned
+// hash entries/filter/banding), which are proportional to the number of
+// keys, big enough that their cache reservations trigger dummy entry
+// insertion and become observable in the test.
+//
+// However, inserting that many keys slows down this test, and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1. Use a sync point during cache reservation of filter construction
+//
+//    Benefit: It does not rely on triggering dummy entry insertion;
+//    instead, the sync point verifies that the small memory users are
+//    charged correctly.
+//
+//    Challenge: this approach is intrusive.
+//
+// 2. Make the dummy entry size configurable and set it small in the test
+//
+//    Benefit: It increases the precision of cache reservation, and
+//    therefore small memory usage can still trigger the insertion of a
+//    dummy entry.
+//
+//    Challenge: changing CacheReservationManager-related APIs; a hack
+//    might also be needed to control the size of the dummy entry of the
+//    CacheReservationManager used in filter construction for testing,
+//    since CacheReservationManager is not exposed at a high level.
+//
+TEST_P(DBFilterConstructionReserveMemoryTestWithParam, ReserveMemory) {
+  Options options = CurrentOptions();
+  // We set write_buffer_size big enough so that, in the case where there is
+  // filter construction cache reservation, flush won't be triggered before
+  // we manually trigger it, for clean testing.
+  options.write_buffer_size = 640 << 20;
+  options.table_factory.reset(
+      NewBlockBasedTableFactory(GetBlockBasedTableOptions()));
+  std::shared_ptr<FilterConstructResPeakTrackingCache> cache =
+      GetFilterConstructResPeakTrackingCache();
+  options.create_if_missing = true;
+  // Disable auto compaction to prevent its unexpected side effects on the
+  // number of keys per partition designed in this test.
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  int num_key = static_cast<int>(GetNumKey());
+  for (int i = 0; i < num_key; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+
+  ASSERT_EQ(cache->GetReservedCacheIncrementSum(), 0)
+      << "Flush was triggered too early in the test case with filter "
+         "construction cache reservation - please make sure no flush is "
+         "triggered during the key insertions above";
+
+  ASSERT_OK(Flush());
+
+  bool reserve_table_builder_memory = ReserveTableBuilderMemory();
+  BloomFilterPolicy::Mode policy = GetFilterPolicy();
+  bool partition_filters = PartitionFilters();
+
+  std::deque<std::size_t> filter_construction_cache_res_peaks =
+      cache->GetReservedCachePeaks();
+  std::size_t filter_construction_cache_res_increments_sum =
+      cache->GetReservedCacheIncrementSum();
+
+  if (!reserve_table_builder_memory) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+    return;
+  }
+
+  if (policy == BloomFilterPolicy::Mode::kDeprecatedBlock ||
+      policy == BloomFilterPolicy::Mode::kLegacyBloom) {
+    EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
+        << "There shouldn't be filter construction cache reservation, as "
+           "this feature supports neither "
+           "BloomFilterPolicy::Mode::kDeprecatedBlock nor "
+           "BloomFilterPolicy::Mode::kLegacyBloom";
+    return;
+  }
+
+  const std::size_t kDummyEntrySize =
+      CacheReservationManager::GetDummyEntrySize();
+
+  const std::size_t predicted_hash_entries_cache_res =
+      num_key * sizeof(FilterConstructionReserveMemoryHash);
+  ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+      << "It's by this test's design that "
+         "predicted_hash_entries_cache_res is "
+         "a multiple of the dummy entry size";
+
+  const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+      predicted_hash_entries_cache_res / kDummyEntrySize;
+  // (The final filter is roughly 1/6 the size of the 8-byte-per-key hash
+  // entries at 10 bits per key, and roughly 30% smaller again for Ribbon.)
+  const std::size_t predicted_final_filter_cache_res =
+      static_cast<std::size_t>(std::ceil(
+          1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
+          (policy == BloomFilterPolicy::Mode::kStandard128Ribbon ? 0.7 : 1))) *
+      kDummyEntrySize;
+  // (The 2.5 factor approximates the ratio of the banding allocation to the
+  // 8-byte hash entries; see StandardBanding::EstimateMemoryUsage() in
+  // util/ribbon_impl.h.)
+  const std::size_t predicted_banding_cache_res =
+      static_cast<std::size_t>(
+          std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
+      kDummyEntrySize;
+
+  if (policy == BloomFilterPolicy::Mode::kFastLocalBloom) {
+    /* BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter
+     *        p0
+     *       /  \
+     *    b /    \
+     *     /      \
+     *    /        \
+     *  0/          \
+     * hash entries = b - 0, final filter = p0 - b
+     * p0 = hash entries + final filter
+     *
+     * The test is designed in a way such that the reservation for b is a
+     * multiple of dummy entries, so that the reservation for (p0 - b)
+     * will trigger at least another dummy entry insertion.
+     *
+     * BloomFilterPolicy::Mode::kFastLocalBloom + PartitionedFilter
+     *                     p1
+     *                    /  \
+     *        p0      b' /    \
+     *       /  \       /      \
+     *    b /    \     /        \
+     *     /      \   /          \
+     *    /        \ a            \
+     *  0/                         \
+     * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+     * partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
+     *
+     * (increment p0 - 0) + (increment p1 - a)
+     * = partitioned hash entries1 + partitioned hash entries2
+     *   + partitioned final filter1 + partitioned final filter2
+     * = hash entries + final filter
+     */
+    if (!partition_filters) {
+      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+          << "Filter construction cache reservation should have only 1 peak "
+             "in case: BloomFilterPolicy::Mode::kFastLocalBloom + FullFilter";
+      std::size_t filter_construction_cache_res_peak =
+          filter_construction_cache_res_peaks[0];
+      EXPECT_GT(filter_construction_cache_res_peak,
+                predicted_hash_entries_cache_res)
+          << "The testing number of hash entries is designed to make the "
+             "hash entries cache reservation a multiple of dummy entries, "
+             "so the correct behavior of charging the final filter on top of "
+             "it should've triggered at least another dummy entry insertion";
+
+      std::size_t predicted_filter_construction_cache_res_peak =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 0.9);
+      EXPECT_LE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 1.1);
+      return;
+    } else {
+      EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+          << "Filter construction cache reservation should have multiple "
+             "peaks in case: BloomFilterPolicy::Mode::kFastLocalBloom + "
+             "PartitionedFilter";
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+
+  if (policy == BloomFilterPolicy::Mode::kStandard128Ribbon) {
+    /* BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter
+     *        p0
+     *       /  \   p1
+     *      /    \  /\
+     *   b /      \/  \
+     *    /       b'   \
+     *  0/              \
+     * hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
+     * p0 = hash entries + banding
+     *
+     * The test is designed in a way such that the reservation for (p1 - b')
+     * will trigger at least another dummy entry insertion
+     * (or equivalently, create another peak).
+     *
+     * BloomFilterPolicy::Mode::kStandard128Ribbon + PartitionedFilter
+     *                               p3
+     *        p0                    /  \   p4
+     *       /  \   p1             /    \  /\
+     *      /    \  /\        b'' /     a'   \
+     *   b /      \/  \          /            \
+     *    /       b'   \        /              \
+     *  0/              a______/                \
+     * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
+     * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
+     * partitioned final filter1 = p1 - b', partitioned final filter2 = p4 - a'
+     *
+     * (increment p0 - 0) + (increment p1 - b')
+     *   + (increment p3 - a) + (increment p4 - a')
+     * = partitioned hash entries1 + partitioned hash entries2
+     *   + partitioned banding1 + partitioned banding2
+     *   + partitioned final filter1 + partitioned final filter2
+     * = hash entries + banding + final filter
+     */
+    if (!partition_filters) {
+      ASSERT_GE(std::floor(1.0 * predicted_final_filter_cache_res /
+                           CacheReservationManager::GetDummyEntrySize()),
+                1)
+          << "Final filter cache reservation too small for this test - "
+             "please increase the number of keys";
+      EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
+          << "Filter construction cache reservation should have 2 peaks in "
+             "case: BloomFilterPolicy::Mode::kStandard128Ribbon + FullFilter. "
+             "The second peak results from charging the final filter after "
+             "releasing the hash entry reservation, since the test's final "
+             "filter reservation is designed to be at least 1 dummy entry "
+             "size";
+
+      std::size_t filter_construction_cache_res_peak =
+          filter_construction_cache_res_peaks[0];
+      std::size_t predicted_filter_construction_cache_res_peak =
+          predicted_hash_entries_cache_res + predicted_banding_cache_res;
+      EXPECT_GE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 0.9);
+      EXPECT_LE(filter_construction_cache_res_peak,
+                predicted_filter_construction_cache_res_peak * 1.1);
+      return;
+    } else {
+      EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
+          << "Filter construction cache reservation should have at least 3 "
+             "peaks in case: BloomFilterPolicy::Mode::kStandard128Ribbon + "
+             "PartitionedFilter";
+      std::size_t predicted_filter_construction_cache_res_increments_sum =
+          predicted_hash_entries_cache_res + predicted_banding_cache_res +
+          predicted_final_filter_cache_res;
+      EXPECT_GE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 0.9);
+      EXPECT_LE(filter_construction_cache_res_increments_sum,
+                predicted_filter_construction_cache_res_increments_sum * 1.1);
+      return;
+    }
+  }
+}
+
 namespace {
 // A wrapped bloom over block-based FilterPolicy
 class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy {
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 498365256..dcc24df18 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -285,6 +285,25 @@ struct BlockBasedTableOptions {
   // separately
   uint64_t metadata_block_size = 4096;
+  // If true, a dynamically updating charge to the block cache, loosely based
+  // on the actual memory usage of table building, will occur to account for
+  // the memory, if a block cache is available.
+  //
+  // Charged memory usage includes:
+  // 1. (new) Bloom Filter and Ribbon Filter construction
+  // 2. More to come...
+  //
+  // Note:
+  // 1. (new) Bloom Filter and Ribbon Filter construction
+  //
+  //    If the additional temporary memory used by Ribbon Filter construction
+  //    uses up too much of the available space left in the block cache at
+  //    some point (i.e., causing a cache-full error when
+  //    strict_capacity_limit = true), construction will fall back to (new)
+  //    Bloom Filter.
+  //
+  // Default: false
+  bool reserve_table_builder_memory = false;
+
   // Note: currently this option requires kTwoLevelIndexSearch to be set as
   // well.
   // TODO(myabandeh): remove the note above once the limitation is lifted
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 4168c0b96..a95936967 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -175,6 +175,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "optimize_filters_for_memory=true;"
       "index_block_restart_interval=4;"
       "filter_policy=bloomfilter:4:true;whole_key_filtering=1;"
+      "reserve_table_builder_memory=false;"
       "format_version=1;"
       "hash_index_allow_collision=false;"
       "verify_compression=true;read_amp_bytes_per_bit=0;"
diff --git a/options/options_test.cc b/options/options_test.cc
index e2b098a0d..3514420a3 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -852,6 +852,7 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
       "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
       "block_size_deviation=8;block_restart_interval=4;"
       "format_version=5;whole_key_filtering=1;"
+      "reserve_table_builder_memory=true;"
       "filter_policy=bloomfilter:4.567:false;"
       // A bug caused read_amp_bytes_per_bit to be a large integer in OPTIONS
       // file generated by 6.10 to 6.14. Though bug is fixed in these releases,
@@ -872,6 +873,7 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
   ASSERT_EQ(new_opt.block_restart_interval, 4);
   ASSERT_EQ(new_opt.format_version, 5U);
   ASSERT_EQ(new_opt.whole_key_filtering, true);
+  ASSERT_EQ(new_opt.reserve_table_builder_memory, true);
   ASSERT_TRUE(new_opt.filter_policy != nullptr);
   const BloomFilterPolicy* bfp =
       dynamic_cast<const BloomFilterPolicy*>(new_opt.filter_policy.get());
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 6abd3629a..780eb4f37 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1558,6 +1558,7 @@ void BlockBasedTableBuilder::WriteFilterBlock(
       WriteRawBlock(filter_content, kNoCompression, &filter_block_handle,
                     BlockType::kFilter);
     }
+    rep_->filter_builder->ResetFilterBitsBuilder();
   }
   if (ok() && !empty_filter_block) {
     // Add mapping from ".Name" to location
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 259effb87..66bb44e52 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -356,6 +356,10 @@ static std::unordered_map
         {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
          OptionType::kBoolean, OptionVerificationType::kNormal,
          OptionTypeFlags::kNone}},
+       {"reserve_table_builder_memory",
+        {offsetof(struct BlockBasedTableOptions, reserve_table_builder_memory),
+         OptionType::kBoolean, OptionVerificationType::kNormal,
+         OptionTypeFlags::kNone}},
        {"skip_table_builder_flush",
         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
          OptionTypeFlags::kNone}},
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 55ab254d9..765eb0244 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -75,12 +75,17 @@ class FilterBlockBuilder {
   }
   // If filter_data is not nullptr, Finish() may transfer ownership of
   // underlying filter data to the caller, so that it can be freed as soon as
-  // possible.
+  // possible. BlockBasedFilterBlock will ignore this parameter.
+  //
   virtual Slice Finish(
       const BlockHandle& tmp /* only used in PartitionedFilterBlock as
                                 last_partition_block_handle */
       ,
       Status* status, std::unique_ptr<const char[]>* filter_data = nullptr) = 0;
+
+  // Releases the memory usage and cache reservation of the filter bits
+  // builder; used by FullFilter and PartitionedFilter
+  virtual void ResetFilterBitsBuilder() {}
 };

 // A FilterBlockReader is used to parse filter from SST table.
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index eb133c7e4..b0f345c6a 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -12,7 +12,10 @@
 #include
 #include
 #include
+#include
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
 #include "logging/logging.h"
 #include "rocksdb/slice.h"
 #include "table/block_based/block_based_filter_block.h"
@@ -48,8 +51,10 @@ Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) {
 class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
  public:
   explicit XXPH3FilterBitsBuilder(
-      std::atomic<int64_t>* aggregate_rounding_balance)
-      : aggregate_rounding_balance_(aggregate_rounding_balance) {}
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr)
+      : aggregate_rounding_balance_(aggregate_rounding_balance),
+        cache_res_mgr_(cache_res_mgr) {}

   ~XXPH3FilterBitsBuilder() override {}

@@ -61,6 +66,18 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     // requirements.
     if (hash_entries_.empty() || hash != hash_entries_.back()) {
       hash_entries_.push_back(hash);
+      if (cache_res_mgr_ &&
+          // Traditional rounding to whole bucket size: charge a whole bucket
+          // once the entry count reaches the midpoint of that bucket, so the
+          // total reservation rounds to the nearest whole bucket.
+          ((hash_entries_.size() % kUint64tHashEntryCacheResBucketSize) ==
+           kUint64tHashEntryCacheResBucketSize / 2)) {
+        hash_entry_cache_res_bucket_handles_.emplace_back(nullptr);
+        Status s =
+            cache_res_mgr_
+                ->MakeCacheReservation<CacheEntryRole::kFilterConstruction>(
+                    kUint64tHashEntryCacheResBucketSize * sizeof(hash),
+                    &hash_entry_cache_res_bucket_handles_.back());
+        s.PermitUncheckedError();
+      }
     }
   }

@@ -71,6 +88,11 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
 protected:
   static constexpr uint32_t kMetadataLen = 5;

+  // Number of hash entries to accumulate before charging their memory usage
+  // to the cache when cache reservation is available
+  static const std::size_t kUint64tHashEntryCacheResBucketSize =
+      CacheReservationManager::GetDummyEntrySize() / sizeof(uint64_t);
+
   // For delegating between XXPH3FilterBitsBuilders
   void SwapEntriesWith(XXPH3FilterBitsBuilder* other) {
     std::swap(hash_entries_, other->hash_entries_);
@@ -183,6 +205,21 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
   // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr,
   // always "round up" like historic behavior.
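   // (Note on lifetime: each CacheReservationHandle releases its share of the
   // reservation when destroyed, so clearing the handle deques declared below
   // returns the charged memory to the block cache.)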
   std::atomic<int64_t>* aggregate_rounding_balance_;
+
+  // For reserving memory used in (new) Bloom and Ribbon Filter construction
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+
+  // For managing cache reservation for buckets of hash entries in (new)
+  // Bloom and Ribbon Filter construction
+  std::deque<std::unique_ptr<
+      CacheReservationHandle<CacheEntryRole::kFilterConstruction>>>
+      hash_entry_cache_res_bucket_handles_;
+
+  // For managing cache reservation for the final filter in (new) Bloom and
+  // Ribbon Filter construction
+  std::deque<std::unique_ptr<
+      CacheReservationHandle<CacheEntryRole::kFilterConstruction>>>
+      final_filter_cache_res_handles_;
 };

 // #################### FastLocalBloom implementation ################## //
@@ -194,8 +231,9 @@ class FastLocalBloomBitsBuilder : public XXPH3FilterBitsBuilder {
   // Non-null aggregate_rounding_balance implies optimize_filters_for_memory
   explicit FastLocalBloomBitsBuilder(
       const int millibits_per_key,
-      std::atomic<int64_t>* aggregate_rounding_balance)
-      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr)
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr),
         millibits_per_key_(millibits_per_key) {
     assert(millibits_per_key >= 1000);
   }
@@ -213,6 +251,20 @@
     std::unique_ptr<char[]> mutable_buf;
     len_with_metadata =
         AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);
+    // Cache reservation for mutable_buf
+    if (cache_res_mgr_) {
+      std::unique_ptr<
+          CacheReservationHandle<CacheEntryRole::kFilterConstruction>>
+          final_filter_cache_res_handle;
+      Status s =
+          cache_res_mgr_
+              ->MakeCacheReservation<CacheEntryRole::kFilterConstruction>(
+                  len_with_metadata * sizeof(char),
+                  &final_filter_cache_res_handle);
+      final_filter_cache_res_handles_.push_back(
+          std::move(final_filter_cache_res_handle));
+      s.PermitUncheckedError();
+    }
     assert(mutable_buf);
     assert(len_with_metadata >= kMetadataLen);
@@ -229,6 +281,8 @@
     }
     assert(hash_entries_.empty());
+    // Release cache for hash entries
+    hash_entry_cache_res_bucket_handles_.clear();

     // See BloomFilterPolicy::GetBloomBitsReader re: metadata
     // -1 = Marker for newer Bloom implementations
@@ -426,11 +480,13 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
  public:
   explicit Standard128RibbonBitsBuilder(
       double desired_one_in_fp_rate, int bloom_millibits_per_key,
-      std::atomic<int64_t>* aggregate_rounding_balance, Logger* info_log)
-      : XXPH3FilterBitsBuilder(aggregate_rounding_balance),
+      std::atomic<int64_t>* aggregate_rounding_balance,
+      std::shared_ptr<CacheReservationManager> cache_res_mgr, Logger* info_log)
+      : XXPH3FilterBitsBuilder(aggregate_rounding_balance, cache_res_mgr),
         desired_one_in_fp_rate_(desired_one_in_fp_rate),
         info_log_(info_log),
-        bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) {
+        bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance,
+                        cache_res_mgr) {
     assert(desired_one_in_fp_rate >= 1.0);
   }
@@ -472,6 +528,31 @@
     }
     BandingType banding;
+    std::size_t bytes_banding = ribbon::StandardBanding<
+        Standard128RibbonTypesAndSettings>::EstimateMemoryUsage(num_slots);
+    Status status_banding_cache_res = Status::OK();
+
+    // Cache reservation for banding
+    std::unique_ptr<
+        CacheReservationHandle<CacheEntryRole::kFilterConstruction>>
+        banding_res_handle;
+    if (cache_res_mgr_) {
+      status_banding_cache_res =
+          cache_res_mgr_
+              ->MakeCacheReservation<CacheEntryRole::kFilterConstruction>(
+                  bytes_banding, &banding_res_handle);
+    }
+
+    if (status_banding_cache_res.IsIncomplete()) {
+      ROCKS_LOG_WARN(info_log_,
+                     "Cache reservation for Ribbon filter banding failed due "
+                     "to cache full");
+      SwapEntriesWith(&bloom_fallback_);
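+      // The Bloom fallback takes over the accumulated hash entries via the
+      // swap above, which leaves this builder's entry vector empty, as
+      // asserted below.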
+      assert(hash_entries_.empty());
+      // Release cache for banding since the banding won't be allocated
+      banding_res_handle.reset();
+      return bloom_fallback_.Finish(buf);
+    }
+
     bool success = banding.ResetAndFindSeedToSolve(
         num_slots, hash_entries_.begin(), hash_entries_.end(),
         /*starting seed*/ entropy & 255, /*seed mask*/ 255);
@@ -485,6 +566,8 @@
       return bloom_fallback_.Finish(buf);
     }
     hash_entries_.clear();
+    // Release cache for hash entries
+    hash_entry_cache_res_bucket_handles_.clear();

     uint32_t seed = banding.GetOrdinalSeed();
     assert(seed < 256);
@@ -492,6 +575,20 @@
     std::unique_ptr<char[]> mutable_buf;
     len_with_metadata =
         AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf);
+    // Cache reservation for mutable_buf
+    if (cache_res_mgr_) {
+      std::unique_ptr<
+          CacheReservationHandle<CacheEntryRole::kFilterConstruction>>
+          final_filter_cache_res_handle;
+      Status s =
+          cache_res_mgr_
+              ->MakeCacheReservation<CacheEntryRole::kFilterConstruction>(
+                  len_with_metadata * sizeof(char),
+                  &final_filter_cache_res_handle);
+      final_filter_cache_res_handles_.push_back(
+          std::move(final_filter_cache_res_handle));
+      s.PermitUncheckedError();
+    }

     SolnType soln(mutable_buf.get(), len_with_metadata);
     soln.BackSubstFrom(banding);
@@ -1135,6 +1232,14 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
     const FilterBuildingContext& context) const {
   Mode cur = mode_;
   bool offm = context.table_options.optimize_filters_for_memory;
+  bool reserve_filter_construction_mem =
+      (context.table_options.reserve_table_builder_memory &&
+       context.table_options.block_cache);
+  std::shared_ptr<CacheReservationManager> cache_res_mgr;
+  if (reserve_filter_construction_mem) {
+    cache_res_mgr = std::make_shared<CacheReservationManager>(
+        context.table_options.block_cache);
+  }
   // Unusual code construction so that we can have just
   // one exhaustive switch without (risky) recursion
   for (int i = 0; i < 2; ++i) {
@@ -1150,7 +1255,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
         return nullptr;
       case kFastLocalBloom:
         return new FastLocalBloomBitsBuilder(
-            millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr);
+            millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr,
+            cache_res_mgr);
       case kLegacyBloom:
         if (whole_bits_per_key_ >= 14 && context.info_log &&
             !warned_.load(std::memory_order_relaxed)) {
@@ -1175,7 +1281,8 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext(
       case kStandard128Ribbon:
         return new Standard128RibbonBitsBuilder(
             desired_one_in_fp_rate_, millibits_per_key_,
-            offm ? &aggregate_rounding_balance_ : nullptr, context.info_log);
+            offm ? &aggregate_rounding_balance_ : nullptr, cache_res_mgr,
+            context.info_log);
     }
   }
   assert(false);
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h
index fc4dd9f5a..bc3f958e0 100644
--- a/table/block_based/full_filter_block.h
+++ b/table/block_based/full_filter_block.h
@@ -59,6 +59,10 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
       std::unique_ptr<const char[]>* filter_data = nullptr) override;
   using FilterBlockBuilder::Finish;

+  virtual void ResetFilterBitsBuilder() override {
+    filter_bits_builder_.reset();
+  }
+
  protected:
   virtual void AddKey(const Slice& key);
   std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index dbde584b3..8636315ee 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -19,6 +19,8 @@ int main() {
 #include
 #include
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
 #include "memory/arena.h"
 #include "port/jemalloc_helper.h"
 #include "rocksdb/filter_policy.h"
@@ -601,6 +603,50 @@ TEST_P(FullBloomTest, OptimizeForMemory) {
   }
 }
+
+TEST(FullBloomFilterConstructionReserveMemTest,
+     RibbonFilterFallBackOnLargeBanding) {
+  constexpr std::size_t kCacheCapacity =
+      8 * CacheReservationManager::GetDummyEntrySize();
+  constexpr std::size_t num_entries_for_cache_full = kCacheCapacity / 8;
+
+  for (bool reserve_builder_mem : {true, false}) {
+    bool will_fall_back = reserve_builder_mem;
+
+    BlockBasedTableOptions table_options;
+    table_options.reserve_table_builder_memory = reserve_builder_mem;
+    LRUCacheOptions lo;
+    lo.capacity = kCacheCapacity;
+    lo.num_shard_bits = 0;  // 2^0 shard
+    lo.strict_capacity_limit = true;
+    std::shared_ptr<Cache> cache(NewLRUCache(lo));
+    table_options.block_cache = cache;
+    table_options.filter_policy.reset(new BloomFilterPolicy(
+        FLAGS_bits_per_key, BloomFilterPolicy::Mode::kStandard128Ribbon));
+    FilterBuildingContext ctx(table_options);
+    std::unique_ptr<FilterBitsBuilder> filter_bits_builder(
+        table_options.filter_policy->GetBuilderWithContext(ctx));
+
+    char key_buffer[sizeof(int)];
+    for (std::size_t i = 0; i < num_entries_for_cache_full; ++i) {
+      filter_bits_builder->AddKey(Key(static_cast<int>(i), key_buffer));
+    }
+
+    std::unique_ptr<const char[]> buf;
+    Slice filter = filter_bits_builder->Finish(&buf);
+
+    // To verify that the Ribbon Filter falls back to the Bloom Filter
+    // properly, based on the cache reservation result.
+    // See BloomFilterPolicy::GetBloomBitsReader re: metadata
+    // -1 = Marker for newer Bloom implementations
+    // -2 = Marker for Standard128 Ribbon
+    if (will_fall_back) {
+      EXPECT_EQ(filter.data()[filter.size() - 5], static_cast<char>(-1));
+    } else {
+      EXPECT_EQ(filter.data()[filter.size() - 5], static_cast<char>(-2));
+    }
+  }
+}
+
 namespace {
 inline uint32_t SelectByCacheLineSize(uint32_t for64, uint32_t for128,
                                       uint32_t for256) {
diff --git a/util/filter_bench.cc b/util/filter_bench.cc
index 1bc41caf8..69a2a3b45 100644
--- a/util/filter_bench.cc
+++ b/util/filter_bench.cc
@@ -19,6 +19,7 @@ int main() {
 #include "memory/arena.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/cache.h"
 #include "rocksdb/system_clock.h"
 #include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
@@ -93,6 +94,18 @@ DEFINE_bool(net_includes_hashing, false,
 DEFINE_bool(optimize_filters_for_memory, false,
             "Setting for BlockBasedTableOptions::optimize_filters_for_memory");
+DEFINE_uint32(block_cache_capacity_MB, 8,
+              "Setting for "
+              "LRUCacheOptions::capacity");
+
+DEFINE_bool(reserve_table_builder_memory, false,
+            "Setting for "
"BlockBasedTableOptions::reserve_table_builder_memory"); + +DEFINE_bool(strict_capacity_limit, false, + "Setting for " + "LRUCacheOptions::strict_capacity_limit"); + DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries"); DEFINE_bool(best_case, false, "Run limited tests only for best-case"); @@ -125,6 +138,7 @@ using ROCKSDB_NAMESPACE::BloomFilterPolicy; using ROCKSDB_NAMESPACE::BloomHash; using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder; using ROCKSDB_NAMESPACE::CachableEntry; +using ROCKSDB_NAMESPACE::Cache; using ROCKSDB_NAMESPACE::EncodeFixed32; using ROCKSDB_NAMESPACE::FastRange32; using ROCKSDB_NAMESPACE::FilterBitsReader; @@ -133,6 +147,7 @@ using ROCKSDB_NAMESPACE::FullFilterBlockReader; using ROCKSDB_NAMESPACE::GetSliceHash; using ROCKSDB_NAMESPACE::GetSliceHash64; using ROCKSDB_NAMESPACE::Lower32of64; +using ROCKSDB_NAMESPACE::LRUCacheOptions; using ROCKSDB_NAMESPACE::ParsedFullFilterBlock; using ROCKSDB_NAMESPACE::PlainTableBloomV1; using ROCKSDB_NAMESPACE::Random32; @@ -285,6 +300,16 @@ struct FilterBench : public MockBlockBasedTableTester { ioptions_.logger = &stderr_logger_; table_options_.optimize_filters_for_memory = FLAGS_optimize_filters_for_memory; + if (FLAGS_reserve_table_builder_memory) { + table_options_.reserve_table_builder_memory = true; + table_options_.no_block_cache = false; + LRUCacheOptions lo; + lo.capacity = FLAGS_block_cache_capacity_MB * 1024 * 1024; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = FLAGS_strict_capacity_limit; + std::shared_ptr cache(NewLRUCache(lo)); + table_options_.block_cache = cache; + } } void Go(); diff --git a/util/ribbon_impl.h b/util/ribbon_impl.h index 18343516f..0afecc67d 100644 --- a/util/ribbon_impl.h +++ b/util/ribbon_impl.h @@ -677,6 +677,16 @@ class StandardBanding : public StandardHasher { return false; } + static std::size_t EstimateMemoryUsage(uint32_t num_slots) { + std::size_t bytes_coeff_rows = num_slots * sizeof(CoeffRow); + std::size_t bytes_result_rows = num_slots * sizeof(ResultRow); + std::size_t bytes_backtrack = 0; + std::size_t bytes_banding = + bytes_coeff_rows + bytes_result_rows + bytes_backtrack; + + return bytes_banding; + } + protected: // TODO: explore combining in a struct std::unique_ptr coeff_rows_;