diff --git a/HISTORY.md b/HISTORY.md index 556a5ff28..8934d15fd 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,8 @@ ### New Features * DB::GetLiveFilesStorageInfo is ready for production use. * Add new stats PREFETCHED_BYTES_DISCARDED which records number of prefetched bytes discarded by RocksDB FilePrefetchBuffer on destruction and POLL_WAIT_MICROS records wait time for FS::Poll API completion. +* Track memory usage of file metadata created for newly added files to Version. If such file metadata's memory exceeds the avaible space left in the block +cache at some point (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` = true), creation will fail with `Status::MemoryLimit()`. To opt in this feature, set `BlockBasedTableOptions::cache_usage_options.options_overrides.insert({CacheEntryRole::kFileMetadata, {/*.charged = */CacheEntryRoleOptions::Decision::kEnabled}})`. ### Public API changes * Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions. diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc index 5b49df457..8b373c416 100644 --- a/cache/cache_entry_roles.cc +++ b/cache/cache_entry_roles.cc @@ -22,6 +22,7 @@ std::array kCacheEntryRoleToCamelString{{ "CompressionDictionaryBuildingBuffer", "FilterConstruction", "BlockBasedTableReader", + "FileMetadata", "Misc", }}; @@ -36,6 +37,7 @@ std::array kCacheEntryRoleToHyphenString{{ "compression-dictionary-building-buffer", "filter-construction", "block-based-table-reader", + "file-metadata", "misc", }}; diff --git a/cache/cache_reservation_manager.cc b/cache/cache_reservation_manager.cc index 3cc149b43..fb4f2ad50 100644 --- a/cache/cache_reservation_manager.cc +++ b/cache/cache_reservation_manager.cc @@ -180,4 +180,5 @@ template class CacheReservationManagerImpl< template class CacheReservationManagerImpl; template class CacheReservationManagerImpl; template class CacheReservationManagerImpl; +template class CacheReservationManagerImpl; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h index fd003ddc5..8b30d92a8 100644 --- a/cache/cache_reservation_manager.h +++ b/cache/cache_reservation_manager.h @@ -36,6 +36,8 @@ class CacheReservationManager { }; virtual ~CacheReservationManager() {} virtual Status UpdateCacheReservation(std::size_t new_memory_used) = 0; + virtual Status UpdateCacheReservation(std::size_t memory_used_delta, + bool increase) = 0; virtual Status MakeCacheReservation( std::size_t incremental_memory_used, std::unique_ptr @@ -128,6 +130,11 @@ class CacheReservationManagerImpl // On keeping dummy entries the same, it always returns Status::OK(). Status UpdateCacheReservation(std::size_t new_memory_used) override; + Status UpdateCacheReservation(std::size_t /* memory_used_delta */, + bool /* increase */) override { + return Status::NotSupported(); + } + // One of the two ways of reserving cache space and releasing is done through // destruction of CacheReservationHandle. // See UpdateCacheReservation() for the other way. @@ -254,6 +261,23 @@ class ConcurrentCacheReservationManager std::lock_guard lock(cache_res_mgr_mu_); return cache_res_mgr_->UpdateCacheReservation(new_memory_used); } + + inline Status UpdateCacheReservation(std::size_t memory_used_delta, + bool increase) override { + std::lock_guard lock(cache_res_mgr_mu_); + std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed(); + Status s; + if (!increase) { + assert(total_mem_used >= memory_used_delta); + s = cache_res_mgr_->UpdateCacheReservation(total_mem_used - + memory_used_delta); + } else { + s = cache_res_mgr_->UpdateCacheReservation(total_mem_used + + memory_used_delta); + } + return s; + } + inline Status MakeCacheReservation( std::size_t incremental_memory_used, std::unique_ptr *handle) diff --git a/db/column_family.cc b/db/column_family.cc index 3eb4aab8e..042ba300c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -613,6 +613,23 @@ ColumnFamilyData::ColumnFamilyData( } RecalculateWriteStallConditions(mutable_cf_options_); + + if (cf_options.table_factory->IsInstanceOf( + TableFactory::kBlockBasedTableName()) && + cf_options.table_factory->GetOptions()) { + const BlockBasedTableOptions* bbto = + cf_options.table_factory->GetOptions(); + const auto& options_overrides = bbto->cache_usage_options.options_overrides; + const auto file_metadata_charged = + options_overrides.at(CacheEntryRole::kFileMetadata).charged; + if (bbto->block_cache && + file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) { + file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( + std::make_shared< + CacheReservationManagerImpl>( + bbto->block_cache))); + } + } } // DB mutex held diff --git a/db/column_family.h b/db/column_family.h index c37430366..5e9045d3b 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -14,6 +14,7 @@ #include #include +#include "cache/cache_reservation_manager.h" #include "db/memtable_list.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" @@ -520,6 +521,10 @@ class ColumnFamilyData { ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } + std::shared_ptr + GetFileMetadataCacheReservationManager() { + return file_metadata_cache_res_mgr_; + } static const uint32_t kDummyColumnFamilyDataId; @@ -618,6 +623,8 @@ class ColumnFamilyData { bool db_paths_registered_; std::string full_history_ts_low_; + + std::shared_ptr file_metadata_cache_res_mgr_; }; // ColumnFamilySet has interesting thread-safety requirements diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 47bfd8421..d05293007 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -1740,5 +1740,6 @@ template class TargetCacheChargeTrackingCache< CacheEntryRole::kFilterConstruction>; template class TargetCacheChargeTrackingCache< CacheEntryRole::kBlockBasedTableReader>; +template class TargetCacheChargeTrackingCache; } // namespace ROCKSDB_NAMESPACE diff --git a/db/dbformat.h b/db/dbformat.h index 670c188c7..4fb4c25ea 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -320,7 +320,7 @@ class InternalKey { } Slice user_key() const { return ExtractUserKey(rep_); } - size_t size() { return rep_.size(); } + size_t size() const { return rep_.size(); } void Set(const Slice& _user_key, SequenceNumber s, ValueType t) { SetFrom(ParsedInternalKey(_user_key, s, t)); diff --git a/db/version_builder.cc b/db/version_builder.cc index b785adfdd..ed340e602 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -23,6 +23,7 @@ #include #include +#include "cache/cache_reservation_manager.h" #include "db/blob/blob_file_meta.h" #include "db/dbformat.h" #include "db/internal_stats.h" @@ -255,10 +256,13 @@ class VersionBuilder::Rep { // version edits. std::map mutable_blob_file_metas_; + std::shared_ptr file_metadata_cache_res_mgr_; + public: Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions, TableCache* table_cache, VersionStorageInfo* base_vstorage, - VersionSet* version_set) + VersionSet* version_set, + std::shared_ptr file_metadata_cache_res_mgr) : file_options_(file_options), ioptions_(ioptions), table_cache_(table_cache), @@ -266,7 +270,8 @@ class VersionBuilder::Rep { version_set_(version_set), num_levels_(base_vstorage->num_levels()), has_invalid_levels_(false), - level_nonzero_cmp_(base_vstorage_->InternalComparator()) { + level_nonzero_cmp_(base_vstorage_->InternalComparator()), + file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) { assert(ioptions_); levels_ = new LevelState[num_levels_]; @@ -291,6 +296,12 @@ class VersionBuilder::Rep { table_cache_->ReleaseHandle(f->table_reader_handle); f->table_reader_handle = nullptr; } + + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + f->ApproximateMemoryUsage(), false /* increase */); + s.PermitUncheckedError(); + } delete f; } } @@ -763,6 +774,22 @@ class VersionBuilder::Rep { FileMetaData* const f = new FileMetaData(meta); f->refs = 1; + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + f->ApproximateMemoryUsage(), true /* increase */); + if (!s.ok()) { + delete f; + s = Status::MemoryLimit( + "Can't allocate " + + kCacheEntryRoleToCamelString[static_cast( + CacheEntryRole::kFileMetadata)] + + " due to exceeding the memory limit " + "based on " + "cache capacity"); + return s; + } + } + auto& add_files = level_state.added_files; assert(add_files.find(file_number) == add_files.end()); add_files.emplace(file_number, f); @@ -1239,13 +1266,13 @@ class VersionBuilder::Rep { } }; -VersionBuilder::VersionBuilder(const FileOptions& file_options, - const ImmutableCFOptions* ioptions, - TableCache* table_cache, - VersionStorageInfo* base_vstorage, - VersionSet* version_set) +VersionBuilder::VersionBuilder( + const FileOptions& file_options, const ImmutableCFOptions* ioptions, + TableCache* table_cache, VersionStorageInfo* base_vstorage, + VersionSet* version_set, + std::shared_ptr file_metadata_cache_res_mgr) : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage, - version_set)) {} + version_set, file_metadata_cache_res_mgr)) {} VersionBuilder::~VersionBuilder() = default; @@ -1280,7 +1307,8 @@ BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( : version_builder_(new VersionBuilder( cfd->current()->version_set()->file_options(), cfd->ioptions(), cfd->table_cache(), cfd->current()->storage_info(), - cfd->current()->version_set())), + cfd->current()->version_set(), + cfd->GetFileMetadataCacheReservationManager())), version_(cfd->current()) { version_->Ref(); } @@ -1289,7 +1317,8 @@ BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( ColumnFamilyData* cfd, Version* v) : version_builder_(new VersionBuilder( cfd->current()->version_set()->file_options(), cfd->ioptions(), - cfd->table_cache(), v->storage_info(), v->version_set())), + cfd->table_cache(), v->storage_info(), v->version_set(), + cfd->GetFileMetadataCacheReservationManager())), version_(v) { assert(version_ != cfd->current()); } diff --git a/db/version_builder.h b/db/version_builder.h index add1edac9..1c022832a 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -25,6 +25,7 @@ class InternalStats; class Version; class VersionSet; class ColumnFamilyData; +class CacheReservationManager; // A helper class so we can efficiently apply a whole sequence // of edits to a particular state without creating intermediate @@ -33,7 +34,9 @@ class VersionBuilder { public: VersionBuilder(const FileOptions& file_options, const ImmutableCFOptions* ioptions, TableCache* table_cache, - VersionStorageInfo* base_vstorage, VersionSet* version_set); + VersionStorageInfo* base_vstorage, VersionSet* version_set, + std::shared_ptr + file_metadata_cache_res_mgr = nullptr); ~VersionBuilder(); bool CheckConsistencyForNumLevels(); diff --git a/db/version_edit.h b/db/version_edit.h index 6674cc1f9..e35a57c5f 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -19,6 +19,7 @@ #include "db/dbformat.h" #include "db/wal_edit.h" #include "memory/arena.h" +#include "port/malloc.h" #include "rocksdb/advanced_options.h" #include "rocksdb/cache.h" #include "table/table_reader.h" @@ -285,6 +286,19 @@ struct FileMetaData { } return kUnknownFileCreationTime; } + + size_t ApproximateMemoryUsage() const { + size_t usage = 0; +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + usage += smallest.size() + largest.size() + file_checksum.size() + + file_checksum_func_name.size() + min_timestamp.size() + + max_timestamp.size(); + return usage; + } }; // A compressed copy of file meta data that just contain minimum data needed diff --git a/db/version_set.cc b/db/version_set.cc index b21761c5b..57148c912 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -753,7 +753,8 @@ Version::~Version() { uint32_t path_id = f->fd.GetPathId(); assert(path_id < cfd_->ioptions()->cf_paths.size()); vset_->obsolete_files_.push_back( - ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path)); + ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path, + cfd_->GetFileMetadataCacheReservationManager())); } } } diff --git a/db/version_set.h b/db/version_set.h index 5afd1202f..1ff4cbc72 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -921,8 +921,12 @@ struct ObsoleteFileInfo { std::string path; ObsoleteFileInfo() noexcept : metadata(nullptr) {} - ObsoleteFileInfo(FileMetaData* f, const std::string& file_path) - : metadata(f), path(file_path) {} + ObsoleteFileInfo(FileMetaData* f, const std::string& file_path, + std::shared_ptr + file_metadata_cache_res_mgr = nullptr) + : metadata(f), + path(file_path), + file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {} ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; @@ -936,14 +940,24 @@ struct ObsoleteFileInfo { path = std::move(rhs.path); metadata = rhs.metadata; rhs.metadata = nullptr; + file_metadata_cache_res_mgr_ = rhs.file_metadata_cache_res_mgr_; + rhs.file_metadata_cache_res_mgr_ = nullptr; return *this; } void DeleteMetadata() { + if (file_metadata_cache_res_mgr_) { + Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation( + metadata->ApproximateMemoryUsage(), false /* increase */); + s.PermitUncheckedError(); + } delete metadata; metadata = nullptr; } + + private: + std::shared_ptr file_metadata_cache_res_mgr_; }; class ObsoleteBlobFileInfo { diff --git a/db/version_set_test.cc b/db/version_set_test.cc index c48547508..fd788f0b7 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -12,6 +12,7 @@ #include #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/log_writer.h" #include "rocksdb/advanced_options.h" #include "rocksdb/convenience.h" @@ -3445,6 +3446,124 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { } } +class ChargeFileMetadataTest : public DBTestBase { + public: + ChargeFileMetadataTest() + : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {} +}; + +class ChargeFileMetadataTestWithParam + : public ChargeFileMetadataTest, + public testing::WithParamInterface { + public: + ChargeFileMetadataTestWithParam() {} +}; + +#ifndef ROCKSDB_LITE +INSTANTIATE_TEST_CASE_P( + ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam, + ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled, + CacheEntryRoleOptions::Decision::kDisabled)); + +TEST_P(ChargeFileMetadataTestWithParam, Basic) { + Options options; + std::shared_ptr> + file_metadata_charge_only_cache = std::make_shared< + TargetCacheChargeTrackingCache>( + NewLRUCache( + 4 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize(), + 0 /* num_shard_bits */, true /* strict_capacity_limit */)); + BlockBasedTableOptions table_options; + CacheEntryRoleOptions::Decision charge_file_metadata = GetParam(); + table_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}}); + table_options.block_cache = file_metadata_charge_only_cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Create 128 file metadata, each of which is roughly 1024 bytes. + // This results in 1 * + // CacheReservationManagerImpl::GetDummyEntrySize() + // cache reservation for file metadata. + for (int i = 1; i <= 128; ++i) { + ASSERT_OK(Put(std::string(1024, 'a'), "va")); + ASSERT_OK(Put("b", "vb")); + ASSERT_OK(Flush()); + } + if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), + 1 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize()); + + } else { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0); + } + + // Create another 128 file metadata. + // This increases the file metadata cache reservation to 2 * + // CacheReservationManagerImpl::GetDummyEntrySize(). + for (int i = 1; i <= 128; ++i) { + ASSERT_OK(Put(std::string(1024, 'a'), "vva")); + ASSERT_OK(Put("b", "vvb")); + ASSERT_OK(Flush()); + } + if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), + 2 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize()); + } else { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0); + } + // Compaction will create 1 new file metadata, obsolete and delete all 256 + // file metadata above. This results in 1 * + // CacheReservationManagerImpl::GetDummyEntrySize() + // cache reservation for file metadata. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), + 1 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize()); + } else { + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0); + } + + // Destroying the db will delete the remaining 1 new file metadata + // This results in no cache reservation for file metadata. + Destroy(options); + EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), + 0 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize()); + + // Reopen the db with a smaller cache in order to test failure in allocating + // file metadata based on memory limit based on cache capacity + file_metadata_charge_only_cache = std::make_shared< + TargetCacheChargeTrackingCache>( + NewLRUCache(1 * CacheReservationManagerImpl< + CacheEntryRole::kFileMetadata>::GetDummyEntrySize(), + 0 /* num_shard_bits */, true /* strict_capacity_limit */)); + table_options.block_cache = file_metadata_charge_only_cache; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_OK(Put(std::string(1024, 'a'), "va")); + ASSERT_OK(Put("b", "vb")); + Status s = Flush(); + if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) { + EXPECT_TRUE(s.IsMemoryLimit()); + EXPECT_TRUE(s.ToString().find( + kCacheEntryRoleToCamelString[static_cast( + CacheEntryRole::kFileMetadata)]) != std::string::npos); + EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") != + std::string::npos); + } else { + EXPECT_TRUE(s.ok()); + } +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 96380e5cd..cd566907d 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -138,6 +138,7 @@ DECLARE_bool(cache_index_and_filter_blocks); DECLARE_bool(charge_compression_dictionary_building_buffer); DECLARE_bool(charge_filter_construction); DECLARE_bool(charge_table_reader); +DECLARE_bool(charge_file_metadata); DECLARE_int32(top_level_index_pinning); DECLARE_int32(partition_pinning); DECLARE_int32(unpartitioned_pinning); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index c68f14a44..9ce3d177b 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -319,6 +319,11 @@ DEFINE_bool(charge_table_reader, false, "CacheEntryRoleOptions::charged of" "CacheEntryRole::kBlockBasedTableReader"); +DEFINE_bool(charge_file_metadata, false, + "Setting for " + "CacheEntryRoleOptions::charged of" + "kFileMetadata"); + DEFINE_int32( top_level_index_pinning, static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index d66469eda..ebf70355d 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2354,6 +2354,11 @@ void StressTest::Open(SharedState* shared) { {/*.charged = */ FLAGS_charge_table_reader ? CacheEntryRoleOptions::Decision::kEnabled : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFileMetadata, + {/*.charged = */ FLAGS_charge_file_metadata + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); block_based_options.format_version = static_cast(FLAGS_format_version); block_based_options.index_block_restart_interval = diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index fe7c3a4b2..c65140e7d 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -9,8 +9,8 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" -#include "utilities/fault_injection_fs.h" #include "rocksdb/utilities/transaction_db.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { class NonBatchedOpsStressTest : public StressTest { diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 9b27a9fb6..593a53eb4 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -570,6 +570,9 @@ enum class CacheEntryRole { // BlockBasedTableReader's charge to account for // its memory usage kBlockBasedTableReader, + // FileMetadata's charge to account for + // its memory usage + kFileMetadata, // Default bucket, for miscellaneous cache entries. Do not use for // entries that could potentially add up to large usage. kMisc, diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index ec6c3cfd4..b08742c9a 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -370,7 +370,20 @@ struct BlockBasedTableOptions { // (iii) Compatible existing behavior: // Same as kDisabled. // - // (d) Other CacheEntryRole + // (d) CacheEntryRole::kFileMetadata + // (i) If kEnabled: + // Charge memory usage of file metadata created + // for newly added files to Version. + // If such file metadata's + // memory exceeds the avaible space left in the block cache at some point + // (i.e, causing a cache full under `LRUCacheOptions::strict_capacity_limit` = + // true), creation will fail with Status::MemoryLimit(). + // (ii) If kDisabled: + // Does not charge the memory usage mentioned above. + // (iii) Compatible existing behavior: + // Same as kDisabled. + // + // (e) Other CacheEntryRole // Not supported. // `Status::kNotSupported` will be returned if // `CacheEntryRoleOptions::charged` is set to {`kEnabled`, `kDisabled`}. diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 9a8a0c384..77484288f 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -5,6 +5,8 @@ package org.rocksdb; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; @@ -14,8 +16,6 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicReference; import org.rocksdb.util.Environment; -import static java.nio.charset.StandardCharsets.UTF_8; - /** * A RocksDB is a persistent ordered map from keys to values. It is safe for * concurrent access from multiple threads without any external synchronization. diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 588c82236..fc8830a9b 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -689,7 +689,7 @@ Status BlockBasedTableFactory::ValidateOptions( static const std::set kMemoryChargingSupported = { CacheEntryRole::kCompressionDictionaryBuildingBuffer, CacheEntryRole::kFilterConstruction, - CacheEntryRole::kBlockBasedTableReader}; + CacheEntryRole::kBlockBasedTableReader, CacheEntryRole::kFileMetadata}; if (options.charged != CacheEntryRoleOptions::Decision::kFallback && kMemoryChargingSupported.count(role) == 0) { return Status::NotSupported( diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 901736b9a..3769f6f52 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1144,6 +1144,11 @@ DEFINE_bool(charge_table_reader, false, "CacheEntryRoleOptions::charged of" "CacheEntryRole::kBlockBasedTableReader"); +DEFINE_bool(charge_file_metadata, false, + "Setting for " + "CacheEntryRoleOptions::charged of" + "CacheEntryRole::kFileMetadata"); + static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { assert(ctype); @@ -4187,6 +4192,11 @@ class Benchmark { {/*.charged = */ FLAGS_charge_table_reader ? CacheEntryRoleOptions::Decision::kEnabled : CacheEntryRoleOptions::Decision::kDisabled}}); + block_based_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFileMetadata, + {/*.charged = */ FLAGS_charge_file_metadata + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); block_based_options.block_cache_compressed = compressed_cache_; block_based_options.block_size = FLAGS_block_size; block_based_options.block_restart_interval = FLAGS_block_restart_interval; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 9c013734f..958fdf929 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -44,6 +44,7 @@ default_params = { "charge_compression_dictionary_building_buffer": lambda: random.choice([0, 1]), "charge_filter_construction": lambda: random.choice([0, 1]), "charge_table_reader": lambda: random.choice([0, 1]), + "charge_file_metadata": lambda: random.choice([0, 1]), "checkpoint_one_in": 1000000, "compression_type": lambda: random.choice( ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]),