From 73c1203af1803a871fc5c3f608bd16145005e555 Mon Sep 17 00:00:00 2001 From: sdong Date: Tue, 26 Nov 2019 21:38:38 -0800 Subject: [PATCH] Support options.max_open_files = -1 with periodic_compaction_seconds (#6090) Summary: options.periodic_compaction_seconds isn't supported when options.max_open_files != -1. This is because the file creation time is stored in table properties, which are not guaranteed to be loaded unless options.max_open_files = -1. Relax this constraint by also storing the file creation time in the manifest. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6090 Test Plan: Pass all existing tests; modify an existing test to force the manifest value to be 0 to simulate the backward-compatibility case; manually open a DB generated with this change using release 4.2. Differential Revision: D18702268 fbshipit-source-id: 13e0bd94f546498a04f3dc5fc0d9dff5125ec9eb --- HISTORY.md | 3 +- db/compaction/compaction_job.cc | 1 + db/compaction/compaction_job_test.cc | 2 +- db/compaction/compaction_picker_test.cc | 2 +- db/db_compaction_test.cc | 151 ++++++++++++++---------- db/db_impl/db_impl_compaction_flush.cc | 5 +- db/db_impl/db_impl_experimental.cc | 2 +- db/db_impl/db_impl_open.cc | 2 +- db/db_test.cc | 29 ++++- db/external_sst_file_ingestion_job.cc | 9 +- db/flush_job.cc | 3 +- db/import_column_family_job.cc | 8 +- db/repair.cc | 14 +-- db/version_builder_test.cc | 47 +++++--- db/version_edit.cc | 16 +++ db/version_edit.h | 40 +++++-- db/version_edit_test.cc | 21 ++-- db/version_set.cc | 29 ++--- db/version_set_test.cc | 7 +- include/rocksdb/metadata.h | 16 ++- 20 files changed, 266 insertions(+), 141 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f4318c300..3f1e7c9ae 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -19,8 +19,9 @@ * A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families. 
* Full and partitioned filters in the block-based table use an improved Bloom filter implementation, enabled with format_version 5 (or above) because previous releases cannot read this filter. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single (full) filter. For example, the new Bloom filter has the same false postive rate at 9.55 bits per key as the old one at 10 bits per key, and a lower false positive rate at 16 bits per key than the old one at 100 bits per key. * Added AVX2 instructions to USE_SSE builds to accelerate the new Bloom filter and XXH3-based hash function on compatible x86_64 platforms (Haswell and later, ~2014). -* Support options.ttl with options.max_open_files = -1. File's oldest ancester time will be written to manifest. If it is availalbe, this information will be used instead of creation_time in table properties. +* Support options.ttl or options.periodic_compaction_seconds with options.max_open_files = -1. File's oldest ancester time and file creation time will be written to manifest. If it is availalbe, this information will be used instead of creation_time and file_creation_time in table properties. * Setting options.ttl for universal compaction now has the same meaning as setting periodic_compaction_seconds. +* SstFileMetaData also returns file creation time and oldest ancester time. * The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0. * The block cache usage is now takes into account the overhead of metadata per each entry. This results into more accurate managment of memory. 
A side-effect of this feature is that less items are fit into the block cache of the same size, which would result to higher cache miss rates. This can be remedied by increasing the block cache size or passing kDontChargeCacheMetadata to its constuctor to restore the old behavior. * When using BlobDB, a mapping is maintained and persisted in the MANIFEST between each SST file and the oldest non-TTL blob file it references. diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 3aec2cf6a..22c504fde 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1501,6 +1501,7 @@ Status CompactionJob::OpenCompactionOutputFile( out.meta.fd = FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); out.meta.oldest_ancester_time = oldest_ancester_time; + out.meta.file_creation_time = current_time; out.finished = false; sub_compact->outputs.push_back(out); } diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 0b5707a34..9fb3f0df5 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -184,7 +184,7 @@ class CompactionJobTest : public testing::Test { VersionEdit edit; edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, smallest_seqno, largest_seqno, false, oldest_blob_file_number, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); mutex_.Lock(); versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index d593d6465..5cb3350d6 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -95,7 +95,7 @@ class CompactionPickerTest : public testing::Test { InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, largest_seq, /* marked_for_compact */ false, 
kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; vstorage_->AddFile(level, f); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 3c2452e4c..84f9f55dd 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -3657,71 +3657,103 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { const int kNumLevelFiles = 2; const int kValueSize = 100; - Options options = CurrentOptions(); - options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days - options.max_open_files = -1; // needed for ttl compaction - env_->time_elapse_only_sleep_ = false; - options.env = env_; + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + if (if_open_all_files) { + options.max_open_files = -1; // needed for ttl compaction + } else { + options.max_open_files = 20; + } + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 0; + }); + // In the case where all files are opened and doing DB restart + // forcing the file creation time in manifest file to be 0 to + // simulate the case of reading from an old version. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); - env_->addon_time_.store(0); - DestroyAndReopen(options); + env_->time_elapse_only_sleep_ = false; + options.env = env_; - int periodic_compactions = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - auto compaction_reason = compaction->compaction_reason(); - if (compaction_reason == CompactionReason::kPeriodicCompaction) { - periodic_compactions++; + env_->addon_time_.store(0); + DestroyAndReopen(options); + + int periodic_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), + RandomString(&rnd, kValueSize))); } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Flush(); + } + dbfull()->TEST_WaitForCompact(); - Random rnd(301); - for (int i = 0; i < kNumLevelFiles; ++i) { - for (int j = 0; j < kNumKeysPerFile; ++j) { - ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 50 hours and do a write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the 
same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,3", FilesPerLevel()); + // The three old files now go through the periodic compaction process. 2 + // + 3. + ASSERT_EQ(5, periodic_compactions); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("c", "3")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,3", FilesPerLevel()); + // The four old files now go through the periodic compaction process. 5 + // + 4. + ASSERT_EQ(9, periodic_compactions); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } - Flush(); } - dbfull()->TEST_WaitForCompact(); - - ASSERT_EQ("2", FilesPerLevel()); - ASSERT_EQ(0, periodic_compactions); - - // Add 50 hours and do a write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); - // Assert that the files stay in the same level - ASSERT_EQ("3", FilesPerLevel()); - // The two old files go through the periodic compaction process - ASSERT_EQ(2, periodic_compactions); - - MoveFilesToLevel(1); - ASSERT_EQ("0,3", FilesPerLevel()); - - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("b", "2")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("1,3", FilesPerLevel()); - // The three old files now go through the periodic compaction process. 2 + 3. 
- ASSERT_EQ(5, periodic_compactions); - - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("2,3", FilesPerLevel()); - // The four old files now go through the periodic compaction process. 5 + 4. - ASSERT_EQ(9, periodic_compactions); - - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { @@ -3734,7 +3766,6 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { const int kValueSize = 100; Options options = CurrentOptions(); - options.max_open_files = -1; // needed for ttl compaction env_->time_elapse_only_sleep_ = false; options.env = env_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 12a8a2aab..b01fdbc96 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1257,7 +1257,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time); + f->oldest_ancester_time, f->file_creation_time); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -2672,7 +2672,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, - f->oldest_blob_file_number, f->oldest_ancester_time); + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 4fe5409ae..9a6e85ea6 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -129,7 +129,7 @@ 
Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time); + f->oldest_ancester_time, f->file_creation_time); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index dddbcc262..9ca0a940c 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1226,7 +1226,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.fd.GetFileSize(), meta.smallest, meta.largest, meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.marked_for_compaction, meta.oldest_blob_file_number, - meta.oldest_ancester_time); + meta.oldest_ancester_time, meta.file_creation_time); } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); diff --git a/db/db_test.cc b/db/db_test.cc index 6ea5e9e00..16d1a4dee 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1022,7 +1022,8 @@ TEST_F(DBTest, FailMoreDbPaths) { void CheckColumnFamilyMeta( const ColumnFamilyMetaData& cf_meta, - const std::vector>& files_by_level) { + const std::vector>& files_by_level, + uint64_t start_time, uint64_t end_time) { ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); @@ -1060,6 +1061,14 @@ void CheckColumnFamilyMeta( file_meta_from_files.largest.user_key().ToString()); ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number, file_meta_from_files.oldest_blob_file_number); + ASSERT_EQ(file_meta_from_cf.oldest_ancester_time, + file_meta_from_files.oldest_ancester_time); + ASSERT_EQ(file_meta_from_cf.file_creation_time, + file_meta_from_files.file_creation_time); + ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); + ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_GE(file_meta_from_cf.oldest_ancester_time, 
start_time); + ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); } ASSERT_EQ(level_meta_from_cf.size, level_size); @@ -1113,6 +1122,11 @@ TEST_F(DBTest, MetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; options.disable_auto_compactions = true; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time); + uint64_t start_time = static_cast(temp_time); + DestroyAndReopen(options); Random rnd(301); @@ -1139,9 +1153,12 @@ TEST_F(DBTest, MetaDataTest) { std::vector> files_by_level; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + options.env->GetCurrentTime(&temp_time); + uint64_t end_time = static_cast(temp_time); + ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta, files_by_level); + CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); std::vector live_file_meta; db_->GetLiveFilesMetaData(&live_file_meta); @@ -6420,6 +6437,12 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { } } }); + // Set file creation time in manifest all to 0. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FileMetaData::FileMetaData", [&](void* arg) { + FileMetaData* meta = static_cast(arg); + meta->file_creation_time = 0; + }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Random rnd(301); @@ -6431,7 +6454,7 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { Flush(); } - // At this point there should be 2 files, oen with file_creation_time = 0 and + // At this point there should be 2 files, one with file_creation_time = 0 and // the other non-zero. GetCreationTimeOfOldestFile API should return 0. 
uint64_t creation_time; Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time); diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 08446c7f5..fd79ff1d0 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -246,16 +246,19 @@ Status ExternalSstFileIngestionJob::Run() { // We use the import time as the ancester time. This is the time the data // is written to the database. - uint64_t oldest_ancester_time = 0; int64_t temp_current_time = 0; + uint64_t current_time = kUnknownFileCreationTime; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; if (env_->GetCurrentTime(&temp_current_time).ok()) { - oldest_ancester_time = static_cast(temp_current_time); + current_time = oldest_ancester_time = + static_cast(temp_current_time); } edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, - false, kInvalidBlobFileNumber, oldest_ancester_time); + false, kInvalidBlobFileNumber, oldest_ancester_time, + current_time); } return status; } diff --git a/db/flush_job.cc b/db/flush_job.cc index dcbc33c37..bdb4c179b 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -368,6 +368,7 @@ Status FlushJob::WriteLevel0Table() { // It's not clear whether oldest_key_time is always available. In case // it is not available, use current_time. 
meta_.oldest_ancester_time = std::min(current_time, oldest_key_time); + meta_.file_creation_time = current_time; s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, @@ -413,7 +414,7 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction, meta_.oldest_blob_file_number, - meta_.oldest_ancester_time); + meta_.oldest_ancester_time, meta_.file_creation_time); } #ifndef ROCKSDB_LITE // Piggyback FlushJobInfo on the first first flushed memtable. diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 264075a9d..f52418a07 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -135,10 +135,12 @@ Status ImportColumnFamilyJob::Run() { // We use the import time as the ancester time. This is the time the data // is written to the database. - uint64_t oldest_ancester_time = 0; int64_t temp_current_time = 0; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + uint64_t current_time = kUnknownOldestAncesterTime; if (env_->GetCurrentTime(&temp_current_time).ok()) { - oldest_ancester_time = static_cast(temp_current_time); + current_time = oldest_ancester_time = + static_cast(temp_current_time); } for (size_t i = 0; i < files_to_import_.size(); ++i) { @@ -149,7 +151,7 @@ Status ImportColumnFamilyJob::Run() { f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, file_metadata.smallest_seqno, file_metadata.largest_seqno, false, kInvalidBlobFileNumber, - oldest_ancester_time); + oldest_ancester_time, current_time); // If incoming sequence number is higher, update local sequence number. 
if (file_metadata.largest_seqno > versions_->LastSequence()) { diff --git a/db/repair.cc b/db/repair.cc index baed9ead1..b71f725a2 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -577,13 +577,13 @@ class Repairer { // TODO(opt): separate out into multiple levels for (const auto* table : cf_id_and_tables.second) { - edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), - table->meta.fd.GetFileSize(), table->meta.smallest, - table->meta.largest, table->meta.fd.smallest_seqno, - table->meta.fd.largest_seqno, - table->meta.marked_for_compaction, - table->meta.oldest_blob_file_number, - table->meta.oldest_ancester_time); + edit.AddFile( + 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->meta.fd.smallest_seqno, + table->meta.fd.largest_seqno, table->meta.marked_for_compaction, + table->meta.oldest_blob_file_number, + table->meta.oldest_ancester_time, table->meta.file_creation_time); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 4c88983ba..64d2d2481 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -63,7 +63,7 @@ class VersionBuilderTest : public testing::Test { file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -114,7 +114,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, 
kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -149,7 +150,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { VersionEdit version_edit; version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -187,7 +189,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { VersionEdit version_edit; version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -216,19 +219,24 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + 
kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); EnvOptions env_options; @@ -255,30 +263,37 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit); VersionEdit version_edit2; version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, 
GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit2); version_builder.SaveTo(&new_vstorage); diff --git a/db/version_edit.cc b/db/version_edit.cc index 564cec3c1..dc1d821d9 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -62,6 +62,7 @@ enum CustomTag : uint32_t { kMinLogNumberToKeepHack = 3, kOldestBlobFileNumber = 4, kOldestAncesterTime = 5, + kFileCreationTime = 6, kPathId = 65, }; // If this bit for the custom tag is set, opening DB should fail if @@ -217,6 +218,14 @@ bool VersionEdit::EncodeTo(std::string* dst) const { TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", &varint_oldest_ancester_time); PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); + + PutVarint32(dst, CustomTag::kFileCreationTime); + std::string varint_file_creation_time; + PutVarint64(&varint_file_creation_time, f.file_creation_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", + &varint_file_creation_time); + PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + if (f.fd.GetPathId() != 0) { PutVarint32(dst, CustomTag::kPathId); char p = static_cast(f.fd.GetPathId()); @@ -335,6 +344,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid oldest ancester time"; } break; + case kFileCreationTime: + if (!GetVarint64(&field, &f.file_creation_time)) { + return "invalid file creation time"; + } + break; case kNeedCompaction: if (field.size() != 1) { return "need_compaction field wrong size"; @@ -660,6 +674,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { } r.append(" oldest_ancester_time:"); AppendNumberTo(&r, f.oldest_ancester_time); + r.append(" file_creation_time:"); + AppendNumberTo(&r, f.file_creation_time); } r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); 
diff --git a/db/version_edit.h b/db/version_edit.h index d3664fd39..5815d18dc 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -26,6 +26,7 @@ class VersionSet; constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; constexpr uint64_t kInvalidBlobFileNumber = 0; constexpr uint64_t kUnknownOldestAncesterTime = 0; +constexpr uint64_t kUnknownFileCreationTime = 0; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); @@ -128,7 +129,10 @@ struct FileMetaData { // in turn be outputs for compact older SST files. We track the memtable // flush timestamp for the oldest SST file that eventaully contribute data // to this file. 0 means the information is not available. - uint64_t oldest_ancester_time = 0; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + + // Unix time when the SST file is created. + uint64_t file_creation_time = kUnknownFileCreationTime; FileMetaData() = default; @@ -136,13 +140,17 @@ struct FileMetaData { const InternalKey& smallest_key, const InternalKey& largest_key, const SequenceNumber& smallest_seq, const SequenceNumber& largest_seq, bool marked_for_compact, - uint64_t oldest_blob_file, uint64_t _oldest_ancester_time) + uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, + uint64_t _file_creation_time) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), marked_for_compaction(marked_for_compact), oldest_blob_file_number(oldest_blob_file), - oldest_ancester_time(_oldest_ancester_time) {} + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) { + TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); + } // REQUIRED: Keys must be given to the function in sorted order (it expects // the last key to be the largest). @@ -168,13 +176,23 @@ struct FileMetaData { // if table reader is already pinned. // 0 means the information is not available. 
uint64_t TryGetOldestAncesterTime() { - if (oldest_ancester_time != 0) { + if (oldest_ancester_time != kUnknownOldestAncesterTime) { return oldest_ancester_time; } else if (fd.table_reader != nullptr && fd.table_reader->GetTableProperties() != nullptr) { return fd.table_reader->GetTableProperties()->creation_time; } - return 0; + return kUnknownOldestAncesterTime; + } + + uint64_t TryGetFileCreationTime() { + if (file_creation_time != kUnknownFileCreationTime) { + return file_creation_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->file_creation_time; + } + return kUnknownFileCreationTime; } }; @@ -277,14 +295,14 @@ class VersionEdit { uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno, bool marked_for_compaction, - uint64_t oldest_blob_file_number, - uint64_t oldest_ancester_time) { + uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, + uint64_t file_creation_time) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( - level, - FileMetaData(file, file_path_id, file_size, smallest, largest, - smallest_seqno, largest_seqno, marked_for_compaction, - oldest_blob_file_number, oldest_ancester_time)); + level, FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, + marked_for_compaction, oldest_blob_file_number, + oldest_ancester_time, file_creation_time)); } void AddFile(int level, const FileMetaData& f) { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index f110694d8..8a4c1380c 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -37,7 +37,7 @@ TEST_F(VersionEditTest, EncodeDecode) { InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, - 888); + 888, 678); 
edit.DeleteFile(4, kBig + 700 + i); } @@ -55,17 +55,19 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, - kBig + 602, true, kInvalidBlobFileNumber, 666); + kBig + 602, true, kInvalidBlobFileNumber, 666, 888); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, - kBig + 603, true, 1001, kUnknownOldestAncesterTime); + kBig + 603, true, 1001, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); + ; edit.DeleteFile(4, 700); @@ -104,10 +106,10 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false, kInvalidBlobFileNumber, 686); + kBig + 601, false, kInvalidBlobFileNumber, 686, 868); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -154,7 +156,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), 
kBig + 500, kBig + 600, true, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -182,7 +184,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index 2393503d5..444996e40 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1459,7 +1459,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, file->oldest_blob_file_number}); + file->being_compacted, file->oldest_blob_file_number, + file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime()}); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -1485,10 +1486,9 @@ void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { for (FileMetaData* meta : storage_info_.LevelFiles(level)) { assert(meta->fd.table_reader != nullptr); - uint64_t file_creation_time = - meta->fd.table_reader->GetTableProperties()->file_creation_time; - if (file_creation_time == 0) { - *creation_time = file_creation_time; + uint64_t file_creation_time = meta->TryGetFileCreationTime(); + if (file_creation_time == kUnknownFileCreationTime) { + *creation_time = 0; return; } if (file_creation_time < oldest_time) { @@ -2501,8 +2501,7 @@ void 
VersionStorageInfo::ComputeExpiredTtlFiles( void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( const ImmutableCFOptions& ioptions, const uint64_t periodic_compaction_seconds) { - assert(periodic_compaction_seconds > 0 && - periodic_compaction_seconds < port::kMaxUint64); + assert(periodic_compaction_seconds > 0); files_marked_for_periodic_compaction_.clear(); @@ -2513,8 +2512,8 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( } const uint64_t current_time = static_cast<uint64_t>(temp_current_time); - // If periodic_compaction_seconds > current_time, no file possibly qualifies - // periodic compaction. + // If periodic_compaction_seconds is larger than current time, periodic + // compaction can't possibly be triggered. if (periodic_compaction_seconds > current_time) { return; } @@ -2524,20 +2523,18 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( for (int level = 0; level < num_levels(); level++) { for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { + if (!f->being_compacted) { // Compute a file's modification time in the following order: // 1. Use file_creation_time table property if it is > 0. // 2. Use creation_time table property if it is > 0. // 3. Use file's mtime metadata if the above two table properties are 0. // Don't consider the file at all if the modification time cannot be // correctly determined based on the above conditions. 
- uint64_t file_modification_time = - f->fd.table_reader->GetTableProperties()->file_creation_time; - if (file_modification_time == 0) { + uint64_t file_modification_time = f->TryGetFileCreationTime(); + if (file_modification_time == kUnknownFileCreationTime) { file_modification_time = f->TryGetOldestAncesterTime(); } - if (file_modification_time == 0) { + if (file_modification_time == kUnknownOldestAncesterTime) { auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(), f->fd.GetPathId()); status = ioptions.env->GetFileModificationTime( @@ -4980,7 +4977,7 @@ Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) { f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->oldest_blob_file_number, - f->oldest_ancester_time); + f->oldest_ancester_time, f->file_creation_time); } } edit.SetLogNumber(cfd->GetLogNumber()); diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 363b337bc..66ad930f5 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -40,7 +40,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { InternalKey(smallest, smallest_seq, kTypeValue), InternalKey(largest, largest_seq, kTypeValue), smallest_seq, largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); files_.push_back(f); } @@ -135,7 +135,7 @@ class VersionStorageInfoTest : public testing::Test { file_number, 0, file_size, GetInternalKey(smallest, 0), GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime); + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } @@ -146,7 +146,8 @@ class VersionStorageInfoTest : public testing::Test { FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, 
largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime); + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 52b5657c3..fecee8430 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -69,7 +69,8 @@ struct SstFileMetaData { SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number) + bool _being_compacted, uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time) : size(_size), name(_file_name), file_number(_file_number), @@ -82,7 +83,9 @@ struct SstFileMetaData { being_compacted(_being_compacted), num_entries(0), num_deletions(0), - oldest_blob_file_number(_oldest_blob_file_number) {} + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) {} // File size in bytes. size_t size; @@ -105,6 +108,15 @@ struct SstFileMetaData { uint64_t oldest_blob_file_number; // The id of the oldest blob file // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. This is the creation time of + // the oldest SST file that is a compaction ancestor of this file. + // The timestamp is provided by Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t oldest_ancester_time; + // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t file_creation_time; }; // The full set of metadata associated with each SST file.