diff --git a/HISTORY.md b/HISTORY.md index cb4f18e79..4583e5033 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,7 @@ ### New Features * DB::GetLiveFilesStorageInfo is ready for production use. * Add new stats PREFETCHED_BYTES_DISCARDED which records number of prefetched bytes discarded by RocksDB FilePrefetchBuffer on destruction and POLL_WAIT_MICROS records wait time for FS::Poll API completion. +* Start tracking SST unique id in MANIFEST, which will be used to verify with SST properties during DB open to make sure the SST file is not overwritten or misplaced. A db option `verify_sst_unique_id_in_manifest` is introduced to enable/disable the verification, it's disabled by default for now, but plan to be enabled by default later. ### Public API changes * Add rollback_deletion_type_callback to TransactionDBOptions so that write-prepared transactions know whether to issue a Delete or SingleDelete to cancel a previous key written during prior prepare phase. The PR aims to prevent mixing SingleDeletes and Deletes for the same key that can lead to undefined behaviors for write-prepared transactions. diff --git a/db/builder.cc b/db/builder.cc index 9af93a736..df3e9fcc5 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -37,6 +37,7 @@ #include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -310,6 +311,11 @@ Status BuildTable( meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName(); file_checksum = meta->file_checksum; file_checksum_func_name = meta->file_checksum_func_name; + // Set unique_id + if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) { + s = GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id, + meta->fd.GetNumber(), &meta->unique_id); + } } if (s.ok()) { diff --git a/db/column_family.cc b/db/column_family.cc index 3eb4aab8e..da405a268 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1060,6 +1060,10 @@ uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { return current_->GetSstFilesSize(); } +Status ColumnFamilyData::TryVerifySstUniqueIds() const { + return current_->TryVerifySstUniqueIds(); +} + MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, diff --git a/db/column_family.h b/db/column_family.h index c37430366..ed74dab59 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -359,6 +359,7 @@ class ColumnFamilyData { uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held + Status TryVerifySstUniqueIds() const; // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; new_mem->SetID(memtable_id); diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index ed4687906..6578917a4 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -64,6 +64,7 @@ #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/hash.h" @@ -1047,6 +1048,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( const Compaction* compaction = sub_compact->compaction; CompactionServiceInput compaction_input; compaction_input.output_level = compaction->output_level(); + compaction_input.db_id = db_id_; const std::vector& inputs = *(compact_->compaction->inputs()); @@ -1208,6 +1210,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( meta.oldest_ancester_time = file.oldest_ancester_time; meta.file_creation_time = file.file_creation_time; meta.marked_for_compaction = file.marked_for_compaction; + meta.unique_id = file.unique_id; auto cfd = compaction->column_family_data(); sub_compact->outputs.emplace_back(std::move(meta), @@ -2277,6 +2280,18 @@ Status CompactionJob::OpenCompactionOutputFile( meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; meta.temperature = temperature; + assert(!db_id_.empty()); + assert(!db_session_id_.empty()); + s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(), + &meta.unique_id); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "[%s] [JOB %d] file #%" PRIu64 + " failed to generate unique id: %s.", + cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(), + s.ToString().c_str()); + return s; + } sub_compact->outputs.emplace_back( std::move(meta), cfd->internal_comparator(), /*enable_order_check=*/ @@ -2596,7 +2611,7 @@ Status CompactionServiceCompactionJob::Run() { meta.fd.largest_seqno, meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(), meta.oldest_ancester_time, meta.file_creation_time, output_file.validator.GetHash(), - meta.marked_for_compaction); + meta.marked_for_compaction, meta.unique_id); } compaction_result_->num_output_records = sub_compact->num_output_records; compaction_result_->total_bytes = sub_compact->total_bytes; @@ -2700,6 +2715,9 @@ static std::unordered_map cs_input_type_info = { {"output_level", {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"db_id", + {offsetof(struct CompactionServiceInput, db_id), + OptionType::kEncodedString}}, {"has_begin", {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, @@ -2757,6 +2775,11 @@ static std::unordered_map {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"unique_id", + OptionTypeInfo::Array( + offsetof(struct CompactionServiceOutputFile, unique_id), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, }; static std::unordered_map diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 24a77c679..1c50be682 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -253,6 +253,9 @@ struct CompactionServiceInput { std::vector input_files; int output_level; + // db_id which will be used to generate unique id of sst + std::string db_id; + // information for subcompaction bool has_begin = false; std::string begin; @@ -284,13 +287,15 @@ struct CompactionServiceOutputFile { uint64_t file_creation_time; uint64_t paranoid_hash; bool marked_for_compaction; + UniqueId64x2 unique_id; CompactionServiceOutputFile() = default; CompactionServiceOutputFile( const std::string& name, SequenceNumber smallest, SequenceNumber largest, std::string _smallest_internal_key, std::string _largest_internal_key, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - uint64_t _paranoid_hash, bool _marked_for_compaction) + uint64_t _paranoid_hash, bool _marked_for_compaction, + UniqueId64x2 _unique_id) : file_name(name), smallest_seqno(smallest), largest_seqno(largest), @@ -299,7 +304,8 @@ struct CompactionServiceOutputFile { oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), paranoid_hash(_paranoid_hash), - marked_for_compaction(_marked_for_compaction) {} + marked_for_compaction(_marked_for_compaction), + unique_id(std::move(_unique_id)) {} }; // CompactionServiceResult contains the compaction result from a different db diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index f9a2a2855..caf955a61 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -206,7 +206,7 @@ class CompactionJobTestBase : public testing::Test { oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); mutex_.Lock(); EXPECT_OK( @@ -359,8 +359,8 @@ class CompactionJobTestBase : public testing::Test { table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, /*manual_compaction_paused=*/nullptr, - /*manual_compaction_canceled=*/nullptr, /*db_id=*/"", - /*db_session_id=*/"", full_history_ts_low_); + /*manual_compaction_canceled=*/nullptr, env_->GenerateUniqueId(), + DBImpl::GenerateDbSessionId(nullptr), full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -1227,13 +1227,14 @@ TEST_F(CompactionJobTest, ResultSerialization) { result.status = status_list.at(rnd.Uniform(static_cast(status_list.size()))); while (!rnd.OneIn(10)) { + UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)}; result.output_files.emplace_back( rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), - rnd64.Uniform(UINT64_MAX), rnd.OneIn(2)); + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); } result.output_level = rnd.Uniform(10); result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); @@ -1261,6 +1262,15 @@ TEST_F(CompactionJobTest, ResultSerialization) { ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); ASSERT_EQ(mismatch, "stats.num_input_files"); + // Test unique id mismatch + if (!result.output_files.empty()) { + CompactionServiceResult deserialized_tmp; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp)); + deserialized_tmp.output_files[0].unique_id[0] += 1; + ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "output_files.unique_id"); + } + // Test unknown field CompactionServiceResult deserialized2; output.clear(); diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 02f704088..9be3f06e8 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -115,7 +115,7 @@ class CompactionPickerTest : public testing::Test { largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index 6fff7aa7a..cbb48fdbc 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -274,6 +274,16 @@ TEST_F(CompactionServiceTest, BasicCompactions) { auto s = static_cast(status); *s = Status::Aborted("MyTestCompactionService failed to compact!"); }); + + // tracking success unique id verification + std::atomic_int verify_passed{0}; + SyncPoint::GetInstance()->SetCallBack( + "Version::TryVerifySstUniqueIds::Passed", [&](void* arg) { + // override job status + auto id = static_cast(arg); + assert(!id->empty()); + verify_passed++; + }); SyncPoint::GetInstance()->EnableProcessing(); Status s; @@ -298,6 +308,12 @@ TEST_F(CompactionServiceTest, BasicCompactions) { } } ASSERT_TRUE(s.IsAborted()); + + // Test verification + ASSERT_EQ(verify_passed, 0); + options.verify_sst_unique_id_in_manifest = true; + Reopen(options); + ASSERT_GT(verify_passed, 0); } TEST_F(CompactionServiceTest, ManualCompaction) { diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 7ce128b07..dc5dbf0f1 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1651,7 +1651,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); + f->file_checksum_func_name, f->min_timestamp, f->max_timestamp, + f->unique_id); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -3276,7 +3277,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->file_checksum, f->file_checksum_func_name, - f->min_timestamp, f->max_timestamp); + f->min_timestamp, f->max_timestamp, f->unique_id); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 06a51f53a..b6207b654 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -137,7 +137,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->file_checksum, f->file_checksum_func_name, - f->min_timestamp, f->max_timestamp); + f->min_timestamp, f->max_timestamp, f->unique_id); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index f1218d1ae..9542f82ea 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -518,6 +518,23 @@ Status DBImpl::Recover( if (!s.ok()) { return s; } + if (immutable_db_options_.verify_sst_unique_id_in_manifest) { + if (mutable_db_options_.max_open_files != -1) { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "verify_sst_unique_id_in_manifest is set to true, but as " + "max_open_files is " + "not -1, only opened SST during DB-open will be verified."); + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped()) { + s = cfd->TryVerifySstUniqueIds(); + if (!s.ok()) { + return s; + } + } + } + } s = SetDBId(read_only); if (s.ok() && !read_only) { s = DeleteUnreferencedSstFiles(); @@ -1498,13 +1515,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, constexpr int level = 0; if (s.ok() && has_output) { - edit->AddFile( - level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), - meta.smallest, meta.largest, meta.fd.smallest_seqno, - meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, - meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.file_checksum, - meta.file_checksum_func_name, meta.min_timestamp, meta.max_timestamp); + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), + meta.fd.GetFileSize(), meta.smallest, meta.largest, + meta.fd.smallest_seqno, meta.fd.largest_seqno, + meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.file_checksum, + meta.file_checksum_func_name, meta.min_timestamp, + meta.max_timestamp, meta.unique_id); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index fb93a4408..8eddc8787 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -772,12 +772,15 @@ Status DBImplSecondary::CompactWithoutInstallation( const int job_id = next_job_id_.fetch_add(1); + // use primary host's db_id for running the compaction, but db_session_id is + // using the local one. No special reason, only because local db_id is empty. CompactionServiceCompactionJob compaction_job( job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, - options.canceled, db_id_, db_session_id_, secondary_path_, input, result); + options.canceled, input.db_id, db_session_id_, secondary_path_, input, + result); mutex_.Unlock(); s = compaction_job.Run(); diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index c67e86167..3d705dca9 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -181,6 +181,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) { ASSERT_EQ(input.input_files.size(), 3); input.output_level = 1; + ASSERT_OK(db_->GetDbIdentity(input.db_id)); Close(); options.max_open_files = -1; @@ -241,6 +242,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { input1.input_files.push_back(meta.levels[1].files[2].name); input1.output_level = 1; + ASSERT_OK(db_->GetDbIdentity(input1.db_id)); options.max_open_files = -1; Close(); @@ -261,6 +263,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { } input2.output_level = 2; + input2.db_id = input1.db_id; ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation( OpenAndCompactOptions(), cfh, input2, &result)); ASSERT_OK(result.status); @@ -305,6 +308,7 @@ TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) { ASSERT_EQ(input.input_files.size(), 3); input.output_level = 1; + ASSERT_OK(db_->GetDbIdentity(input.db_id)); // trigger compaction to delete the files for secondary instance compaction ASSERT_OK(Put("foo", "foo_value" + std::to_string(3))); @@ -346,6 +350,7 @@ TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) { ASSERT_EQ(input.input_files.size(), 3); input.output_level = 1; + ASSERT_OK(db_->GetDbIdentity(input.db_id)); Close(); diff --git a/db/db_test2.cc b/db/db_test2.cc index 8cde53f9c..a0bf9aac9 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -7145,6 +7145,113 @@ TEST_F(DBTest2, RenameDirectory) { dbname_ = old_dbname; } +TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) { + const int kNumSst = 3; + const int kLevel0Trigger = 4; + auto options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kLevel0Trigger; + options.statistics = CreateDBStatistics(); + + // Existing manifest doesn't have unique id + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:UniqueId", [&](void* arg) { + auto unique_id = static_cast(arg); + // remove id before writing it to manifest + (*unique_id)[0] = 0; + (*unique_id)[1] = 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // generate a few SSTs + for (int i = 0; i < kNumSst; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + + // Reopen without verification + Reopen(options); + + // Reopen with verification, but it's skipped because manifest doesn't have id + options.verify_sst_unique_id_in_manifest = true; + Reopen(options); + ASSERT_EQ(options.statistics->getTickerCount( + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP), + kNumSst); + + // test compaction generated Sst + for (int i = kNumSst; i < kLevel0Trigger; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + // Reopen without verification should fail + options.verify_sst_unique_id_in_manifest = true; + ASSERT_OK(options.statistics->Reset()); + Reopen(options); + ASSERT_EQ(options.statistics->getTickerCount( + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP), + 1); +} + +TEST_F(DBTest2, SstUniqueIdVerify) { + const int kNumSst = 3; + const int kLevel0Trigger = 4; + auto options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kLevel0Trigger; + + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) { + auto props = static_cast(props_vs); + // update table property session_id to a different one + props->db_session_id = DBImpl::GenerateDbSessionId(nullptr); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // generate a few SSTs + for (int i = 0; i < kNumSst; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + + // Reopen with verification should report corruption + options.verify_sst_unique_id_in_manifest = true; + auto s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); + + // Reopen without verification should be fine + options.verify_sst_unique_id_in_manifest = false; + Reopen(options); + + // test compaction generated Sst + for (int i = kNumSst; i < kLevel0Trigger; i++) { + for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(i * 10 + j), "value")); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + // Reopen without verification should fail + options.verify_sst_unique_id_in_manifest = true; + s = TryReopen(options); + ASSERT_TRUE(s.IsCorruption()); +} + #ifndef ROCKSDB_LITE TEST_F(DBTest2, GetLatestSeqAndTsForKey) { Destroy(last_options_); diff --git a/db/experimental.cc b/db/experimental.cc index e2917a443..d0de8a40f 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -112,7 +112,7 @@ Status UpdateManifestForFilesState( lf->oldest_blob_file_number, lf->oldest_ancester_time, lf->file_creation_time, lf->file_checksum, lf->file_checksum_func_name, - lf->min_timestamp, lf->max_timestamp); + lf->min_timestamp, lf->max_timestamp, lf->unique_id); } } } else { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 1ef3fe784..343ecc421 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1866,6 +1866,60 @@ TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) { ASSERT_OK(db_->VerifyChecksum()); } +TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) { + const std::string kPutVal = "put_val"; + const std::string kIngestedVal = "ingested_val"; + + ASSERT_OK(Put("k", kPutVal, WriteOptions())); + ASSERT_OK(Flush()); + + std::string external_file = sst_files_dir_ + "/file_to_ingest.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + auto options = CurrentOptions(); + options.verify_sst_unique_id_in_manifest = true; + Reopen(options); + + // Test ingest file without session_id and db_id (for example generated by an + // older version of sst_writer) + SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) { + auto props = static_cast(props_vs); + // update table property session_id to a different one + props->db_session_id = ""; + props->db_id = ""; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + external_file = sst_files_dir_ + "/file_to_ingest2.sst"; + { + SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()}; + + ASSERT_OK(sst_file_writer.Open(external_file)); + ASSERT_OK(sst_file_writer.Put("k", kIngestedVal)); + ASSERT_OK(sst_file_writer.Finish()); + } + + ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file}, + IngestExternalFileOptions())); + + options.statistics = CreateDBStatistics(); + options.verify_sst_unique_id_in_manifest = true; + Reopen(options); + // only one sst file is not verified because of missing unique_id + ASSERT_EQ(options.statistics->getTickerCount( + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP), + 1); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 51b636678..c9e31b747 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -22,6 +22,7 @@ #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -446,8 +447,8 @@ Status ExternalSstFileIngestionJob::Run() { f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, oldest_ancester_time, current_time, f.file_checksum, - f.file_checksum_func_name, kDisableUserTimestamp, - kDisableUserTimestamp); + f.file_checksum_func_name, kDisableUserTimestamp, kDisableUserTimestamp, + f.unique_id); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } @@ -727,6 +728,15 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->table_properties = *props; + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_ingest->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_ingest->internal_file_path.c_str()); + } + return status; } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index ca97fd48f..1f1b341d7 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -70,6 +70,8 @@ struct IngestedFileInfo { std::string file_checksum_func_name; // The temperature of the file to be ingested Temperature file_temperature = Temperature::kUnknown; + // Unique id of the file to be ingested + UniqueId64x2 unique_id; }; class ExternalSstFileIngestionJob { diff --git a/db/flush_job.cc b/db/flush_job.cc index adb98fed5..b71d41e63 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -9,9 +9,8 @@ #include "db/flush_job.h" -#include - #include +#include #include #include "db/builder.h" @@ -988,7 +987,7 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_blob_file_number, meta_.oldest_ancester_time, meta_.file_creation_time, meta_.file_checksum, meta_.file_checksum_func_name, meta_.min_timestamp, - meta_.max_timestamp); + meta_.max_timestamp, meta_.unique_id); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 951fc3819..da8872ea5 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -15,6 +15,7 @@ #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" +#include "table/unique_id_impl.h" #include "util/stop_watch.h" namespace ROCKSDB_NAMESPACE { @@ -156,7 +157,7 @@ Status ImportColumnFamilyJob::Run() { file_metadata.largest_seqno, false, file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, f.unique_id); // If incoming sequence number is higher, update local sequence number. if (file_metadata.largest_seqno > versions_->LastSequence()) { @@ -285,6 +286,15 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( file_to_import->table_properties = *props; + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, + &(file_to_import->unique_id)); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get SST unique id for file %s", + file_to_import->internal_file_path.c_str()); + } + return status; } diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index e459e935e..c78ecfad6 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -130,6 +130,12 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); ASSERT_EQ(value, "V2"); } + EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + + // verify sst unique id during reopen + options.verify_sst_unique_id_in_manifest = true; + ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options); } TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { diff --git a/db/repair.cc b/db/repair.cc index ac237001c..4515be84b 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -82,6 +82,7 @@ #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" +#include "table/unique_id_impl.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -505,6 +506,15 @@ class Repairer { t->meta.fd, &props); } if (status.ok()) { + auto s = + GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &t->meta.unique_id); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Table #%" PRIu64 + ": unable to get unique id, default to Unknown.", + t->meta.fd.GetNumber()); + } t->column_family_id = static_cast(props->column_family_id); if (t->column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) { @@ -639,7 +649,8 @@ class Repairer { table->meta.temperature, table->meta.oldest_blob_file_number, table->meta.oldest_ancester_time, table->meta.file_creation_time, table->meta.file_checksum, table->meta.file_checksum_func_name, - table->meta.min_timestamp, table->meta.max_timestamp); + table->meta.min_timestamp, table->meta.max_timestamp, + table->meta.unique_id); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); diff --git a/db/repair_test.cc b/db/repair_test.cc index eb0fa0446..d1ecbe6c3 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -43,6 +43,23 @@ class RepairTest : public DBTestBase { } return s; } + + void ReopenWithSstIdVerify() { + std::atomic_int verify_passed{0}; + SyncPoint::GetInstance()->SetCallBack( + "Version::TryVerifySstUniqueIds::Passed", [&](void* arg) { + // override job status + auto id = static_cast(arg); + assert(!id->empty()); + verify_passed++; + }); + SyncPoint::GetInstance()->EnableProcessing(); + auto options = CurrentOptions(); + options.verify_sst_unique_id_in_manifest = true; + Reopen(options); + + ASSERT_GT(verify_passed, 0); + } }; TEST_F(RepairTest, LostManifest) { @@ -61,7 +78,7 @@ TEST_F(RepairTest, LostManifest) { ASSERT_OK(env_->FileExists(manifest_path)); ASSERT_OK(env_->DeleteFile(manifest_path)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("key2"), "val2"); @@ -88,7 +105,9 @@ TEST_F(RepairTest, LostManifestMoreDbFeatures) { ASSERT_OK(env_->FileExists(manifest_path)); ASSERT_OK(env_->DeleteFile(manifest_path)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + + // repair from sst should work with unique_id verification + ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("key2"), "NOT_FOUND"); @@ -113,7 +132,8 @@ TEST_F(RepairTest, CorruptManifest) { ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", false /* use_fsync */)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + + ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("key2"), "val2"); @@ -139,7 +159,8 @@ TEST_F(RepairTest, IncompleteManifest) { // Replace the manifest with one that is only aware of the first SST file. CopyFile(orig_manifest_path + ".tmp", new_manifest_path); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + + ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("key2"), "val2"); @@ -157,7 +178,8 @@ TEST_F(RepairTest, PostRepairSstFileNumbering) { ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); + uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); ASSERT_GE(post_repair_file_num, pre_repair_file_num); } @@ -176,7 +198,7 @@ TEST_F(RepairTest, LostSst) { Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); // Exactly one of the key-value pairs should be in the DB now. ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); @@ -198,7 +220,7 @@ TEST_F(RepairTest, CorruptSst) { Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); // Exactly one of the key-value pairs should be in the DB now. ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2")); @@ -226,7 +248,7 @@ TEST_F(RepairTest, UnflushedSst) { ASSERT_OK(env_->FileExists(manifest_path)); ASSERT_OK(env_->DeleteFile(manifest_path)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); @@ -265,7 +287,7 @@ TEST_F(RepairTest, SeparateWalDir) { // make sure that all WALs are converted to SSTables. options.wal_dir = ""; - Reopen(options); + ReopenWithSstIdVerify(); ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); ASSERT_EQ(wal_files.size(), 0); { @@ -398,7 +420,7 @@ TEST_F(RepairTest, DbNameContainsTrailingSlash) { Close(); ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions())); - Reopen(CurrentOptions()); + ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); } #endif // ROCKSDB_LITE diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 6476a3150..27b8107a1 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -72,7 +72,7 @@ class VersionBuilderTest : public testing::Test { oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -134,7 +134,7 @@ class VersionBuilderTest : public testing::Test { Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -180,7 +180,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -224,7 +224,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -271,7 +271,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -308,31 +308,31 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -372,31 +372,31 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; @@ -405,7 +405,7 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), @@ -413,7 +413,7 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -525,7 +525,7 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); ASSERT_OK(builder.Apply(&addition)); @@ -575,7 +575,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -612,7 +612,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); ASSERT_OK(builder.Apply(&edit)); @@ -626,7 +626,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -663,7 +663,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); ASSERT_OK(builder.Apply(&addition)); @@ -1232,7 +1232,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, checksum_value, - checksum_method, kDisableUserTimestamp, kDisableUserTimestamp); + checksum_method, kDisableUserTimestamp, kDisableUserTimestamp, {}); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1320,7 +1320,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1330,7 +1330,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1552,7 +1552,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1563,7 +1563,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); // Delete a file that references a blob file. edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1586,7 +1586,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1598,7 +1598,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. @@ -1612,7 +1612,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); VersionEdit edit2; diff --git a/db/version_edit.cc b/db/version_edit.cc index b9bf9d685..7888f8c5d 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -13,6 +13,7 @@ #include "db/version_set.h" #include "logging/event_logger.h" #include "rocksdb/slice.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" @@ -221,6 +222,14 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); } + UniqueId64x2 unique_id = f.unique_id; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id); + if (!IsUniqueIdUnknown(unique_id)) { + PutVarint32(dst, NewFileCustomTag::kUniqueId); + std::string unique_id_str = EncodeUniqueIdBytes(unique_id); + PutLengthPrefixedSlice(dst, Slice(unique_id_str)); + } + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -392,6 +401,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { case kMaxTimestamp: f.max_timestamp = field.ToString(); break; + case kUniqueId: + if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) { + return "invalid unique id"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -819,6 +833,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { // permanent r.append(std::to_string(static_cast(f.temperature))); } + if (!f.unique_id.empty()) { + r.append(" unique_id(internal): "); + r.append(UniqueIdToHumanString(f.unique_id)); + } } for (const auto& blob_file_addition : blob_file_additions_) { diff --git a/db/version_edit.h b/db/version_edit.h index 6674cc1f9..38e4ad372 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -85,6 +85,7 @@ enum NewFileCustomTag : uint32_t { kTemperature = 9, kMinTimestamp = 10, kMaxTimestamp = 11, + kUniqueId = 12, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -102,6 +103,8 @@ constexpr uint64_t kUnknownFileCreationTime = 0; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); +using UniqueId64x2 = std::array; + // A copyable structure contains information needed to read data from an SST // file. It can contain a pointer to a table reader opened for the file, or // file number and size, which can be used to create a new table reader for it. @@ -217,6 +220,9 @@ struct FileMetaData { // Max (newest) timestamp of keys in this file std::string max_timestamp; + // SST unique id + UniqueId64x2 unique_id{}; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -227,7 +233,8 @@ struct FileMetaData { uint64_t _oldest_ancester_time, uint64_t _file_creation_time, const std::string& _file_checksum, const std::string& _file_checksum_func_name, - std::string _min_timestamp, std::string _max_timestamp) + std::string _min_timestamp, std::string _max_timestamp, + UniqueId64x2 _unique_id) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -239,7 +246,8 @@ struct FileMetaData { file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), min_timestamp(std::move(_min_timestamp)), - max_timestamp(std::move(_max_timestamp)) { + max_timestamp(std::move(_max_timestamp)), + unique_id(std::move(_unique_id)) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -408,7 +416,8 @@ class VersionEdit { const std::string& file_checksum, const std::string& file_checksum_func_name, const std::string& min_timestamp, - const std::string& max_timestamp) { + const std::string& max_timestamp, + const UniqueId64x2& unique_id) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -416,7 +425,7 @@ class VersionEdit { smallest_seqno, largest_seqno, marked_for_compaction, temperature, oldest_blob_file_number, oldest_ancester_time, file_creation_time, file_checksum, file_checksum_func_name, - min_timestamp, max_timestamp)); + min_timestamp, max_timestamp, unique_id)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 66c006161..254f116aa 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -43,7 +43,7 @@ TEST_F(VersionEditTest, EncodeDecode) { InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, kInvalidBlobFileNumber, 888, 678, "234", "crc32c", "123", - "345"); + "345", {}); edit.DeleteFile(4, kBig + 700 + i); } @@ -62,25 +62,25 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", - "234"); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", "234", + {}); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345", - "543"); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "345", "543", + {}); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - "456", "567"); + "456", "567", {}); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678", - "789"); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "678", "789", + {}); ; edit.DeleteFile(4, 700); @@ -129,13 +129,13 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", - "234"); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, "123", "234", + {}); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, 686, 868, "234", "crc32c", kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -188,7 +188,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -219,7 +219,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index b21761c5b..4e76770c3 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -59,6 +59,7 @@ #include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "table/unique_id_impl.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" @@ -1534,6 +1535,37 @@ void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { *creation_time = oldest_time; } +Status Version::TryVerifySstUniqueIds() { + for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { + for (FileMetaData* meta : storage_info_.LevelFiles(level)) { + // if table is not opened or manifest doesn't have unique_id, skip + // verifying + if (meta->fd.table_reader != nullptr && + !IsUniqueIdUnknown(meta->unique_id)) { + auto props = meta->fd.table_reader->GetTableProperties(); + UniqueId64x2 id; + auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id, + props->orig_file_number, &id); + if (!s.ok() || id != meta->unique_id) { + std::ostringstream oss; + oss << "SST #" << meta->fd.GetNumber() << " unique ID mismatch. "; + oss << "Manifest: " << UniqueIdToHumanString(meta->unique_id) << ", "; + if (s.ok()) { + oss << "Table Properties: " << UniqueIdToHumanString(id); + } else { + oss << "Failed to get Table Properties: " << s.ToString(); + } + return Status::Corruption("VersionSet", oss.str()); + } + TEST_SYNC_POINT_CALLBACK("Version::TryVerifySstUniqueIds::Passed", &id); + } else { + RecordTick(db_statistics_, NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP); + } + } + } + return Status::OK(); +} + uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be inaccurate when: // (1) there exist merge keys @@ -5492,13 +5524,14 @@ Status VersionSet::WriteCurrentStateToManifest( for (const auto& f : level_files) { assert(f); - edit.AddFile( - level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), - f->smallest, f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction, f->temperature, - f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->min_timestamp, f->max_timestamp); + edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->file_checksum, + f->file_checksum_func_name, f->min_timestamp, + f->max_timestamp, f->unique_id); } } diff --git a/db/version_set.h b/db/version_set.h index 5afd1202f..3ee68a359 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -851,6 +851,8 @@ class Version { const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } + Status TryVerifySstUniqueIds(); + private: Env* env_; SystemClock* clock_; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index c48547508..e418f3083 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -49,7 +49,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kDisableUserTimestamp, - kDisableUserTimestamp); + kDisableUserTimestamp, {}); files_.push_back(f); } @@ -158,7 +158,7 @@ class VersionStorageInfoTestBase : public testing::Test { Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } @@ -3222,11 +3222,12 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); ASSERT_OK(s); ASSERT_NE(0, file_size); - file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, - ikey, 0, 0, false, Temperature::kUnknown, 0, 0, - 0, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + UniqueId64x2 id{}; + file_metas->emplace_back( + file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, + Temperature::kUnknown, 0, 0, 0, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kDisableUserTimestamp, + kDisableUserTimestamp, id); } } @@ -3282,7 +3283,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3338,7 +3339,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kDisableUserTimestamp, kDisableUserTimestamp); + kDisableUserTimestamp, kDisableUserTimestamp, {}); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 8c23ac373..b5e1065ea 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -289,6 +289,7 @@ DECLARE_uint64(wp_commit_cache_bits); DECLARE_bool(adaptive_readahead); DECLARE_bool(async_io); DECLARE_string(wal_compression); +DECLARE_bool(verify_sst_unique_id_in_manifest); constexpr long KB = 1024; constexpr int kRandomValueMaxFactor = 3; diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 79b072d7d..42f6a4004 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -936,4 +936,10 @@ DEFINE_bool( DEFINE_string(wal_compression, "none", "Algorithm to use for WAL compression. none to disable."); +DEFINE_bool( + verify_sst_unique_id_in_manifest, false, + "Enable DB options `verify_sst_unique_id_in_manifest`, if true, during " + "DB-open try verifying the SST unique id between MANIFEST and SST " + "properties."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index ab8ae2dfe..6d1efa0b4 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -2314,6 +2314,8 @@ void StressTest::PrintEnv() const { static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "WAL compression : %s\n", FLAGS_wal_compression.c_str()); + fprintf(stdout, "Try verify sst unique id : %d\n", + static_cast(FLAGS_verify_sst_unique_id_in_manifest)); fprintf(stdout, "------------------------------------------------\n"); } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index f0a8bff54..9cefdb5ff 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -492,6 +492,26 @@ struct DBOptions { // Default: false bool track_and_verify_wals_in_manifest = false; + // EXPERIMENTAL: This API/behavior is subject to change + // If true, during DB-open it verifies the SST unique id between MANIFEST + // and SST properties, which is to make sure the SST is not overwritten or + // misplaced. A corruption error will be reported if mismatch detected, but + // only when MANIFEST tracks the unique id, which starts from version 7.3. + // The unique id is an internal unique id and subject to change. + // The feature is disabled by default for now and planed to be enabled by + // default in a future release. + // + // Note: + // 1. the option should be used with `max_open_files=-1` to check all SST + // files. Otherwise, only the opened files during DB-open are checked. + // 2. existing SST files won't have its unique_id tracked in MANIFEST, then + // verification will be skipped. + // To check how many SST verifications are skipped, a statistic ticker + // `NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP` is introduced for that. + // + // Default: false + bool verify_sst_unique_id_in_manifest = false; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. In the near // future, support for doing storage operations such as read/write files diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 9bc7ab196..02c9635cb 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -433,6 +433,9 @@ enum Tickers : uint32_t { BLOCK_CHECKSUM_COMPUTE_COUNT, + // Number of sst files which skipped unique id verification during DB open + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP, + TICKER_ENUM_MAX }; diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index ae6b339b8..3948ea344 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -56,6 +56,7 @@ enum class OptionType { kCustomizable, kEncodedString, kTemperature, + kArray, kUnknown, }; @@ -155,6 +156,24 @@ bool SerializeEnum(const std::unordered_map& type_map, return false; } +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result); + +template +Status SerializeArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::array& vec, + std::string* value); + +template +bool ArraysAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::array& array1, + const std::array& array2, std::string* mismatch); + template Status ParseVector(const ConfigOptions& config_options, const OptionTypeInfo& elem_info, char separator, @@ -388,6 +407,38 @@ class OptionTypeInfo { return info.SetParseFunc(parse_func); } + template + static OptionTypeInfo Array(int _offset, OptionVerificationType _verification, + OptionTypeFlags _flags, + const OptionTypeInfo& elem_info, + char separator = ':') { + OptionTypeInfo info(_offset, OptionType::kArray, _verification, _flags); + info.SetParseFunc([elem_info, separator]( + const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { + auto result = static_cast*>(addr); + return ParseArray(opts, elem_info, separator, name, value, + result); + }); + info.SetSerializeFunc([elem_info, separator](const ConfigOptions& opts, + const std::string& name, + const void* addr, + std::string* value) { + const auto& array = *(static_cast*>(addr)); + return SerializeArray(opts, elem_info, separator, name, array, + value); + }); + info.SetEqualsFunc([elem_info](const ConfigOptions& opts, + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto& array1 = *(static_cast*>(addr1)); + const auto& array2 = *(static_cast*>(addr2)); + return ArraysAreEqual(opts, elem_info, name, array1, array2, + mismatch); + }); + return info; + } + template static OptionTypeInfo Vector(int _offset, OptionVerificationType _verification, @@ -893,6 +944,144 @@ class OptionTypeInfo { OptionTypeFlags flags_; }; +// Parses the input value into elements of the result array, which has fixed +// array size. For example, if the value=1:2:3 and elem_info parses integers, +// the result array will be {1,2,3}. Array size is defined in the OptionTypeInfo +// the input value has to match with that. +// @param config_options Controls how the option value is parsed. +// @param elem_info Controls how individual tokens in value are parsed +// @param separator Character separating tokens in values (':' in the above +// example) +// @param name The name associated with this array option +// @param value The input string to parse into tokens +// @param result Returns the results of parsing value into its elements. +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or element number +// doesn't match array size defined in OptionTypeInfo +// or if the token could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status ParseArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, const std::string& value, + std::array* result) { + Status status; + + ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; + size_t i = 0, start = 0, end = 0; + for (; status.ok() && i < kSize && start < value.size() && + end != std::string::npos; + i++, start = end + 1) { + std::string token; + status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); + if (status.ok()) { + status = elem_info.Parse(copy, name, token, &((*result)[i])); + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); + } + } + } + if (!status.ok()) { + return status; + } + // make sure the element number matches the array size + if (i < kSize) { + return Status::InvalidArgument( + "Serialized value has less elements than array size", name); + } + if (start < value.size() && end != std::string::npos) { + return Status::InvalidArgument( + "Serialized value has more elements than array size", name); + } + return status; +} + +// Serializes the fixed size input array into its output value. Elements are +// separated by the separator character. This element will convert all of the +// elements in array into their serialized form, using elem_info to perform the +// serialization. +// For example, if the array contains the integers 1,2,3 and elem_info +// serializes the output would be 1:2:3 for separator ":". +// @param config_options Controls how the option value is serialized. +// @param elem_info Controls how individual tokens in value are serialized +// @param separator Character separating tokens in value (':' in the above +// example) +// @param name The name associated with this array option +// @param array The input array to serialize +// @param value The output string of serialized options +// @return OK if the value was successfully parse +// @return InvalidArgument if the value is improperly formed or if the token +// could not be parsed +// @return NotFound If the tokenized value contains unknown options for +// its type +template +Status SerializeArray(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, char separator, + const std::string& name, + const std::array& array, std::string* value) { + std::string result; + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + int printed = 0; + for (const auto& elem : array) { + std::string elem_str; + Status s = elem_info.Serialize(embedded, name, &elem, &elem_str); + if (!s.ok()) { + return s; + } else if (!elem_str.empty()) { + if (printed++ > 0) { + result += separator; + } + // If the element contains embedded separators, put it inside of brackets + if (elem_str.find(separator) != std::string::npos) { + result += "{" + elem_str + "}"; + } else { + result += elem_str; + } + } + } + if (result.find("=") != std::string::npos) { + *value = "{" + result + "}"; + } else if (printed > 1 && result.at(0) == '{') { + *value = "{" + result + "}"; + } else { + *value = result; + } + return Status::OK(); +} + +// Compares the input arrays array1 and array2 for equality +// Elements of the array are compared one by one using elem_info to perform the +// comparison. +// +// @param config_options Controls how the arrays are compared. +// @param elem_info Controls how individual elements in the arrays are compared +// @param name The name associated with this array option +// @param array1,array2 The arrays to compare. +// @param mismatch If the arrays are not equivalent, mismatch will point to +// the first element of the comparison that did not match. +// @return true If vec1 and vec2 are "equal", false otherwise +template +bool ArraysAreEqual(const ConfigOptions& config_options, + const OptionTypeInfo& elem_info, const std::string& name, + const std::array& array1, + const std::array& array2, std::string* mismatch) { + assert(array1.size() == kSize); + assert(array2.size() == kSize); + for (size_t i = 0; i < kSize; ++i) { + if (!elem_info.AreEqual(config_options, name, &array1[i], &array2[i], + mismatch)) { + return false; + } + } + return true; +} + // Parses the input value into elements of the result vector. This method // will break the input value into the individual tokens (based on the // separator), where each of those tokens will be parsed based on the rules of diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 4d0d55c12..2afe7dade 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5088,6 +5088,8 @@ class TickerTypeJni { return -0x2D; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT: return -0x2E; + case ROCKSDB_NAMESPACE::Tickers::NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP: + return -0x2F; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5459,6 +5461,9 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::NON_LAST_LEVEL_READ_COUNT; case -0x2E: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_COMPUTE_COUNT; + case -0x2F: + return ROCKSDB_NAMESPACE::Tickers:: + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index a6ad31154..15ed82a2f 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -804,6 +804,11 @@ public enum TickerType { NON_LAST_LEVEL_READ_BYTES((byte) -0x2C), NON_LAST_LEVEL_READ_COUNT((byte) -0x2D), + /** + * Number of sst files which skipped unique id verification in DB open + */ + NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP((byte) -0x2F), + BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x2E), TICKER_ENUM_MAX((byte) 0x5F); diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 388acaf4d..23248bd5b 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -226,7 +226,9 @@ const std::vector> TickersNameMap = { {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"}, {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"}, - {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}}; + {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}, + {NUMBER_SST_UNIQUE_ID_VERIFICATION_SKIP, + "rocksdb.number.sst.uniqueid.verification.skip"}}; const std::vector> HistogramsNameMap = { {DB_GET, "rocksdb.db.get.micros"}, diff --git a/options/db_options.cc b/options/db_options.cc index 1e8de20de..65736e79e 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -680,6 +680,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) flush_verify_memtable_count(options.flush_verify_memtable_count), track_and_verify_wals_in_manifest( options.track_and_verify_wals_in_manifest), + verify_sst_unique_id_in_manifest( + options.verify_sst_unique_id_in_manifest), env(options.env), rate_limiter(options.rate_limiter), sst_file_manager(options.sst_file_manager), diff --git a/options/db_options.h b/options/db_options.h index 0462a9c96..a245d63c6 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -26,6 +26,7 @@ struct ImmutableDBOptions { bool paranoid_checks; bool flush_verify_memtable_count; bool track_and_verify_wals_in_manifest; + bool verify_sst_unique_id_in_manifest; Env* env; std::shared_ptr rate_limiter; std::shared_ptr sst_file_manager; diff --git a/options/options_test.cc b/options/options_test.cc index 208d65b31..8daaa5fa8 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -4603,6 +4603,68 @@ TEST_F(OptionTypeInfoTest, TestStruct) { ASSERT_EQ(e1.b.s, "66"); } +TEST_F(OptionTypeInfoTest, TestArrayType) { + OptionTypeInfo array_info = OptionTypeInfo::Array( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}); + std::array array1, array2; + std::string mismatch; + + ConfigOptions config_options; + TestParseAndCompareOption(config_options, array_info, "v", "a:b:c:d", &array1, + &array2); + + ASSERT_EQ(array1.size(), 4); + ASSERT_EQ(array1[0], "a"); + ASSERT_EQ(array1[1], "b"); + ASSERT_EQ(array1[2], "c"); + ASSERT_EQ(array1[3], "d"); + array1[3] = "e"; + ASSERT_FALSE( + array_info.AreEqual(config_options, "v", &array1, &array2, &mismatch)); + ASSERT_EQ(mismatch, "v"); + + // Test vectors with inner brackets + TestParseAndCompareOption(config_options, array_info, "v", "a:{b}:c:d", + &array1, &array2); + ASSERT_EQ(array1.size(), 4); + ASSERT_EQ(array1[0], "a"); + ASSERT_EQ(array1[1], "b"); + ASSERT_EQ(array1[2], "c"); + ASSERT_EQ(array1[3], "d"); + + std::array array3, array4; + OptionTypeInfo bar_info = OptionTypeInfo::Array( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kString}, '|'); + TestParseAndCompareOption(config_options, bar_info, "v", "x|y|z", &array3, + &array4); + + // Test arrays with inner array + TestParseAndCompareOption(config_options, bar_info, "v", + "a|{b1|b2}|{c1|c2|{d1|d2}}", &array3, &array4, + false); + ASSERT_EQ(array3.size(), 3); + ASSERT_EQ(array3[0], "a"); + ASSERT_EQ(array3[1], "b1|b2"); + ASSERT_EQ(array3[2], "c1|c2|{d1|d2}"); + + TestParseAndCompareOption(config_options, bar_info, "v", + "{a1|a2}|{b1|{c1|c2}}|d1", &array3, &array4, true); + ASSERT_EQ(array3.size(), 3); + ASSERT_EQ(array3[0], "a1|a2"); + ASSERT_EQ(array3[1], "b1|{c1|c2}"); + ASSERT_EQ(array3[2], "d1"); + + // Test invalid input: less element than requested + auto s = bar_info.Parse(config_options, "opt_name1", "a1|a2", &array3); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Test invalid input: more element than requested + s = bar_info.Parse(config_options, "opt_name2", "a1|b|c1|d3", &array3); + ASSERT_TRUE(s.IsInvalidArgument()); +} + TEST_F(OptionTypeInfoTest, TestVectorType) { OptionTypeInfo vec_info = OptionTypeInfo::Vector( 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone, diff --git a/table/unique_id.cc b/table/unique_id.cc index 95e9ded29..b5fd1ea95 100644 --- a/table/unique_id.cc +++ b/table/unique_id.cc @@ -105,6 +105,20 @@ Status GetSstInternalUniqueId(const std::string &db_id, return Status::OK(); } +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x2 *out) { + UniqueId64x3 tmp{}; + Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp); + if (s.ok()) { + (*out)[0] = tmp[0]; + (*out)[1] = tmp[1]; + } else { + *out = {0, 0}; + } + return s; +} + namespace { // For InternalUniqueIdToExternal / ExternalUniqueIdToInternal we want all // zeros in first 128 bits to map to itself, so that excluding zero in @@ -140,6 +154,27 @@ std::string EncodeUniqueIdBytes(const UniqueId64x3 &in) { return ret; } +std::string EncodeUniqueIdBytes(const UniqueId64x2 &in) { + std::string ret(16U, '\0'); + EncodeFixed64(&ret[0], in[0]); + EncodeFixed64(&ret[8], in[1]); + return ret; +} + +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueId64x2 *out) { + if (unique_id.size() != 16) { + return Status::NotSupported("Not a valid unique_id"); + } + const char *buf = &unique_id.front(); + (*out)[0] = DecodeFixed64(buf); + (*out)[1] = DecodeFixed64(buf + 8); + return Status::OK(); +} + +bool IsUniqueIdUnknown(const UniqueId64x2 &in) { + return in[0] == 0 && in[1] == 0; +} + Status GetUniqueIdFromTableProperties(const TableProperties &props, std::string *out_id) { UniqueId64x3 tmp{}; @@ -163,4 +198,8 @@ std::string UniqueIdToHumanString(const std::string &id) { return str; } +std::string UniqueIdToHumanString(const UniqueId64x2 &id) { + return UniqueIdToHumanString(EncodeUniqueIdBytes(id)); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h index 8f414f7d6..2aa68c96c 100644 --- a/table/unique_id_impl.h +++ b/table/unique_id_impl.h @@ -12,6 +12,7 @@ namespace ROCKSDB_NAMESPACE { using UniqueId64x3 = std::array; +using UniqueId64x2 = std::array; // Helper for GetUniqueIdFromTableProperties. This function can also be used // for temporary ids for files without sufficient information in table @@ -23,6 +24,12 @@ Status GetSstInternalUniqueId(const std::string &db_id, const std::string &db_session_id, uint64_t file_number, UniqueId64x3 *out); +Status GetSstInternalUniqueId(const std::string &db_id, + const std::string &db_session_id, + uint64_t file_number, UniqueId64x2 *out); + +std::string UniqueIdToHumanString(const UniqueId64x2 &id); + // Helper for GetUniqueIdFromTableProperties. External unique ids go through // this extra hashing layer so that prefixes of the unique id have predictable // "full" entropy. This hashing layer is 1-to-1 on the first 128 bits and on @@ -37,6 +44,11 @@ void ExternalUniqueIdToInternal(UniqueId64x3 *in_out); // Convert numerical format to byte format for public API std::string EncodeUniqueIdBytes(const UniqueId64x3 &in); +std::string EncodeUniqueIdBytes(const UniqueId64x2 &in); + +Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueId64x2 *out); + +bool IsUniqueIdUnknown(const UniqueId64x2 &in); // Reformat a random value down to our "DB session id" format, // which is intended to be compact and friendly for use in file names. diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 325a46871..7bc50f666 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -171,6 +171,7 @@ default_params = { "adaptive_readahead": lambda: random.choice([0, 1]), "async_io": lambda: random.choice([0, 1]), "wal_compression": lambda: random.choice(["none", "zstd"]), + "verify_sst_unique_id_in_manifest": 1, # always verify for now } _TEST_DIR_ENV_VAR = 'TEST_TMPDIR'