diff --git a/HISTORY.md b/HISTORY.md index 356ddacc5..4efdc5b5e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. * Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. * Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. ### Behavior Changes * Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index a11a44b99..035cb3698 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1136,6 +1136,41 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ff5450138..ac196a798 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -109,17 +109,26 @@ Status ExternalSstFileIngestionJob::Prepare( // directory before ingest the file. For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. + if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } else if (status.IsNotSupported() &&