From 9d61a0856dbaee107861dcf96bc9f132fff971ff Mon Sep 17 00:00:00 2001 From: anand76 Date: Tue, 18 May 2021 19:32:55 -0700 Subject: [PATCH] Sync ingested files only if reopen is supported by the FS (#8296) Summary: Some file systems (especially distributed FS) do not support reopening a file for writing. The ExternalSstFileIngestionJob calls ReopenWritableFile in order to sync the ingested file, which typically makes sense only on a local file system with a page cache (i.e Posix). So this change tries to sync the ingested file only if ReopenWritableFile doesn't return Status::NotSupported(). Tests: Add a new unit test in external_sst_file_basic_test Pull Request resolved: https://github.com/facebook/rocksdb/pull/8296 Reviewed By: jay-zhuang Differential Revision: D28420865 Pulled By: anand1976 fbshipit-source-id: 380e7f5ff95324997f7a59864a9ac96ebbd0100c --- HISTORY.md | 1 + db/external_sst_file_basic_test.cc | 35 +++++++++++++++++++++++++++ db/external_sst_file_ingestion_job.cc | 31 +++++++++++++++--------- 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 356ddacc5..4efdc5b5e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. * Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. * Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. ### Behavior Changes * Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index a11a44b99..035cb3698 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1136,6 +1136,41 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ff5450138..ac196a798 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -109,17 +109,26 @@ Status ExternalSstFileIngestionJob::Prepare( // directory before ingest the file. For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. + if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } else if (status.IsNotSupported() &&