From 74a334a2eb8db6c2ba2f38382be287af908e63c0 Mon Sep 17 00:00:00 2001 From: haoyuhuang Date: Thu, 23 May 2019 21:54:23 -0700 Subject: [PATCH] Provide an option so that SST ingestion won't fall back to copy after hard linking fails (#5333) Summary: RocksDB always tries to perform a hard link operation on the external SST file to ingest. This operation can fail if the external SST resides on a different device/FS, or the underlying FS does not support hard link. Currently RocksDB assumes that if the link fails, the user is willing to perform file copy, which is not true according to the post. This commit provides an option named 'failed_move_fall_back_to_copy' for users to choose which behavior they want. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5333 Differential Revision: D15457597 Pulled By: HaoyuHuang fbshipit-source-id: f3626e13f845db4f7ed970a53ec8a2b1f0d62214 --- HISTORY.md | 3 +- db/external_sst_file_ingestion_job.cc | 19 ++--- db/external_sst_file_test.cc | 109 ++++++++++++++++++++++---- include/rocksdb/options.h | 2 + 4 files changed, 106 insertions(+), 27 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b65f5a038..40d11096d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ * Add an option `snap_refresh_nanos` (default to 0.1s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. @@ -20,7 +21,7 @@ ### Bug Fixes - + ## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678..588ac5110 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -92,26 +92,27 @@ Status ExternalSstFileIngestionJob::Prepare( // Copy/Move external files into DB for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa26..3850a2a03 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,54 @@ namespace rocksdb { +// A test environment that can be configured to fail the Link operation. +class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -2014,17 +2062,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. + * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2087,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2107,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. + ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } class TestIngestExternalFileListener : public EventListener { @@ -2666,6 +2735,12 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, std::make_tuple(true, false), std::make_tuple(true, true))); +INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest, + ExternSSTFileLinkFailFallbackTest, + testing::Values(std::make_tuple(true, false), + std::make_tuple(true, true), + std::make_tuple(false, false))); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 7d22fb675..cc7119410 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1398,6 +1398,8 @@ struct CompactRangeOptions { struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. bool snapshot_consistency = true;