diff --git a/env/env_posix.cc b/env/env_posix.cc
index b29a64752..cc377f161 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -613,6 +613,15 @@ class PosixEnv : public Env {
     return result;
   }
 
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    struct stat s;
+    if (stat(fname.c_str(), &s) != 0) {
+      return IOError("while stat a file for num file links", fname, errno);
+    }
+    *count = static_cast<uint64_t>(s.st_nlink);
+    return Status::OK();
+  }
+
   virtual Status AreFilesSame(const std::string& first,
                               const std::string& second, bool* res) override {
     struct stat statbuf[2];
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 79ff3f38b..c6ca725c5 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -289,6 +289,12 @@ class Env {
     return Status::NotSupported("LinkFile is not supported for this Env");
  }
 
+  virtual Status NumFileLinks(const std::string& /*fname*/,
+                              uint64_t* /*count*/) {
+    return Status::NotSupported(
+        "Getting number of file links is not supported for this Env");
+  }
+
   virtual Status AreFilesSame(const std::string& /*first*/,
                               const std::string& /*second*/, bool* /*res*/) {
     return Status::NotSupported("AreFilesSame is not supported for this Env");
@@ -1064,6 +1070,10 @@ class EnvWrapper : public Env {
     return target_->LinkFile(s, t);
   }
 
+  Status NumFileLinks(const std::string& fname, uint64_t* count) override {
+    return target_->NumFileLinks(fname, count);
+  }
+
   Status AreFilesSame(const std::string& first, const std::string& second,
                       bool* res) override {
     return target_->AreFilesSame(first, second, res);
diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h
index 439537e2a..2d3c3be66 100644
--- a/include/rocksdb/sst_file_manager.h
+++ b/include/rocksdb/sst_file_manager.h
@@ -96,14 +96,18 @@ class SstFileManager {
 // @param max_trash_db_ratio: If the trash size constitutes for more than this
 //    fraction of the total DB size we will start deleting new files passed to
 //    DeleteScheduler immediately
-// @param bytes_max_delete_chunk: if a single file is larger than delete chunk,
-//    ftruncate the file by this size each time, rather than dropping the whole
-//    file. 0 means to always delete the whole file. NOTE this options may not
-//    work well with checkpoints, which relies on file system hard links.
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+//    chunk, ftruncate the file by this size each time, rather than dropping
+//    the whole file. 0 means to always delete the whole file. If the file has
+//    more than one linked name (extra hard links), the file will be deleted
+//    as a whole. Either way, `rate_bytes_per_sec` will still be respected.
+//    NOTE that with this option, files already renamed as trash may be
+//    partial, so users should not directly recover them without checking.
 extern SstFileManager* NewSstFileManager(
     Env* env, std::shared_ptr<Logger> info_log = nullptr,
     std::string trash_dir = "", int64_t rate_bytes_per_sec = 0,
     bool delete_existing_trash = true, Status* status = nullptr,
-    double max_trash_db_ratio = 0.25, uint64_t bytes_max_delete_chunk = 0);
+    double max_trash_db_ratio = 0.25,
+    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
 
 }  // namespace rocksdb
diff --git a/java/rocksjni/sst_file_manager.cc b/java/rocksjni/sst_file_manager.cc
index 440411c39..c83ea00ef 100644
--- a/java/rocksjni/sst_file_manager.cc
+++ b/java/rocksjni/sst_file_manager.cc
@@ -138,8 +138,7 @@ jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
 
   const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t>
       fn_map_kv =
-      [env](
-          const std::pair<const std::string, const uint64_t>& pair) {
+          [env](const std::pair<const std::string, const uint64_t>& pair) {
         const jstring jtracked_file_path =
             env->NewStringUTF(pair.first.c_str());
         if (jtracked_file_path == nullptr) {
diff --git a/util/delete_scheduler.cc b/util/delete_scheduler.cc
index 8ef66fdfa..1d51055a3 100644
--- a/util/delete_scheduler.cc
+++ b/util/delete_scheduler.cc
@@ -267,24 +267,46 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
   if (s.ok()) {
     bool need_full_delete = true;
     if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
-      unique_ptr<WritableFile> wf;
-      Status my_status =
-          env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+      uint64_t num_hard_links = 2;
+      // We don't have to worry about a data race between linking a new
+      // file after the file link count check and the ftruncate, because
+      // the file is now in trash and no hard link is supposed to be
+      // created to trash files by RocksDB.
+      Status my_status = env_->NumFileLinks(path_in_trash, &num_hard_links);
       if (my_status.ok()) {
-        my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
-        if (my_status.ok()) {
-          TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
-          my_status = wf->Fsync();
+        if (num_hard_links == 1) {
+          unique_ptr<WritableFile> wf;
+          my_status =
+              env_->ReopenWritableFile(path_in_trash, &wf, EnvOptions());
+          if (my_status.ok()) {
+            my_status = wf->Truncate(file_size - bytes_max_delete_chunk_);
+            if (my_status.ok()) {
+              TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:Fsync");
+              my_status = wf->Fsync();
+            }
+          }
+          if (my_status.ok()) {
+            *deleted_bytes = bytes_max_delete_chunk_;
+            need_full_delete = false;
+            *is_complete = false;
+          } else {
+            ROCKS_LOG_WARN(info_log_,
+                           "Failed to partially delete %s from trash -- %s",
+                           path_in_trash.c_str(), my_status.ToString().c_str());
+          }
+        } else {
+          ROCKS_LOG_INFO(info_log_,
+                         "Cannot delete %s slowly through ftruncate from trash "
+                         "as it has other links",
+                         path_in_trash.c_str());
         }
-      }
-      if (my_status.ok()) {
-        *deleted_bytes = bytes_max_delete_chunk_;
-        need_full_delete = false;
-        *is_complete = false;
-      } else {
-        ROCKS_LOG_WARN(info_log_,
-                       "Failed to partially delete %s from trash -- %s",
-                       path_in_trash.c_str(), my_status.ToString().c_str());
+      } else if (!num_link_error_printed_) {
+        ROCKS_LOG_INFO(
+            info_log_,
+            "Cannot delete files slowly through ftruncate from trash "
+            "as Env::NumFileLinks() returns error: %s",
+            my_status.ToString().c_str());
+        num_link_error_printed_ = true;
       }
     }
 
diff --git a/util/delete_scheduler.h b/util/delete_scheduler.h
index 0d94ca72d..cbd13ecef 100644
--- a/util/delete_scheduler.h
+++ b/util/delete_scheduler.h
@@ -108,6 +108,8 @@ class DeleteScheduler {
   uint64_t bytes_max_delete_chunk_;
   // Errors that happened in BackgroundEmptyTrash (file_path => error)
   std::map<std::string, Status> bg_errors_;
+
+  bool num_link_error_printed_ = false;
   // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop
   bool closing_;
   // Condition variable signaled in these conditions
diff --git a/util/delete_scheduler_test.cc b/util/delete_scheduler_test.cc
index e7fd8d49f..78d5313f8 100644
--- a/util/delete_scheduler_test.cc
+++ b/util/delete_scheduler_test.cc
@@ -478,6 +478,40 @@ TEST_F(DeleteSchedulerTest, DeletePartialFile) {
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 }
 
+#ifdef OS_LINUX
+TEST_F(DeleteSchedulerTest, NoPartialDeleteWithLink) {
+  int bg_delete_file = 0;
+  int bg_fsync = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:DeleteFile",
+      [&](void*) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteScheduler::DeleteTrashFile:Fsync", [&](void*) { bg_fsync++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  NewDeleteScheduler();
+
+  std::string file1 = NewDummyFile("data_1", 500 * 1024);
+  std::string file2 = NewDummyFile("data_2", 100 * 1024);
+
+  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
+  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
+
+  // Would have been deleted in 4 batches if there were no hard links.
+  ASSERT_OK(delete_scheduler_->DeleteFile(file1, ""));
+  ASSERT_OK(delete_scheduler_->DeleteFile(file2, ""));
+
+  delete_scheduler_->WaitForEmptyTrash();
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+  ASSERT_EQ(2, bg_delete_file);
+  ASSERT_EQ(0, bg_fsync);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+}
+#endif
+
 // 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
 // 2- Delete 100 files using DeleteScheduler
 // 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
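
For context, a minimal usage sketch (not part of the patch) of how an application might opt into chunked trash deletion through NewSstFileManager() and query the new Env::NumFileLinks() API directly. The option values, the example file path, and the error handling below are illustrative assumptions, not taken from the diff.

#include <iostream>
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Env* env = rocksdb::Env::Default();

  // Delete trash at most 1 MB/s; files larger than 64 MB are trimmed away in
  // 64 MB ftruncate steps (the new default), unless they have extra hard
  // links, in which case they are deleted as a whole.
  rocksdb::Status sfm_status;
  std::shared_ptr<rocksdb::SstFileManager> sfm(rocksdb::NewSstFileManager(
      env, nullptr /* info_log */, "" /* trash_dir */,
      1024 * 1024 /* rate_bytes_per_sec */, true /* delete_existing_trash */,
      &sfm_status, 0.25 /* max_trash_db_ratio */,
      64 * 1024 * 1024 /* bytes_max_delete_chunk */));
  if (!sfm_status.ok()) {
    std::cerr << sfm_status.ToString() << std::endl;
    return 1;
  }

  // The new Env API: report how many names (hard links) point at a file.
  // Envs that do not implement it return Status::NotSupported().
  uint64_t num_links = 0;
  rocksdb::Status s = env->NumFileLinks("/tmp/example.sst", &num_links);
  std::cout << (s.ok() ? std::to_string(num_links) : s.ToString()) << std::endl;

  rocksdb::Options options;
  options.sst_file_manager = sfm;  // file deletions now go through DeleteScheduler
  // ... open the DB with `options` as usual ...
  return 0;
}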