From 120bc4715b7e851b32f9b880f5ffac1d4f3a5d50 Mon Sep 17 00:00:00 2001 From: Mike Kolupaev Date: Mon, 1 Apr 2019 17:07:38 -0700 Subject: [PATCH] Add DBOptions. avoid_unnecessary_blocking_io to defer file deletions (#5043) Summary: Just like ReadOptions::background_purge_on_iterator_cleanup but for ColumnFamilyHandle instead of Iterator. In our use case we sometimes call ColumnFamilyHandle's destructor from low-latency threads, and sometimes it blocks the thread for a few seconds deleting the files. To avoid that, we can either offload ColumnFamilyHandle's destruction to a background thread on our side, or add this option on rocksdb side. This PR does the latter, to be consistent with how we solve exactly the same problem for iterators using background_purge_on_iterator_cleanup option. (EDIT: It's avoid_unnecessary_blocking_io now, and affects both CF drops and iterator destructors.) I'm not quite comfortable with having two separate options (background_purge_on_iterator_cleanup and background_purge_on_cf_cleanup) for such a rarely used thing. Maybe we should merge them? Rename background_purge_on_cf_cleanup to something like delete_files_on_background_threads_only or avoid_blocking_io_in_unexpected_places, and make iterators use it instead of the one in ReadOptions? I can do that here if you guys think it's better. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5043 Differential Revision: D14339233 Pulled By: al13n321 fbshipit-source-id: ccf7efa11c85c9a5b91d969bb55627d0fb01e7b8 --- HISTORY.md | 2 ++ db/column_family.cc | 9 +++++- db/db_impl.cc | 3 +- db/deletefile_test.cc | 49 +++++++++++++++++++++++++++++++- db/forward_iterator.cc | 8 ++++-- include/rocksdb/options.h | 8 ++++++ options/db_options.cc | 7 ++++- options/db_options.h | 1 + options/options_helper.cc | 9 +++++- options/options_settable_test.cc | 3 +- 10 files changed, 90 insertions(+), 9 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 2819c69a5..593a74324 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,8 @@ * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. * Add support for trace filtering. +* Add DBOptions.avoid_unnecessary_blocking_io. If true, we avoid file deletion when destorying ColumnFamilyHandle and Iterator. Instead, a job is scheduled to delete the files in background. + ### Public API Change * Remove bundled fbson library. * statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it. diff --git a/db/column_family.cc b/db/column_family.cc index c9172f662..f9a4ae66d 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -68,7 +68,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { db_->FindObsoleteFiles(&job_context, false, true); mutex_->Unlock(); if (job_context.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(job_context); + bool defer_purge = + db_->immutable_db_options().avoid_unnecessary_blocking_io; + db_->PurgeObsoleteFiles(job_context, defer_purge); + if (defer_purge) { + mutex_->Lock(); + db_->SchedulePurge(); + mutex_->Unlock(); + } } job_context.Clean(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index becc8e8e2..4a55ad7e3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1313,7 +1313,8 @@ InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version, - read_options.background_purge_on_iterator_cleanup); + read_options.background_purge_on_iterator_cleanup || + immutable_db_options_.avoid_unnecessary_blocking_io); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); return internal_iter; diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 8583535fa..0905c90cc 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -241,7 +241,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CloseDB(); } -TEST_F(DeleteFileTest, BackgroundPurgeTest) { +TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -279,6 +279,53 @@ TEST_F(DeleteFileTest, BackgroundPurgeTest) { CloseDB(); } +TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { + auto do_test = [&](bool bg_purge) { + ColumnFamilyOptions co; + WriteOptions wo; + FlushOptions fo; + ColumnFamilyHandle* cfh = nullptr; + + ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh)); + + ASSERT_OK(db_->Put(wo, cfh, "pika", "chu")); + ASSERT_OK(db_->Flush(fo, cfh)); + // Expect 1 sst file. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + ASSERT_OK(db_->DropColumnFamily(cfh)); + // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + delete cfh; + test::SleepingBackgroundTask sleeping_task_after; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_after, Env::Priority::HIGH); + // If background purge is enabled, the file should still be there. + CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + + // Execute background purges. + sleeping_task_after.WakeUp(); + sleeping_task_after.WaitUntilDone(); + // The file should have been deleted. + CheckFileTypeCounts(dbname_, 0, 0, 1); + }; + + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = false"); + do_test(false); + } + + options_.avoid_unnecessary_blocking_io = true; + ASSERT_OK(ReopenDB(false)); + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); + do_test(true); + } + + CloseDB(); +} + // This test is to reproduce a bug that read invalid ReadOption in iterator // cleanup function TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 165982241..d1c073468 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -265,16 +265,18 @@ void ForwardIterator::SVCleanup() { if (sv_ == nullptr) { return; } + bool background_purge = + read_options_.background_purge_on_iterator_cleanup || + db_->immutable_db_options().avoid_unnecessary_blocking_io; if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { // pinned_iters_mgr_ tells us to make sure that all visited key-value slices // are alive until pinned_iters_mgr_->ReleasePinnedData() is called. // The slices may point into some memtables owned by sv_, so we need to keep // sv_ referenced until pinned_iters_mgr_ unpins everything. - auto p = new SVCleanupParams{ - db_, sv_, read_options_.background_purge_on_iterator_cleanup}; + auto p = new SVCleanupParams{db_, sv_, background_purge}; pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup); } else { - SVCleanup(db_, sv_, read_options_.background_purge_on_iterator_cleanup); + SVCleanup(db_, sv_, background_purge); } } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index b42a1d5e6..f7d6dfaf5 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1003,6 +1003,14 @@ struct DBOptions { // Currently, any WAL-enabled writes after atomic flush may be replayed // independently if the process crashes later and tries to recover. bool atomic_flush = false; + + // If true, ColumnFamilyHandle's and Iterator's destructors won't delete + // obsolete files directly and will instead schedule a background job + // to do it. Use it if you're destroying iterators or ColumnFamilyHandle-s + // from latency-sensitive threads. + // If set to true, takes precedence over + // ReadOptions::background_purge_on_iterator_cleanup. + bool avoid_unnecessary_blocking_io = false; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/options/db_options.cc b/options/db_options.cc index b90c27606..b9950e09e 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -86,7 +86,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) preserve_deletes(options.preserve_deletes), two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), - atomic_flush(options.atomic_flush) { + atomic_flush(options.atomic_flush), + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -217,6 +218,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { two_write_queues); ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d", manual_wal_flush); + ROCKS_LOG_HEADER(log, " Options.atomic_flush: %d", atomic_flush); + ROCKS_LOG_HEADER(log, + " Options.avoid_unnecessary_blocking_io: %d", + avoid_unnecessary_blocking_io); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 360848851..283cf7d35 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -79,6 +79,7 @@ struct ImmutableDBOptions { bool two_write_queues; bool manual_wal_flush; bool atomic_flush; + bool avoid_unnecessary_blocking_io; }; struct MutableDBOptions { diff --git a/options/options_helper.cc b/options/options_helper.cc index be8ec2e18..9facf6e94 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -132,6 +132,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.two_write_queues = immutable_db_options.two_write_queues; options.manual_wal_flush = immutable_db_options.manual_wal_flush; options.atomic_flush = immutable_db_options.atomic_flush; + options.avoid_unnecessary_blocking_io = + immutable_db_options.avoid_unnecessary_blocking_io; return options; } @@ -1615,7 +1617,12 @@ std::unordered_map {"atomic_flush", {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, atomic_flush)}}}; + offsetof(struct ImmutableDBOptions, atomic_flush)}}, + {"avoid_unnecessary_blocking_io", + {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}} + }; std::unordered_map OptionsHelper::block_base_table_index_type_string_map = { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index cf1ce4e21..3a6bd6a88 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -294,7 +294,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "two_write_queues=false;" "manual_wal_flush=false;" "seq_per_batch=false;" - "atomic_flush=false", + "atomic_flush=false;" + "avoid_unnecessary_blocking_io=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),