From e3a1707d05b60d7a6e2ed516d85e5f919321e4b3 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 24 Mar 2022 13:05:17 -0700 Subject: [PATCH] Fix heap use-after-free race with DropColumnFamily (#9730) Summary: Although ColumnFamilySet comments say that DB mutex can be freed during iteration, as long as you hold a ref while releasing DB mutex, this is not quite true because UnrefAndTryDelete might delete cfd right before it is needed to get ->next_ for the next iteration of the loop. This change solves the problem by making a wrapper class that makes such iteration easier while handling the tricky details of UnrefAndTryDelete on the previous cfd only after getting next_ in operator++. FreeDeadColumnFamilies should already have been obsolete; this removes it for good. Similarly, ColumnFamilySet::iterator doesn't need to check for cfd with 0 refs, because those are immediately deleted. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730 Test Plan: was reported with ASAN on unit tests like DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching Reviewed By: ltamasi Differential Revision: D35038143 Pulled By: pdillinger fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9 --- HISTORY.md | 1 + db/column_family.cc | 14 ----- db/column_family.h | 73 ++++++++++++++++++++------ db/db_filesnapshot.cc | 5 +- db/db_impl/db_impl.cc | 31 ++++------- db/db_impl/db_impl_compaction_flush.cc | 2 - db/version_set.h | 4 ++ 7 files changed, 71 insertions(+), 59 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index b90923585..806184043 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. * Fixed a race condition when mmaping a WritableFile on POSIX. * Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. +* Fixed a heap use-after-free race with DropColumnFamily. ## 6.29.4 (03/22/2022) ### Bug Fixes diff --git a/db/column_family.cc b/db/column_family.cc index a2885515c..c89ed24c0 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( return new_cfd; } -// REQUIRES: DB mutex held -void ColumnFamilySet::FreeDeadColumnFamilies() { - autovector to_delete; - for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_.load(std::memory_order_relaxed) == 0) { - to_delete.push_back(cfd); - } - } - for (auto cfd : to_delete) { - // this is very rare, so it's not a problem that we do it under a mutex - delete cfd; - } -} - // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); diff --git a/db/column_family.h b/db/column_family.h index a28f5d7e9..4de70290d 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -519,9 +519,10 @@ class ColumnFamilyData { ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } + static const uint32_t kDummyColumnFamilyDataId; + private: friend class ColumnFamilySet; - static const uint32_t kDummyColumnFamilyDataId; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, WriteBufferManager* write_buffer_manager, @@ -627,10 +628,8 @@ class ColumnFamilyData { // held and it needs to be executed from the write thread. SetDropped() also // guarantees that it will be called only from single-threaded LogAndApply(), // but this condition is not that important. -// * Iteration -- hold DB mutex, but you can release it in the body of -// iteration. If you release DB mutex in body, reference the column -// family before the mutex and unreference after you unlock, since the column -// family might get dropped when the DB mutex is released +// * Iteration -- hold DB mutex. If you want to release the DB mutex in the +// body of the iteration, wrap in a RefedColumnFamilySet. // * GetDefault() -- thread safe // * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), @@ -642,17 +641,12 @@ class ColumnFamilySet { public: explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} + // NOTE: minimum operators for for-loop iteration iterator& operator++() { - // dropped column families might still be included in this iteration - // (we're only removing them when client drops the last reference to the - // column family). - // dummy is never dead, so this will never be infinite - do { - current_ = current_->next_; - } while (current_->refs_.load(std::memory_order_relaxed) == 0); + current_ = current_->next_; return *this; } - bool operator!=(const iterator& other) { + bool operator!=(const iterator& other) const { return this->current_ != other.current_; } ColumnFamilyData* operator*() { return current_; } @@ -691,10 +685,6 @@ class ColumnFamilySet { iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - // REQUIRES: DB mutex held - // Don't call while iterating over ColumnFamilySet - void FreeDeadColumnFamilies(); - Cache* get_table_cache() { return table_cache_; } WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } @@ -737,6 +727,55 @@ class ColumnFamilySet { std::string db_session_id_; }; +// A wrapper for ColumnFamilySet that supports releasing DB mutex during each +// iteration over the iterator, because the cfd is Refed and Unrefed during +// each iteration to prevent concurrent CF drop from destroying it (until +// Unref). +class RefedColumnFamilySet { + public: + explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} + + class iterator { + public: + explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { + MaybeRef(*wrapped_); + } + ~iterator() { MaybeUnref(*wrapped_); } + inline void MaybeRef(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->Ref(); + } + } + inline void MaybeUnref(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->UnrefAndTryDelete(); + } + } + // NOTE: minimum operators for for-loop iteration + inline iterator& operator++() { + ColumnFamilyData* old = *wrapped_; + ++wrapped_; + // Can only unref & potentially free cfd after accessing its next_ + MaybeUnref(old); + MaybeRef(*wrapped_); + return *this; + } + inline bool operator!=(const iterator& other) const { + return this->wrapped_ != other.wrapped_; + } + inline ColumnFamilyData* operator*() { return *wrapped_; } + + private: + ColumnFamilySet::iterator wrapped_; + }; + + iterator begin() { return iterator(wrapped_->begin()); } + iterator end() { return iterator(wrapped_->end()); } + + private: + ColumnFamilySet* wrapped_; +}; + // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 35d75c823..e04927d04 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() { } mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); mutex_.Unlock(); status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); mutex_.Lock(); - cfd->UnrefAndTryDelete(); if (!status.ok() && !status.IsColumnFamilyDropped()) { break; } else if (status.IsColumnFamilyDropped()) { @@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() { } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); return status; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index d31644be9..c967505b0 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -383,15 +383,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); - mutex_.Unlock(); + InstrumentedMutexUnlock u(&mutex_); s = FlushMemTable(cfd, flush_opts, context.flush_reason); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); if (!s.ok()) { break; } @@ -503,18 +500,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { s.PermitUncheckedError(); //**TODO: What to do on error? mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - cfd->Ref(); - mutex_.Unlock(); + InstrumentedMutexUnlock u(&mutex_); Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); s.PermitUncheckedError(); //**TODO: What to do on error? - mutex_.Lock(); - cfd->UnrefAndTryDelete(); } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); } shutting_down_.store(true, std::memory_order_release); @@ -977,18 +970,13 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->initialized()) { // Release DB mutex for gathering cache entry stats. Pass over all // column families for this first so that other stats are dumped // near-atomically. - // Get a ref before unlocking - cfd->Ref(); - { - InstrumentedMutexUnlock u(&mutex_); - cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); - } - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); } } @@ -3457,15 +3445,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property, // Needs mutex to protect the list of column families. InstrumentedMutexLock l(&mutex_); uint64_t value; - for (auto* cfd : *versions_->GetColumnFamilySet()) { + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->initialized()) { continue; } - cfd->Ref(); ret = GetIntPropertyInternal(cfd, *property_info, true, &value); // GetIntPropertyInternal may release db mutex and re-acquire it. mutex_.AssertHeld(); - cfd->UnrefAndTryDelete(); if (ret) { sum += value; } else { @@ -5056,6 +5042,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, } } + // TODO: simplify using GetRefedColumnFamilySet? std::vector cfd_list; { InstrumentedMutexLock l(&mutex_); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 6c7e59ba7..106b30a58 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -2984,8 +2984,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, bg_bottom_compaction_scheduled_--; } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // See if there's more work to be done MaybeScheduleFlushOrCompaction(); diff --git a/db/version_set.h b/db/version_set.h index 618c78f3c..e30a83571 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1272,6 +1272,10 @@ class VersionSet { uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + RefedColumnFamilySet GetRefedColumnFamilySet() { + return RefedColumnFamilySet(GetColumnFamilySet()); + } + const FileOptions& file_options() { return file_options_; } void ChangeFileOptions(const MutableDBOptions& new_options) { file_options_.writable_file_max_buffer_size =