Fix heap use-after-free race with DropColumnFamily (#9730)

Summary:
Although ColumnFamilySet comments say that DB mutex can be
freed during iteration, as long as you hold a ref while releasing DB
mutex, this is not quite true because UnrefAndTryDelete might delete cfd
right before it is needed to get ->next_ for the next iteration of the
loop.

This change solves the problem by making a wrapper class that makes such
iteration easier while handling the tricky details of UnrefAndTryDelete
on the previous cfd only after getting next_ in operator++.

FreeDeadColumnFamilies should already have been obsolete; this removes
it for good. Similarly, ColumnFamilySet::iterator doesn't need to check
for cfd with 0 refs, because those are immediately deleted.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730

Test Plan:
was reported with ASAN on unit tests like
DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching

Reviewed By: ltamasi

Differential Revision: D35038143

Pulled By: pdillinger

fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9
This commit is contained in:
Peter Dillinger 2022-03-24 13:05:17 -07:00 committed by Andrew Kryczka
parent 2de3bfc7d1
commit 8b2cfef9e0
7 changed files with 71 additions and 59 deletions

View File

@ -5,6 +5,7 @@
* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
* Fixed a race condition when mmaping a WritableFile on POSIX. * Fixed a race condition when mmaping a WritableFile on POSIX.
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. * Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
* Fixed a heap use-after-free race with DropColumnFamily.
## 7.0.3 (03/23/2022) ## 7.0.3 (03/23/2022)
### Bug Fixes ### Bug Fixes

View File

@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
return new_cfd; return new_cfd;
} }
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}
// under a DB mutex AND from a write thread // under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID()); auto cfd_iter = column_family_data_.find(cfd->GetID());

View File

@ -519,9 +519,10 @@ class ColumnFamilyData {
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
static const uint32_t kDummyColumnFamilyDataId;
private: private:
friend class ColumnFamilySet; friend class ColumnFamilySet;
static const uint32_t kDummyColumnFamilyDataId;
ColumnFamilyData(uint32_t id, const std::string& name, ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache, Version* dummy_versions, Cache* table_cache,
WriteBufferManager* write_buffer_manager, WriteBufferManager* write_buffer_manager,
@ -627,10 +628,8 @@ class ColumnFamilyData {
// held and it needs to be executed from the write thread. SetDropped() also // held and it needs to be executed from the write thread. SetDropped() also
// guarantees that it will be called only from single-threaded LogAndApply(), // guarantees that it will be called only from single-threaded LogAndApply(),
// but this condition is not that important. // but this condition is not that important.
// * Iteration -- hold DB mutex, but you can release it in the body of // * Iteration -- hold DB mutex. If you want to release the DB mutex in the
// iteration. If you release DB mutex in body, reference the column // body of the iteration, wrap in a RefedColumnFamilySet.
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * GetDefault() -- thread safe // * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetColumnFamily() -- either inside of DB mutex or from a write thread
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
@ -642,17 +641,12 @@ class ColumnFamilySet {
public: public:
explicit iterator(ColumnFamilyData* cfd) explicit iterator(ColumnFamilyData* cfd)
: current_(cfd) {} : current_(cfd) {}
// NOTE: minimum operators for for-loop iteration
iterator& operator++() { iterator& operator++() {
// dropped column families might still be included in this iteration current_ = current_->next_;
// (we're only removing them when client drops the last reference to the
// column family).
// dummy is never dead, so this will never be infinite
do {
current_ = current_->next_;
} while (current_->refs_.load(std::memory_order_relaxed) == 0);
return *this; return *this;
} }
bool operator!=(const iterator& other) { bool operator!=(const iterator& other) const {
return this->current_ != other.current_; return this->current_ != other.current_;
} }
ColumnFamilyData* operator*() { return current_; } ColumnFamilyData* operator*() { return current_; }
@ -691,10 +685,6 @@ class ColumnFamilySet {
iterator begin() { return iterator(dummy_cfd_->next_); } iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); } iterator end() { return iterator(dummy_cfd_); }
// REQUIRES: DB mutex held
// Don't call while iterating over ColumnFamilySet
void FreeDeadColumnFamilies();
Cache* get_table_cache() { return table_cache_; } Cache* get_table_cache() { return table_cache_; }
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
@ -737,6 +727,55 @@ class ColumnFamilySet {
std::string db_session_id_; std::string db_session_id_;
}; };
// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
// iteration over the iterator, because the cfd is Refed and Unrefed during
// each iteration to prevent concurrent CF drop from destroying it (until
// Unref).
class RefedColumnFamilySet {
public:
explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
class iterator {
public:
explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
MaybeRef(*wrapped_);
}
~iterator() { MaybeUnref(*wrapped_); }
inline void MaybeRef(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->Ref();
}
}
inline void MaybeUnref(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->UnrefAndTryDelete();
}
}
// NOTE: minimum operators for for-loop iteration
inline iterator& operator++() {
ColumnFamilyData* old = *wrapped_;
++wrapped_;
// Can only unref & potentially free cfd after accessing its next_
MaybeUnref(old);
MaybeRef(*wrapped_);
return *this;
}
inline bool operator!=(const iterator& other) const {
return this->wrapped_ != other.wrapped_;
}
inline ColumnFamilyData* operator*() { return *wrapped_; }
private:
ColumnFamilySet::iterator wrapped_;
};
iterator begin() { return iterator(wrapped_->begin()); }
iterator end() { return iterator(wrapped_->end()); }
private:
ColumnFamilySet* wrapped_;
};
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch) // memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {

View File

@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
} }
mutex_.Lock(); mutex_.Lock();
} else { } else {
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) { if (cfd->IsDropped()) {
continue; continue;
} }
cfd->Ref();
mutex_.Unlock(); mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
mutex_.Lock(); mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!status.ok() && !status.IsColumnFamilyDropped()) { if (!status.ok() && !status.IsColumnFamilyDropped()) {
break; break;
} else if (status.IsColumnFamilyDropped()) { } else if (status.IsColumnFamilyDropped()) {
@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
} }
} }
} }
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
return status; return status;
} }

View File

@ -377,15 +377,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
mutex_.Lock(); mutex_.Lock();
} else { } else {
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) { if (cfd->IsDropped()) {
continue; continue;
} }
cfd->Ref(); InstrumentedMutexUnlock u(&mutex_);
mutex_.Unlock();
s = FlushMemTable(cfd, flush_opts, context.flush_reason); s = FlushMemTable(cfd, flush_opts, context.flush_reason);
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!s.ok()) { if (!s.ok()) {
break; break;
} }
@ -497,18 +494,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
s.PermitUncheckedError(); //**TODO: What to do on error? s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock(); mutex_.Lock();
} else { } else {
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
cfd->Ref(); InstrumentedMutexUnlock u(&mutex_);
mutex_.Unlock();
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
s.PermitUncheckedError(); //**TODO: What to do on error? s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock();
cfd->UnrefAndTryDelete();
} }
} }
} }
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
} }
shutting_down_.store(true, std::memory_order_release); shutting_down_.store(true, std::memory_order_release);
@ -971,18 +964,13 @@ void DBImpl::DumpStats() {
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
{ {
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->initialized()) { if (cfd->initialized()) {
// Release DB mutex for gathering cache entry stats. Pass over all // Release DB mutex for gathering cache entry stats. Pass over all
// column families for this first so that other stats are dumped // column families for this first so that other stats are dumped
// near-atomically. // near-atomically.
// Get a ref before unlocking InstrumentedMutexUnlock u(&mutex_);
cfd->Ref(); cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
{
InstrumentedMutexUnlock u(&mutex_);
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
}
cfd->UnrefAndTryDelete();
} }
} }
@ -3492,15 +3480,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
// Needs mutex to protect the list of column families. // Needs mutex to protect the list of column families.
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);
uint64_t value; uint64_t value;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->initialized()) { if (!cfd->initialized()) {
continue; continue;
} }
cfd->Ref();
ret = GetIntPropertyInternal(cfd, *property_info, true, &value); ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
// GetIntPropertyInternal may release db mutex and re-acquire it. // GetIntPropertyInternal may release db mutex and re-acquire it.
mutex_.AssertHeld(); mutex_.AssertHeld();
cfd->UnrefAndTryDelete();
if (ret) { if (ret) {
sum += value; sum += value;
} else { } else {
@ -5091,6 +5077,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
} }
} }
// TODO: simplify using GetRefedColumnFamilySet?
std::vector<ColumnFamilyData*> cfd_list; std::vector<ColumnFamilyData*> cfd_list;
{ {
InstrumentedMutexLock l(&mutex_); InstrumentedMutexLock l(&mutex_);

View File

@ -2992,8 +2992,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
bg_bottom_compaction_scheduled_--; bg_bottom_compaction_scheduled_--;
} }
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
// See if there's more work to be done // See if there's more work to be done
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();

View File

@ -1299,6 +1299,10 @@ class VersionSet {
uint64_t min_pending_output); uint64_t min_pending_output);
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
RefedColumnFamilySet GetRefedColumnFamilySet() {
return RefedColumnFamilySet(GetColumnFamilySet());
}
const FileOptions& file_options() { return file_options_; } const FileOptions& file_options() { return file_options_; }
void ChangeFileOptions(const MutableDBOptions& new_options) { void ChangeFileOptions(const MutableDBOptions& new_options) {
file_options_.writable_file_max_buffer_size = file_options_.writable_file_max_buffer_size =