Fix heap use-after-free race with DropColumnFamily (#9730)

Summary:
Although ColumnFamilySet comments say that DB mutex can be
freed during iteration, as long as you hold a ref while releasing DB
mutex, this is not quite true because UnrefAndTryDelete might delete cfd
right before it is needed to get ->next_ for the next iteration of the
loop.

This change solves the problem by making a wrapper class that makes such
iteration easier while handling the tricky details of UnrefAndTryDelete
on the previous cfd only after getting next_ in operator++.

FreeDeadColumnFamilies should already have been obsolete; this removes
it for good. Similarly, ColumnFamilySet::iterator doesn't need to check
for cfd with 0 refs, because those are immediately deleted.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730

Test Plan:
was reported with ASAN on unit tests like
DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching

Reviewed By: ltamasi

Differential Revision: D35038143

Pulled By: pdillinger

fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9
This commit is contained in:
Peter Dillinger 2022-03-24 13:05:17 -07:00 committed by Andrew Kryczka
parent 2de3bfc7d1
commit 8b2cfef9e0
7 changed files with 71 additions and 59 deletions

View File

@ -5,6 +5,7 @@
* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
* Fixed a race condition when mmaping a WritableFile on POSIX.
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
* Fixed a heap use-after-free race with DropColumnFamily.
## 7.0.3 (03/23/2022)
### Bug Fixes

View File

@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
return new_cfd;
}
// REQUIRES: DB mutex held
void ColumnFamilySet::FreeDeadColumnFamilies() {
autovector<ColumnFamilyData*> to_delete;
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
to_delete.push_back(cfd);
}
}
for (auto cfd : to_delete) {
// this is very rare, so it's not a problem that we do it under a mutex
delete cfd;
}
}
// under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID());

View File

@ -519,9 +519,10 @@ class ColumnFamilyData {
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
static const uint32_t kDummyColumnFamilyDataId;
private:
friend class ColumnFamilySet;
static const uint32_t kDummyColumnFamilyDataId;
ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache,
WriteBufferManager* write_buffer_manager,
@ -627,10 +628,8 @@ class ColumnFamilyData {
// held and it needs to be executed from the write thread. SetDropped() also
// guarantees that it will be called only from single-threaded LogAndApply(),
// but this condition is not that important.
// * Iteration -- hold DB mutex, but you can release it in the body of
// iteration. If you release DB mutex in body, reference the column
// family before the mutex and unreference after you unlock, since the column
// family might get dropped when the DB mutex is released
// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
// body of the iteration, wrap in a RefedColumnFamilySet.
// * GetDefault() -- thread safe
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
@ -642,17 +641,12 @@ class ColumnFamilySet {
public:
explicit iterator(ColumnFamilyData* cfd)
: current_(cfd) {}
// NOTE: minimum operators for for-loop iteration
iterator& operator++() {
// dropped column families might still be included in this iteration
// (we're only removing them when client drops the last reference to the
// column family).
// dummy is never dead, so this will never be infinite
do {
current_ = current_->next_;
} while (current_->refs_.load(std::memory_order_relaxed) == 0);
current_ = current_->next_;
return *this;
}
bool operator!=(const iterator& other) {
bool operator!=(const iterator& other) const {
return this->current_ != other.current_;
}
ColumnFamilyData* operator*() { return current_; }
@ -691,10 +685,6 @@ class ColumnFamilySet {
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }
// REQUIRES: DB mutex held
// Don't call while iterating over ColumnFamilySet
void FreeDeadColumnFamilies();
Cache* get_table_cache() { return table_cache_; }
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
@ -737,6 +727,55 @@ class ColumnFamilySet {
std::string db_session_id_;
};
// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
// iteration over the iterator, because the cfd is Refed and Unrefed during
// each iteration to prevent concurrent CF drop from destroying it (until
// Unref).
class RefedColumnFamilySet {
public:
explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
class iterator {
public:
explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
MaybeRef(*wrapped_);
}
~iterator() { MaybeUnref(*wrapped_); }
inline void MaybeRef(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->Ref();
}
}
inline void MaybeUnref(ColumnFamilyData* cfd) {
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
cfd->UnrefAndTryDelete();
}
}
// NOTE: minimum operators for for-loop iteration
inline iterator& operator++() {
ColumnFamilyData* old = *wrapped_;
++wrapped_;
// Can only unref & potentially free cfd after accessing its next_
MaybeUnref(old);
MaybeRef(*wrapped_);
return *this;
}
inline bool operator!=(const iterator& other) const {
return this->wrapped_ != other.wrapped_;
}
inline ColumnFamilyData* operator*() { return *wrapped_; }
private:
ColumnFamilySet::iterator wrapped_;
};
iterator begin() { return iterator(wrapped_->begin()); }
iterator end() { return iterator(wrapped_->end()); }
private:
ColumnFamilySet* wrapped_;
};
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
// memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {

View File

@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
}
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cfd->Ref();
mutex_.Unlock();
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!status.ok() && !status.IsColumnFamilyDropped()) {
break;
} else if (status.IsColumnFamilyDropped()) {
@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
}
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
return status;
}

View File

@ -377,15 +377,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
cfd->Ref();
mutex_.Unlock();
InstrumentedMutexUnlock u(&mutex_);
s = FlushMemTable(cfd, flush_opts, context.flush_reason);
mutex_.Lock();
cfd->UnrefAndTryDelete();
if (!s.ok()) {
break;
}
@ -497,18 +494,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock();
} else {
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
cfd->Ref();
mutex_.Unlock();
InstrumentedMutexUnlock u(&mutex_);
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
s.PermitUncheckedError(); //**TODO: What to do on error?
mutex_.Lock();
cfd->UnrefAndTryDelete();
}
}
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
}
shutting_down_.store(true, std::memory_order_release);
@ -971,18 +964,13 @@ void DBImpl::DumpStats() {
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
{
InstrumentedMutexLock l(&mutex_);
for (auto cfd : *versions_->GetColumnFamilySet()) {
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
if (cfd->initialized()) {
// Release DB mutex for gathering cache entry stats. Pass over all
// column families for this first so that other stats are dumped
// near-atomically.
// Get a ref before unlocking
cfd->Ref();
{
InstrumentedMutexUnlock u(&mutex_);
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
}
cfd->UnrefAndTryDelete();
InstrumentedMutexUnlock u(&mutex_);
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
}
}
@ -3492,15 +3480,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
// Needs mutex to protect the list of column families.
InstrumentedMutexLock l(&mutex_);
uint64_t value;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
if (!cfd->initialized()) {
continue;
}
cfd->Ref();
ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
// GetIntPropertyInternal may release db mutex and re-acquire it.
mutex_.AssertHeld();
cfd->UnrefAndTryDelete();
if (ret) {
sum += value;
} else {
@ -5091,6 +5077,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
}
}
// TODO: simplify using GetRefedColumnFamilySet?
std::vector<ColumnFamilyData*> cfd_list;
{
InstrumentedMutexLock l(&mutex_);

View File

@ -2992,8 +2992,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
bg_bottom_compaction_scheduled_--;
}
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
// See if there's more work to be done
MaybeScheduleFlushOrCompaction();

View File

@ -1299,6 +1299,10 @@ class VersionSet {
uint64_t min_pending_output);
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
RefedColumnFamilySet GetRefedColumnFamilySet() {
return RefedColumnFamilySet(GetColumnFamilySet());
}
const FileOptions& file_options() { return file_options_; }
void ChangeFileOptions(const MutableDBOptions& new_options) {
file_options_.writable_file_max_buffer_size =