Fix heap use-after-free race with DropColumnFamily (#9730)
Summary: Although ColumnFamilySet comments say that DB mutex can be freed during iteration, as long as you hold a ref while releasing DB mutex, this is not quite true because UnrefAndTryDelete might delete cfd right before it is needed to get ->next_ for the next iteration of the loop. This change solves the problem by making a wrapper class that makes such iteration easier while handling the tricky details of UnrefAndTryDelete on the previous cfd only after getting next_ in operator++. FreeDeadColumnFamilies should already have been obsolete; this removes it for good. Similarly, ColumnFamilySet::iterator doesn't need to check for cfd with 0 refs, because those are immediately deleted. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730 Test Plan: was reported with ASAN on unit tests like DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching Reviewed By: ltamasi Differential Revision: D35038143 Pulled By: pdillinger fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9
This commit is contained in:
parent
43890774ed
commit
fa99e8afbc
@ -2,6 +2,7 @@
|
||||
## Unreleased
|
||||
### Bug Fixes
|
||||
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
|
||||
* Fixed a heap use-after-free race with DropColumnFamily.
|
||||
|
||||
## 7.1.1 (04/07/2022)
|
||||
### Bug Fixes
|
||||
|
@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
||||
return new_cfd;
|
||||
}
|
||||
|
||||
// REQUIRES: DB mutex held
|
||||
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
||||
autovector<ColumnFamilyData*> to_delete;
|
||||
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
||||
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
|
||||
to_delete.push_back(cfd);
|
||||
}
|
||||
}
|
||||
for (auto cfd : to_delete) {
|
||||
// this is very rare, so it's not a problem that we do it under a mutex
|
||||
delete cfd;
|
||||
}
|
||||
}
|
||||
|
||||
// under a DB mutex AND from a write thread
|
||||
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
||||
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
||||
|
@ -520,9 +520,10 @@ class ColumnFamilyData {
|
||||
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
|
||||
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
|
||||
|
||||
static const uint32_t kDummyColumnFamilyDataId;
|
||||
|
||||
private:
|
||||
friend class ColumnFamilySet;
|
||||
static const uint32_t kDummyColumnFamilyDataId;
|
||||
ColumnFamilyData(uint32_t id, const std::string& name,
|
||||
Version* dummy_versions, Cache* table_cache,
|
||||
WriteBufferManager* write_buffer_manager,
|
||||
@ -628,10 +629,8 @@ class ColumnFamilyData {
|
||||
// held and it needs to be executed from the write thread. SetDropped() also
|
||||
// guarantees that it will be called only from single-threaded LogAndApply(),
|
||||
// but this condition is not that important.
|
||||
// * Iteration -- hold DB mutex, but you can release it in the body of
|
||||
// iteration. If you release DB mutex in body, reference the column
|
||||
// family before the mutex and unreference after you unlock, since the column
|
||||
// family might get dropped when the DB mutex is released
|
||||
// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
|
||||
// body of the iteration, wrap in a RefedColumnFamilySet.
|
||||
// * GetDefault() -- thread safe
|
||||
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
|
||||
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
|
||||
@ -643,17 +642,12 @@ class ColumnFamilySet {
|
||||
public:
|
||||
explicit iterator(ColumnFamilyData* cfd)
|
||||
: current_(cfd) {}
|
||||
// NOTE: minimum operators for for-loop iteration
|
||||
iterator& operator++() {
|
||||
// dropped column families might still be included in this iteration
|
||||
// (we're only removing them when client drops the last reference to the
|
||||
// column family).
|
||||
// dummy is never dead, so this will never be infinite
|
||||
do {
|
||||
current_ = current_->next_;
|
||||
} while (current_->refs_.load(std::memory_order_relaxed) == 0);
|
||||
current_ = current_->next_;
|
||||
return *this;
|
||||
}
|
||||
bool operator!=(const iterator& other) {
|
||||
bool operator!=(const iterator& other) const {
|
||||
return this->current_ != other.current_;
|
||||
}
|
||||
ColumnFamilyData* operator*() { return current_; }
|
||||
@ -692,10 +686,6 @@ class ColumnFamilySet {
|
||||
iterator begin() { return iterator(dummy_cfd_->next_); }
|
||||
iterator end() { return iterator(dummy_cfd_); }
|
||||
|
||||
// REQUIRES: DB mutex held
|
||||
// Don't call while iterating over ColumnFamilySet
|
||||
void FreeDeadColumnFamilies();
|
||||
|
||||
Cache* get_table_cache() { return table_cache_; }
|
||||
|
||||
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
|
||||
@ -738,6 +728,55 @@ class ColumnFamilySet {
|
||||
std::string db_session_id_;
|
||||
};
|
||||
|
||||
// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
|
||||
// iteration over the iterator, because the cfd is Refed and Unrefed during
|
||||
// each iteration to prevent concurrent CF drop from destroying it (until
|
||||
// Unref).
|
||||
class RefedColumnFamilySet {
|
||||
public:
|
||||
explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
|
||||
|
||||
class iterator {
|
||||
public:
|
||||
explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
|
||||
MaybeRef(*wrapped_);
|
||||
}
|
||||
~iterator() { MaybeUnref(*wrapped_); }
|
||||
inline void MaybeRef(ColumnFamilyData* cfd) {
|
||||
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
|
||||
cfd->Ref();
|
||||
}
|
||||
}
|
||||
inline void MaybeUnref(ColumnFamilyData* cfd) {
|
||||
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
|
||||
cfd->UnrefAndTryDelete();
|
||||
}
|
||||
}
|
||||
// NOTE: minimum operators for for-loop iteration
|
||||
inline iterator& operator++() {
|
||||
ColumnFamilyData* old = *wrapped_;
|
||||
++wrapped_;
|
||||
// Can only unref & potentially free cfd after accessing its next_
|
||||
MaybeUnref(old);
|
||||
MaybeRef(*wrapped_);
|
||||
return *this;
|
||||
}
|
||||
inline bool operator!=(const iterator& other) const {
|
||||
return this->wrapped_ != other.wrapped_;
|
||||
}
|
||||
inline ColumnFamilyData* operator*() { return *wrapped_; }
|
||||
|
||||
private:
|
||||
ColumnFamilySet::iterator wrapped_;
|
||||
};
|
||||
|
||||
iterator begin() { return iterator(wrapped_->begin()); }
|
||||
iterator end() { return iterator(wrapped_->end()); }
|
||||
|
||||
private:
|
||||
ColumnFamilySet* wrapped_;
|
||||
};
|
||||
|
||||
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
|
||||
// memtables of different column families (specified by ID in the write batch)
|
||||
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
|
||||
|
@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
|
||||
}
|
||||
mutex_.Lock();
|
||||
} else {
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||
if (cfd->IsDropped()) {
|
||||
continue;
|
||||
}
|
||||
cfd->Ref();
|
||||
mutex_.Unlock();
|
||||
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
|
||||
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
|
||||
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
|
||||
mutex_.Lock();
|
||||
cfd->UnrefAndTryDelete();
|
||||
if (!status.ok() && !status.IsColumnFamilyDropped()) {
|
||||
break;
|
||||
} else if (status.IsColumnFamilyDropped()) {
|
||||
@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
|
||||
}
|
||||
}
|
||||
}
|
||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
||||
return status;
|
||||
}
|
||||
|
||||
|
@ -375,15 +375,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
||||
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
|
||||
mutex_.Lock();
|
||||
} else {
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||
if (cfd->IsDropped()) {
|
||||
continue;
|
||||
}
|
||||
cfd->Ref();
|
||||
mutex_.Unlock();
|
||||
InstrumentedMutexUnlock u(&mutex_);
|
||||
s = FlushMemTable(cfd, flush_opts, context.flush_reason);
|
||||
mutex_.Lock();
|
||||
cfd->UnrefAndTryDelete();
|
||||
if (!s.ok()) {
|
||||
break;
|
||||
}
|
||||
@ -495,18 +492,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
|
||||
s.PermitUncheckedError(); //**TODO: What to do on error?
|
||||
mutex_.Lock();
|
||||
} else {
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
|
||||
cfd->Ref();
|
||||
mutex_.Unlock();
|
||||
InstrumentedMutexUnlock u(&mutex_);
|
||||
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
|
||||
s.PermitUncheckedError(); //**TODO: What to do on error?
|
||||
mutex_.Lock();
|
||||
cfd->UnrefAndTryDelete();
|
||||
}
|
||||
}
|
||||
}
|
||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
||||
}
|
||||
|
||||
shutting_down_.store(true, std::memory_order_release);
|
||||
@ -969,18 +962,13 @@ void DBImpl::DumpStats() {
|
||||
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
||||
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||
if (cfd->initialized()) {
|
||||
// Release DB mutex for gathering cache entry stats. Pass over all
|
||||
// column families for this first so that other stats are dumped
|
||||
// near-atomically.
|
||||
// Get a ref before unlocking
|
||||
cfd->Ref();
|
||||
{
|
||||
InstrumentedMutexUnlock u(&mutex_);
|
||||
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
|
||||
}
|
||||
cfd->UnrefAndTryDelete();
|
||||
InstrumentedMutexUnlock u(&mutex_);
|
||||
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3490,15 +3478,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
|
||||
// Needs mutex to protect the list of column families.
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
uint64_t value;
|
||||
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
||||
for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
|
||||
if (!cfd->initialized()) {
|
||||
continue;
|
||||
}
|
||||
cfd->Ref();
|
||||
ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
|
||||
// GetIntPropertyInternal may release db mutex and re-acquire it.
|
||||
mutex_.AssertHeld();
|
||||
cfd->UnrefAndTryDelete();
|
||||
if (ret) {
|
||||
sum += value;
|
||||
} else {
|
||||
@ -5089,6 +5075,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: simplify using GetRefedColumnFamilySet?
|
||||
std::vector<ColumnFamilyData*> cfd_list;
|
||||
{
|
||||
InstrumentedMutexLock l(&mutex_);
|
||||
|
@ -2973,8 +2973,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
|
||||
bg_bottom_compaction_scheduled_--;
|
||||
}
|
||||
|
||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
||||
|
||||
// See if there's more work to be done
|
||||
MaybeScheduleFlushOrCompaction();
|
||||
|
||||
|
@ -1299,6 +1299,10 @@ class VersionSet {
|
||||
uint64_t min_pending_output);
|
||||
|
||||
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
|
||||
RefedColumnFamilySet GetRefedColumnFamilySet() {
|
||||
return RefedColumnFamilySet(GetColumnFamilySet());
|
||||
}
|
||||
|
||||
const FileOptions& file_options() { return file_options_; }
|
||||
void ChangeFileOptions(const MutableDBOptions& new_options) {
|
||||
file_options_.writable_file_max_buffer_size =
|
||||
|
Loading…
Reference in New Issue
Block a user