Fix heap use-after-free race with DropColumnFamily (#9730)
Summary: Although ColumnFamilySet comments say that DB mutex can be freed during iteration, as long as you hold a ref while releasing DB mutex, this is not quite true because UnrefAndTryDelete might delete cfd right before it is needed to get ->next_ for the next iteration of the loop. This change solves the problem by making a wrapper class that makes such iteration easier while handling the tricky details of UnrefAndTryDelete on the previous cfd only after getting next_ in operator++. FreeDeadColumnFamilies should already have been obsolete; this removes it for good. Similarly, ColumnFamilySet::iterator doesn't need to check for cfd with 0 refs, because those are immediately deleted. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730 Test Plan: was reported with ASAN on unit tests like DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching Reviewed By: ltamasi Differential Revision: D35038143 Pulled By: pdillinger fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9
This commit is contained in:
parent
2de3bfc7d1
commit
8b2cfef9e0
@ -5,6 +5,7 @@
|
|||||||
* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
|
* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
|
||||||
* Fixed a race condition when mmaping a WritableFile on POSIX.
|
* Fixed a race condition when mmaping a WritableFile on POSIX.
|
||||||
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
|
* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail.
|
||||||
|
* Fixed a heap use-after-free race with DropColumnFamily.
|
||||||
|
|
||||||
## 7.0.3 (03/23/2022)
|
## 7.0.3 (03/23/2022)
|
||||||
### Bug Fixes
|
### Bug Fixes
|
||||||
|
@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
|||||||
return new_cfd;
|
return new_cfd;
|
||||||
}
|
}
|
||||||
|
|
||||||
// REQUIRES: DB mutex held
|
|
||||||
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
|
||||||
autovector<ColumnFamilyData*> to_delete;
|
|
||||||
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
|
||||||
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
|
|
||||||
to_delete.push_back(cfd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto cfd : to_delete) {
|
|
||||||
// this is very rare, so it's not a problem that we do it under a mutex
|
|
||||||
delete cfd;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// under a DB mutex AND from a write thread
|
// under a DB mutex AND from a write thread
|
||||||
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
||||||
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
||||||
|
@ -519,9 +519,10 @@ class ColumnFamilyData {
|
|||||||
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
|
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
|
||||||
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
|
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
|
||||||
|
|
||||||
|
static const uint32_t kDummyColumnFamilyDataId;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class ColumnFamilySet;
|
friend class ColumnFamilySet;
|
||||||
static const uint32_t kDummyColumnFamilyDataId;
|
|
||||||
ColumnFamilyData(uint32_t id, const std::string& name,
|
ColumnFamilyData(uint32_t id, const std::string& name,
|
||||||
Version* dummy_versions, Cache* table_cache,
|
Version* dummy_versions, Cache* table_cache,
|
||||||
WriteBufferManager* write_buffer_manager,
|
WriteBufferManager* write_buffer_manager,
|
||||||
@ -627,10 +628,8 @@ class ColumnFamilyData {
|
|||||||
// held and it needs to be executed from the write thread. SetDropped() also
|
// held and it needs to be executed from the write thread. SetDropped() also
|
||||||
// guarantees that it will be called only from single-threaded LogAndApply(),
|
// guarantees that it will be called only from single-threaded LogAndApply(),
|
||||||
// but this condition is not that important.
|
// but this condition is not that important.
|
||||||
// * Iteration -- hold DB mutex, but you can release it in the body of
|
// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
|
||||||
// iteration. If you release DB mutex in body, reference the column
|
// body of the iteration, wrap in a RefedColumnFamilySet.
|
||||||
// family before the mutex and unreference after you unlock, since the column
|
|
||||||
// family might get dropped when the DB mutex is released
|
|
||||||
// * GetDefault() -- thread safe
|
// * GetDefault() -- thread safe
|
||||||
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
|
// * GetColumnFamily() -- either inside of DB mutex or from a write thread
|
||||||
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
|
// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
|
||||||
@ -642,17 +641,12 @@ class ColumnFamilySet {
|
|||||||
public:
|
public:
|
||||||
explicit iterator(ColumnFamilyData* cfd)
|
explicit iterator(ColumnFamilyData* cfd)
|
||||||
: current_(cfd) {}
|
: current_(cfd) {}
|
||||||
|
// NOTE: minimum operators for for-loop iteration
|
||||||
iterator& operator++() {
|
iterator& operator++() {
|
||||||
// dropped column families might still be included in this iteration
|
current_ = current_->next_;
|
||||||
// (we're only removing them when client drops the last reference to the
|
|
||||||
// column family).
|
|
||||||
// dummy is never dead, so this will never be infinite
|
|
||||||
do {
|
|
||||||
current_ = current_->next_;
|
|
||||||
} while (current_->refs_.load(std::memory_order_relaxed) == 0);
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
bool operator!=(const iterator& other) {
|
bool operator!=(const iterator& other) const {
|
||||||
return this->current_ != other.current_;
|
return this->current_ != other.current_;
|
||||||
}
|
}
|
||||||
ColumnFamilyData* operator*() { return current_; }
|
ColumnFamilyData* operator*() { return current_; }
|
||||||
@ -691,10 +685,6 @@ class ColumnFamilySet {
|
|||||||
iterator begin() { return iterator(dummy_cfd_->next_); }
|
iterator begin() { return iterator(dummy_cfd_->next_); }
|
||||||
iterator end() { return iterator(dummy_cfd_); }
|
iterator end() { return iterator(dummy_cfd_); }
|
||||||
|
|
||||||
// REQUIRES: DB mutex held
|
|
||||||
// Don't call while iterating over ColumnFamilySet
|
|
||||||
void FreeDeadColumnFamilies();
|
|
||||||
|
|
||||||
Cache* get_table_cache() { return table_cache_; }
|
Cache* get_table_cache() { return table_cache_; }
|
||||||
|
|
||||||
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
|
WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
|
||||||
@ -737,6 +727,55 @@ class ColumnFamilySet {
|
|||||||
std::string db_session_id_;
|
std::string db_session_id_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
|
||||||
|
// iteration over the iterator, because the cfd is Refed and Unrefed during
|
||||||
|
// each iteration to prevent concurrent CF drop from destroying it (until
|
||||||
|
// Unref).
|
||||||
|
class RefedColumnFamilySet {
|
||||||
|
public:
|
||||||
|
explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
|
||||||
|
|
||||||
|
class iterator {
|
||||||
|
public:
|
||||||
|
explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
|
||||||
|
MaybeRef(*wrapped_);
|
||||||
|
}
|
||||||
|
~iterator() { MaybeUnref(*wrapped_); }
|
||||||
|
inline void MaybeRef(ColumnFamilyData* cfd) {
|
||||||
|
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
|
||||||
|
cfd->Ref();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline void MaybeUnref(ColumnFamilyData* cfd) {
|
||||||
|
if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
|
||||||
|
cfd->UnrefAndTryDelete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// NOTE: minimum operators for for-loop iteration
|
||||||
|
inline iterator& operator++() {
|
||||||
|
ColumnFamilyData* old = *wrapped_;
|
||||||
|
++wrapped_;
|
||||||
|
// Can only unref & potentially free cfd after accessing its next_
|
||||||
|
MaybeUnref(old);
|
||||||
|
MaybeRef(*wrapped_);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
inline bool operator!=(const iterator& other) const {
|
||||||
|
return this->wrapped_ != other.wrapped_;
|
||||||
|
}
|
||||||
|
inline ColumnFamilyData* operator*() { return *wrapped_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
ColumnFamilySet::iterator wrapped_;
|
||||||
|
};
|
||||||
|
|
||||||
|
iterator begin() { return iterator(wrapped_->begin()); }
|
||||||
|
iterator end() { return iterator(wrapped_->end()); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
ColumnFamilySet* wrapped_;
|
||||||
|
};
|
||||||
|
|
||||||
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
|
// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
|
||||||
// memtables of different column families (specified by ID in the write batch)
|
// memtables of different column families (specified by ID in the write batch)
|
||||||
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
|
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
|
||||||
|
@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() {
|
|||||||
}
|
}
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
} else {
|
} else {
|
||||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||||
if (cfd->IsDropped()) {
|
if (cfd->IsDropped()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cfd->Ref();
|
|
||||||
mutex_.Unlock();
|
mutex_.Unlock();
|
||||||
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
|
status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
|
||||||
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
|
TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
|
||||||
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
|
TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
cfd->UnrefAndTryDelete();
|
|
||||||
if (!status.ok() && !status.IsColumnFamilyDropped()) {
|
if (!status.ok() && !status.IsColumnFamilyDropped()) {
|
||||||
break;
|
break;
|
||||||
} else if (status.IsColumnFamilyDropped()) {
|
} else if (status.IsColumnFamilyDropped()) {
|
||||||
@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -377,15 +377,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
|
|||||||
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
|
s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
} else {
|
} else {
|
||||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||||
if (cfd->IsDropped()) {
|
if (cfd->IsDropped()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cfd->Ref();
|
InstrumentedMutexUnlock u(&mutex_);
|
||||||
mutex_.Unlock();
|
|
||||||
s = FlushMemTable(cfd, flush_opts, context.flush_reason);
|
s = FlushMemTable(cfd, flush_opts, context.flush_reason);
|
||||||
mutex_.Lock();
|
|
||||||
cfd->UnrefAndTryDelete();
|
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -497,18 +494,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
|
|||||||
s.PermitUncheckedError(); //**TODO: What to do on error?
|
s.PermitUncheckedError(); //**TODO: What to do on error?
|
||||||
mutex_.Lock();
|
mutex_.Lock();
|
||||||
} else {
|
} else {
|
||||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||||
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
|
if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
|
||||||
cfd->Ref();
|
InstrumentedMutexUnlock u(&mutex_);
|
||||||
mutex_.Unlock();
|
|
||||||
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
|
Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
|
||||||
s.PermitUncheckedError(); //**TODO: What to do on error?
|
s.PermitUncheckedError(); //**TODO: What to do on error?
|
||||||
mutex_.Lock();
|
|
||||||
cfd->UnrefAndTryDelete();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
shutting_down_.store(true, std::memory_order_release);
|
shutting_down_.store(true, std::memory_order_release);
|
||||||
@ -971,18 +964,13 @@ void DBImpl::DumpStats() {
|
|||||||
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
|
TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
|
||||||
{
|
{
|
||||||
InstrumentedMutexLock l(&mutex_);
|
InstrumentedMutexLock l(&mutex_);
|
||||||
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
for (auto cfd : versions_->GetRefedColumnFamilySet()) {
|
||||||
if (cfd->initialized()) {
|
if (cfd->initialized()) {
|
||||||
// Release DB mutex for gathering cache entry stats. Pass over all
|
// Release DB mutex for gathering cache entry stats. Pass over all
|
||||||
// column families for this first so that other stats are dumped
|
// column families for this first so that other stats are dumped
|
||||||
// near-atomically.
|
// near-atomically.
|
||||||
// Get a ref before unlocking
|
InstrumentedMutexUnlock u(&mutex_);
|
||||||
cfd->Ref();
|
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
|
||||||
{
|
|
||||||
InstrumentedMutexUnlock u(&mutex_);
|
|
||||||
cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
|
|
||||||
}
|
|
||||||
cfd->UnrefAndTryDelete();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -3492,15 +3480,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property,
|
|||||||
// Needs mutex to protect the list of column families.
|
// Needs mutex to protect the list of column families.
|
||||||
InstrumentedMutexLock l(&mutex_);
|
InstrumentedMutexLock l(&mutex_);
|
||||||
uint64_t value;
|
uint64_t value;
|
||||||
for (auto* cfd : *versions_->GetColumnFamilySet()) {
|
for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
|
||||||
if (!cfd->initialized()) {
|
if (!cfd->initialized()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
cfd->Ref();
|
|
||||||
ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
|
ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
|
||||||
// GetIntPropertyInternal may release db mutex and re-acquire it.
|
// GetIntPropertyInternal may release db mutex and re-acquire it.
|
||||||
mutex_.AssertHeld();
|
mutex_.AssertHeld();
|
||||||
cfd->UnrefAndTryDelete();
|
|
||||||
if (ret) {
|
if (ret) {
|
||||||
sum += value;
|
sum += value;
|
||||||
} else {
|
} else {
|
||||||
@ -5091,6 +5077,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: simplify using GetRefedColumnFamilySet?
|
||||||
std::vector<ColumnFamilyData*> cfd_list;
|
std::vector<ColumnFamilyData*> cfd_list;
|
||||||
{
|
{
|
||||||
InstrumentedMutexLock l(&mutex_);
|
InstrumentedMutexLock l(&mutex_);
|
||||||
|
@ -2992,8 +2992,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
|
|||||||
bg_bottom_compaction_scheduled_--;
|
bg_bottom_compaction_scheduled_--;
|
||||||
}
|
}
|
||||||
|
|
||||||
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
||||||
|
|
||||||
// See if there's more work to be done
|
// See if there's more work to be done
|
||||||
MaybeScheduleFlushOrCompaction();
|
MaybeScheduleFlushOrCompaction();
|
||||||
|
|
||||||
|
@ -1299,6 +1299,10 @@ class VersionSet {
|
|||||||
uint64_t min_pending_output);
|
uint64_t min_pending_output);
|
||||||
|
|
||||||
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
|
ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
|
||||||
|
RefedColumnFamilySet GetRefedColumnFamilySet() {
|
||||||
|
return RefedColumnFamilySet(GetColumnFamilySet());
|
||||||
|
}
|
||||||
|
|
||||||
const FileOptions& file_options() { return file_options_; }
|
const FileOptions& file_options() { return file_options_; }
|
||||||
void ChangeFileOptions(const MutableDBOptions& new_options) {
|
void ChangeFileOptions(const MutableDBOptions& new_options) {
|
||||||
file_options_.writable_file_max_buffer_size =
|
file_options_.writable_file_max_buffer_size =
|
||||||
|
Loading…
Reference in New Issue
Block a user