fast look up purge_queue (#5796)

Summary:
purge_queue_ may contain thousands of sst files, for example after manually compacting a range. If a full scan is triggered at the same time and the total number of sst files is large, RocksDB will be blocked at https://github.com/facebook/rocksdb/blob/master/db/db_impl_files.cc#L150 for several seconds. In our environment we have 140,000 sst files; the manual compaction deleted about 1,000 sst files, and the full scan was blocked for about 2 minutes.

Commandeering https://github.com/facebook/rocksdb/issues/5290.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5796

Differential Revision: D17357775

Pulled By: riversand963

fbshipit-source-id: 20eacca917355b8de975ccc7b1c9a3e7bd5b201a
This commit is contained in:
Yi Wu 2019-09-17 16:43:07 -07:00 committed by Facebook Github Bot
parent 9a87ae46fd
commit a68d814570
5 changed files with 41 additions and 55 deletions

View File

@ -1314,32 +1314,28 @@ void DBImpl::SchedulePurge() {
void DBImpl::BackgroundCallPurge() { void DBImpl::BackgroundCallPurge() {
mutex_.Lock(); mutex_.Lock();
// We use one single loop to clear both queues so that after existing the loop while (!logs_to_free_queue_.empty()) {
// both queues are empty. This is stricter than what is needed, but can make assert(!logs_to_free_queue_.empty());
// it easier for us to reason the correctness. log::Writer* log_writer = *(logs_to_free_queue_.begin());
while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) { logs_to_free_queue_.pop_front();
// Check logs_to_free_queue_ first and close log writers. mutex_.Unlock();
if (!logs_to_free_queue_.empty()) { delete log_writer;
assert(!logs_to_free_queue_.empty()); mutex_.Lock();
log::Writer* log_writer = *(logs_to_free_queue_.begin());
logs_to_free_queue_.pop_front();
mutex_.Unlock();
delete log_writer;
mutex_.Lock();
} else {
auto purge_file = purge_queue_.begin();
auto fname = purge_file->fname;
auto dir_to_sync = purge_file->dir_to_sync;
auto type = purge_file->type;
auto number = purge_file->number;
auto job_id = purge_file->job_id;
purge_queue_.pop_front();
mutex_.Unlock();
DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
mutex_.Lock();
}
} }
for (const auto& file : purge_files_) {
const PurgeFileInfo& purge_file = file.second;
const std::string& fname = purge_file.fname;
const std::string& dir_to_sync = purge_file.dir_to_sync;
FileType type = purge_file.type;
uint64_t number = purge_file.number;
int job_id = purge_file.job_id;
mutex_.Unlock();
DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
mutex_.Lock();
}
purge_files_.clear();
bg_purge_scheduled_--; bg_purge_scheduled_--;
bg_cv_.SignalAll(); bg_cv_.SignalAll();

View File

@ -347,7 +347,8 @@ class DBImpl : public DB {
uint64_t* manifest_file_size, uint64_t* manifest_file_size,
bool flush_memtable = true) override; bool flush_memtable = true) override;
virtual Status GetSortedWalFiles(VectorLogPtr& files) override; virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
virtual Status GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) override; virtual Status GetCurrentWalFile(
std::unique_ptr<LogFile>* current_log_file) override;
virtual Status GetUpdatesSince( virtual Status GetUpdatesSince(
SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
@ -1784,12 +1785,12 @@ class DBImpl : public DB {
// ColumnFamilyData::pending_compaction_ == true) // ColumnFamilyData::pending_compaction_ == true)
std::deque<ColumnFamilyData*> compaction_queue_; std::deque<ColumnFamilyData*> compaction_queue_;
// A queue to store filenames of the files to be purged // A map to store file numbers and filenames of the files to be purged
std::deque<PurgeFileInfo> purge_queue_; std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
// A vector to store the file numbers that have been assigned to certain // A vector to store the file numbers that have been assigned to certain
// JobContext. Current implementation tracks ssts only. // JobContext. Current implementation tracks ssts only.
std::vector<uint64_t> files_grabbed_for_purge_; std::unordered_set<uint64_t> files_grabbed_for_purge_;
// A queue to store log writers to close // A queue to store log writers to close
std::deque<log::Writer*> logs_to_free_queue_; std::deque<log::Writer*> logs_to_free_queue_;

View File

@ -2090,7 +2090,7 @@ void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
FileType type, uint64_t number, int job_id) { FileType type, uint64_t number, int job_id) {
mutex_.AssertHeld(); mutex_.AssertHeld();
PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
purge_queue_.push_back(std::move(file_info)); purge_files_.insert({{number, std::move(file_info)}});
} }
void DBImpl::BGWorkFlush(void* arg) { void DBImpl::BGWorkFlush(void* arg) {
@ -3077,34 +3077,20 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
} }
// ShouldPurge is called by FindObsoleteFiles when doing a full scan, // ShouldPurge is called by FindObsoleteFiles when doing a full scan,
// and db mutex (mutex_) should already be held. This function performs a // and db mutex (mutex_) should already be held.
// linear scan of an vector (files_grabbed_for_purge_) in search of a
// certain element. We expect FindObsoleteFiles with full scan to occur once
// every 10 hours by default, and the size of the vector is small.
// Therefore, the cost is affordable even if the mutex is held.
// Actually, the current implementation of FindObsoleteFiles with // Actually, the current implementation of FindObsoleteFiles with
// full_scan=true can issue I/O requests to obtain list of files in // full_scan=true can issue I/O requests to obtain list of files in
// directories, e.g. env_->getChildren while holding db mutex. // directories, e.g. env_->getChildren while holding db mutex.
// In the future, if we want to reduce the cost of search, we may try to keep
// the vector sorted.
bool DBImpl::ShouldPurge(uint64_t file_number) const { bool DBImpl::ShouldPurge(uint64_t file_number) const {
for (auto fn : files_grabbed_for_purge_) { return files_grabbed_for_purge_.find(file_number) ==
if (file_number == fn) { files_grabbed_for_purge_.end() &&
return false; purge_files_.find(file_number) == purge_files_.end();
}
}
for (const auto& purge_file_info : purge_queue_) {
if (purge_file_info.number == file_number) {
return false;
}
}
return true;
} }
// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex // MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
// (mutex_) should already be held. // (mutex_) should already be held.
void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) { void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
files_grabbed_for_purge_.emplace_back(file_number); files_grabbed_for_purge_.insert(file_number);
} }
void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {

View File

@ -15,6 +15,7 @@
#include "db/memtable_list.h" #include "db/memtable_list.h"
#include "file/file_util.h" #include "file/file_util.h"
#include "file/sst_file_manager_impl.h" #include "file/sst_file_manager_impl.h"
#include "util/autovector.h"
namespace rocksdb { namespace rocksdb {
@ -495,13 +496,15 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
// After purging obsolete files, remove them from files_grabbed_for_purge_. // After purging obsolete files, remove them from files_grabbed_for_purge_.
// Use a temporary vector to perform bulk deletion via swap. // Use a temporary vector to perform bulk deletion via swap.
InstrumentedMutexLock guard_lock(&mutex_); InstrumentedMutexLock guard_lock(&mutex_);
std::vector<uint64_t> tmp; autovector<uint64_t> to_be_removed;
for (auto fn : files_grabbed_for_purge_) { for (auto fn : files_grabbed_for_purge_) {
if (files_to_del.count(fn) == 0) { if (files_to_del.count(fn) != 0) {
tmp.emplace_back(fn); to_be_removed.emplace_back(fn);
} }
} }
files_grabbed_for_purge_.swap(tmp); for (auto fn : to_be_removed) {
files_grabbed_for_purge_.erase(fn);
}
} }
// Delete old info log files. // Delete old info log files.

View File

@ -171,8 +171,8 @@ TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
}); });
SyncPoint::GetInstance()->SetCallBack( SyncPoint::GetInstance()->SetCallBack(
"DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) { "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
std::vector<uint64_t>* files_grabbed_for_purge_ptr = std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
reinterpret_cast<std::vector<uint64_t>*>(arg); reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
ASSERT_TRUE(files_grabbed_for_purge_ptr->empty()); ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
}); });
SyncPoint::GetInstance()->EnableProcessing(); SyncPoint::GetInstance()->EnableProcessing();