rocksdb/db/memtable_list.cc
Andres Noetzli 2050832974 Fixing race condition in DBTest.DynamicMemtableOptions
Summary:
This patch fixes a race condition in DBTEst.DynamicMemtableOptions. In rare cases,
it was possible that the main thread would fill up both memtables before the flush
job acquired its work. Then, the flush job was flushing both memtables together,
producing only one L0 file while the test expected two. Now, the test waits for
flushes to finish earlier, to make sure that the memtables are flushed in separate
flush jobs.

Test Plan:
Insert "usleep(10000);" after "IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);" in BGWorkFlush()
to make the issue more likely. Then test with:
make db_test && time while ./db_test --gtest_filter=*DynamicMemtableOptions; do true; done

Reviewers: rven, sdong, yhchiang, anthony, igor

Reviewed By: igor

Subscribers: dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D45429
2015-08-24 17:04:18 -07:00

396 lines
13 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/memtable_list.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <string>
#include "rocksdb/db.h"
#include "db/memtable.h"
#include "db/version_set.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "table/merger.h"
#include "util/coding.h"
#include "util/log_buffer.h"
#include "util/thread_status_util.h"
namespace rocksdb {
class InternalKeyComparator;
class Mutex;
class VersionSet;
void MemTableListVersion::AddMemTable(MemTable* m) {
memlist_.push_front(m);
*parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
}
void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
MemTable* m) {
if (m->Unref()) {
to_delete->push_back(m);
assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
*parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
} else {
}
}
MemTableListVersion::MemTableListVersion(
size_t* parent_memtable_list_memory_usage, MemTableListVersion* old)
: max_write_buffer_number_to_maintain_(
old->max_write_buffer_number_to_maintain_),
parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
if (old != nullptr) {
memlist_ = old->memlist_;
for (auto& m : memlist_) {
m->Ref();
}
memlist_history_ = old->memlist_history_;
for (auto& m : memlist_history_) {
m->Ref();
}
}
}
MemTableListVersion::MemTableListVersion(
size_t* parent_memtable_list_memory_usage,
int max_write_buffer_number_to_maintain)
: max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
void MemTableListVersion::Ref() { ++refs_; }
// called by superversion::clean()
void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
assert(refs_ >= 1);
--refs_;
if (refs_ == 0) {
// if to_delete is equal to nullptr it means we're confident
// that refs_ will not be zero
assert(to_delete != nullptr);
for (const auto& m : memlist_) {
UnrefMemTable(to_delete, m);
}
for (const auto& m : memlist_history_) {
UnrefMemTable(to_delete, m);
}
delete this;
}
}
int MemTableList::NumNotFlushed() const {
int size = static_cast<int>(current_->memlist_.size());
assert(num_flush_not_started_ <= size);
return size;
}
int MemTableList::NumFlushed() const {
return static_cast<int>(current_->memlist_history_.size());
}
// Search all the memtables starting from the most recent one.
// Return the most recent value found, if any.
// Operands stores the list of merge operations to apply, so far.
bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
Status* s, MergeContext* merge_context,
SequenceNumber* seq) {
return GetFromList(&memlist_, key, value, s, merge_context, seq);
}
bool MemTableListVersion::GetFromHistory(const LookupKey& key,
std::string* value, Status* s,
MergeContext* merge_context,
SequenceNumber* seq) {
return GetFromList(&memlist_history_, key, value, s, merge_context, seq);
}
bool MemTableListVersion::GetFromList(std::list<MemTable*>* list,
const LookupKey& key, std::string* value,
Status* s, MergeContext* merge_context,
SequenceNumber* seq) {
*seq = kMaxSequenceNumber;
for (auto& memtable : *list) {
SequenceNumber current_seq = kMaxSequenceNumber;
bool done = memtable->Get(key, value, s, merge_context, &current_seq);
if (*seq == kMaxSequenceNumber) {
// Store the most recent sequence number of any operation on this key.
// Since we only care about the most recent change, we only need to
// return the first operation found when searching memtables in
// reverse-chronological order.
*seq = current_seq;
}
if (done) {
assert(*seq != kMaxSequenceNumber);
return true;
}
}
return false;
}
void MemTableListVersion::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list,
Arena* arena) {
for (auto& m : memlist_) {
iterator_list->push_back(m->NewIterator(options, arena));
}
}
void MemTableListVersion::AddIterators(
const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder) {
for (auto& m : memlist_) {
merge_iter_builder->AddIterator(
m->NewIterator(options, merge_iter_builder->GetArena()));
}
}
uint64_t MemTableListVersion::GetTotalNumEntries() const {
uint64_t total_num = 0;
for (auto& m : memlist_) {
total_num += m->num_entries();
}
return total_num;
}
uint64_t MemTableListVersion::ApproximateSize(const Slice& start_ikey,
const Slice& end_ikey) {
uint64_t total_size = 0;
for (auto& m : memlist_) {
total_size += m->ApproximateSize(start_ikey, end_ikey);
}
return total_size;
}
uint64_t MemTableListVersion::GetTotalNumDeletes() const {
uint64_t total_num = 0;
for (auto& m : memlist_) {
total_num += m->num_deletes();
}
return total_num;
}
SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
bool include_history) const {
if (include_history && !memlist_history_.empty()) {
return memlist_history_.back()->GetEarliestSequenceNumber();
} else if (!memlist_.empty()) {
return memlist_.back()->GetEarliestSequenceNumber();
} else {
return kMaxSequenceNumber;
}
}
// caller is responsible for referencing m
void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
AddMemTable(m);
TrimHistory(to_delete);
}
// Removes m from list of memtables not flushed. Caller should NOT Unref m.
void MemTableListVersion::Remove(MemTable* m,
autovector<MemTable*>* to_delete) {
assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
memlist_.remove(m);
if (max_write_buffer_number_to_maintain_ > 0) {
memlist_history_.push_front(m);
TrimHistory(to_delete);
} else {
UnrefMemTable(to_delete, m);
}
}
// Make sure we don't use up too much space in history
void MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete) {
while (memlist_.size() + memlist_history_.size() >
static_cast<size_t>(max_write_buffer_number_to_maintain_) &&
!memlist_history_.empty()) {
MemTable* x = memlist_history_.back();
memlist_history_.pop_back();
UnrefMemTable(to_delete, x);
}
}
// Returns true if there is at least one memtable on which flush has
// not yet started.
bool MemTableList::IsFlushPending() const {
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
(num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
assert(imm_flush_needed.load(std::memory_order_relaxed));
return true;
}
return false;
}
// Returns the memtables that need to be flushed.
void MemTableList::PickMemtablesToFlush(autovector<MemTable*>* ret) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
const auto& memlist = current_->memlist_;
for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
MemTable* m = *it;
if (!m->flush_in_progress_) {
assert(!m->flush_completed_);
num_flush_not_started_--;
if (num_flush_not_started_ == 0) {
imm_flush_needed.store(false, std::memory_order_release);
}
m->flush_in_progress_ = true; // flushing will start very soon
ret->push_back(m);
}
}
flush_requested_ = false; // start-flush request is complete
}
void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
uint64_t file_number) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
assert(!mems.empty());
// If the flush was not successful, then just reset state.
// Maybe a succeeding attempt to flush will be successful.
for (MemTable* m : mems) {
assert(m->flush_in_progress_);
assert(m->file_number_ == 0);
m->flush_in_progress_ = false;
m->flush_completed_ = false;
m->edit_.Clear();
num_flush_not_started_++;
}
imm_flush_needed.store(true, std::memory_order_release);
}
// Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults(
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
const autovector<MemTable*>& mems, VersionSet* vset, InstrumentedMutex* mu,
uint64_t file_number, autovector<MemTable*>* to_delete,
Directory* db_directory, LogBuffer* log_buffer) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
mu->AssertHeld();
// flush was successful
for (size_t i = 0; i < mems.size(); ++i) {
// All the edits are associated with the first memtable of this batch.
assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
mems[i]->flush_completed_ = true;
mems[i]->file_number_ = file_number;
}
// if some other thread is already committing, then return
Status s;
if (commit_in_progress_) {
return s;
}
// Only a single thread can be executing this piece of code
commit_in_progress_ = true;
// scan all memtables from the earliest, and commit those
// (in that order) that have finished flushing. Memetables
// are always committed in the order that they were created.
while (!current_->memlist_.empty() && s.ok()) {
MemTable* m = current_->memlist_.back(); // get the last element
if (!m->flush_completed_) {
break;
}
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started",
cfd->GetName().c_str(), m->file_number_);
// this can release and reacquire the mutex.
s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory);
// we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable
InstallNewVersion();
// All the later memtables that have the same filenum
// are part of the same batch. They can be committed now.
uint64_t mem_id = 1; // how many memtables have been flushed.
do {
if (s.ok()) { // commit new state
LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " done",
cfd->GetName().c_str(), m->file_number_, mem_id);
assert(m->file_number_ > 0);
current_->Remove(m, to_delete);
} else {
// commit failed. setup state so that we can flush again.
LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
": memtable #%" PRIu64 " failed",
m->file_number_, mem_id);
m->flush_completed_ = false;
m->flush_in_progress_ = false;
m->edit_.Clear();
num_flush_not_started_++;
m->file_number_ = 0;
imm_flush_needed.store(true, std::memory_order_release);
}
++mem_id;
} while (!current_->memlist_.empty() && (m = current_->memlist_.back()) &&
m->file_number_ == file_number);
}
commit_in_progress_ = false;
return s;
}
// New memtables are inserted at the front of the list.
void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
InstallNewVersion();
// this method is used to move mutable memtable into an immutable list.
// since mutable memtable is already refcounted by the DBImpl,
// and when moving to the imutable list we don't unref it,
// we don't have to ref the memtable here. we just take over the
// reference from the DBImpl.
current_->Add(m, to_delete);
m->MarkImmutable();
num_flush_not_started_++;
if (num_flush_not_started_ == 1) {
imm_flush_needed.store(true, std::memory_order_release);
}
}
// Returns an estimate of the number of bytes of data in use.
size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
size_t total_size = 0;
for (auto& memtable : current_->memlist_) {
total_size += memtable->ApproximateMemoryUsage();
}
return total_size;
}
size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
void MemTableList::InstallNewVersion() {
if (current_->refs_ == 1) {
// we're the only one using the version, just keep using it
} else {
// somebody else holds the current version, we need to create new one
MemTableListVersion* version = current_;
current_ = new MemTableListVersion(&current_memory_usage_, current_);
current_->Ref();
version->Unref();
}
}
} // namespace rocksdb