dynamically change current memtable size

Summary:
Previously, setting `write_buffer_size` with `SetOptions()` would only apply to new memtables. An internal user wanted it to take effect immediately, instead of at an arbitrary future point, to prevent OOM.

This PR makes the memtable's size mutable, and makes `SetOptions()` mutate it. There is one case in which we preserve the old behavior: when the memtable prefix bloom filter is enabled and the user is increasing the memtable's capacity. That's because the prefix bloom filter's size is fixed and wouldn't work as well on a larger memtable.
Closes https://github.com/facebook/rocksdb/pull/3119

Differential Revision: D6228304

Pulled By: ajkr

fbshipit-source-id: e44bd9d10a5f8c9d8c464bf7436070bb3eafdfc9
This commit is contained in:
Andrew Kryczka 2017-11-02 22:16:23 -07:00 committed by Facebook Github Bot
parent 30e4e01e05
commit c4c1f961e7
5 changed files with 58 additions and 26 deletions

View File

@ -949,6 +949,10 @@ void ColumnFamilyData::InstallSuperVersion(
RecalculateWriteStallConditions(mutable_cf_options); RecalculateWriteStallConditions(mutable_cf_options);
if (old_superversion != nullptr) { if (old_superversion != nullptr) {
if (old_superversion->mutable_cf_options.write_buffer_size !=
mutable_cf_options.write_buffer_size) {
mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
}
if (old_superversion->write_stall_condition != if (old_superversion->write_stall_condition !=
new_superversion->write_stall_condition) { new_superversion->write_stall_condition) {
sv_context->PushWriteStallNotification( sv_context->PushWriteStallNotification(

View File

@ -3354,11 +3354,23 @@ TEST_F(DBTest, DynamicMemtableOptions) {
{"write_buffer_size", "131072"}, {"write_buffer_size", "131072"},
})); }));
// The existing memtable is still 64KB in size, after it becomes immutable, // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
// the next memtable will be 128KB in size. Write 256KB total, we should // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
// have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data gen_l0_kb(192);
gen_l0_kb(256); ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
ASSERT_EQ(NumTableFilesAtLevel(0), 2); // (A) ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
// Decrease buffer size below current usage
ASSERT_OK(dbfull()->SetOptions({
{"write_buffer_size", "65536"},
}));
// The existing memtable became eligible for flush when we reduced its
// capacity to 64KB. Two keys need to be added to trigger flush: first causes
// memtable to be marked full, second schedules the flush. Then we should have
// a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
gen_l0_kb(2);
ASSERT_EQ(NumTableFilesAtLevel(0), 2);
ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB); ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);

View File

@ -39,10 +39,10 @@
namespace rocksdb { namespace rocksdb {
MemTableOptions::MemTableOptions(const ImmutableCFOptions& ioptions, ImmutableMemTableOptions::ImmutableMemTableOptions(
const MutableCFOptions& mutable_cf_options) const ImmutableCFOptions& ioptions,
: write_buffer_size(mutable_cf_options.write_buffer_size), const MutableCFOptions& mutable_cf_options)
arena_block_size(mutable_cf_options.arena_block_size), : arena_block_size(mutable_cf_options.arena_block_size),
memtable_prefix_bloom_bits( memtable_prefix_bloom_bits(
static_cast<uint32_t>( static_cast<uint32_t>(
static_cast<double>(mutable_cf_options.write_buffer_size) * static_cast<double>(mutable_cf_options.write_buffer_size) *
@ -83,6 +83,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
data_size_(0), data_size_(0),
num_entries_(0), num_entries_(0),
num_deletes_(0), num_deletes_(0),
write_buffer_size_(mutable_cf_options.write_buffer_size),
flush_in_progress_(false), flush_in_progress_(false),
flush_completed_(false), flush_completed_(false),
file_number_(0), file_number_(0),
@ -136,6 +137,7 @@ size_t MemTable::ApproximateMemoryUsage() {
} }
bool MemTable::ShouldFlushNow() const { bool MemTable::ShouldFlushNow() const {
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
// In a lot of times, we cannot allocate arena blocks that exactly matches the // In a lot of times, we cannot allocate arena blocks that exactly matches the
// buffer size. Thus we have to decide if we should over-allocate or // buffer size. Thus we have to decide if we should over-allocate or
// under-allocate. // under-allocate.
@ -153,16 +155,14 @@ bool MemTable::ShouldFlushNow() const {
// if we can still allocate one more block without exceeding the // if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush. // over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize < if (allocated_memory + kArenaBlockSize <
moptions_.write_buffer_size + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
kArenaBlockSize * kAllowOverAllocationRatio) {
return false; return false;
} }
// if user keeps adding entries that exceeds moptions.write_buffer_size, // if user keeps adding entries that exceeds write_buffer_size, we need to
// we need to flush earlier even though we still have much available // flush earlier even though we still have much available memory left.
// memory left. if (allocated_memory >
if (allocated_memory > moptions_.write_buffer_size + write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
kArenaBlockSize * kAllowOverAllocationRatio) {
return true; return true;
} }
@ -265,7 +265,8 @@ class MemTableIterator : public InternalIterator {
comparator_(mem.comparator_), comparator_(mem.comparator_),
valid_(false), valid_(false),
arena_mode_(arena != nullptr), arena_mode_(arena != nullptr),
value_pinned_(!mem.GetMemTableOptions()->inplace_update_support) { value_pinned_(
!mem.GetImmutableMemTableOptions()->inplace_update_support) {
if (use_range_del_table) { if (use_range_del_table) {
iter_ = mem.range_del_table_->GetIterator(arena); iter_ = mem.range_del_table_->GetIterator(arena);
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {

View File

@ -36,11 +36,9 @@ class MemTableIterator;
class MergeContext; class MergeContext;
class InternalIterator; class InternalIterator;
struct MemTableOptions { struct ImmutableMemTableOptions {
explicit MemTableOptions( explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options);
const MutableCFOptions& mutable_cf_options);
size_t write_buffer_size;
size_t arena_block_size; size_t arena_block_size;
uint32_t memtable_prefix_bloom_bits; uint32_t memtable_prefix_bloom_bits;
size_t memtable_huge_page_size; size_t memtable_huge_page_size;
@ -262,6 +260,18 @@ class MemTable {
return num_deletes_.load(std::memory_order_relaxed); return num_deletes_.load(std::memory_order_relaxed);
} }
// Adjusts this memtable's capacity in place. ShouldFlushNow() consults the
// stored value, so shrinking the capacity below the memory already in use
// makes the memtable eligible for flush on subsequent writes. While a prefix
// bloom filter exists, only a strictly smaller capacity is accepted: the
// filter was sized from the original write_buffer_size and is not resized
// here, so any other request is ignored.
void UpdateWriteBufferSize(size_t new_write_buffer_size) {
  if (prefix_bloom_ != nullptr &&
      new_write_buffer_size >= write_buffer_size_) {
    // Keep the current capacity; the bloom filter cannot grow along with it.
    return;
  }
  write_buffer_size_.store(new_write_buffer_size, std::memory_order_relaxed);
}
// Returns the edits area that is needed for flushing the memtable // Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; } VersionEdit* GetEdits() { return &edit_; }
@ -350,7 +360,9 @@ class MemTable {
return comparator_.comparator; return comparator_.comparator;
} }
const MemTableOptions* GetMemTableOptions() const { return &moptions_; } const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
return &moptions_;
}
uint64_t ApproximateOldestKeyTime() const { uint64_t ApproximateOldestKeyTime() const {
return oldest_key_time_.load(std::memory_order_relaxed); return oldest_key_time_.load(std::memory_order_relaxed);
@ -364,7 +376,7 @@ class MemTable {
friend class MemTableList; friend class MemTableList;
KeyComparator comparator_; KeyComparator comparator_;
const MemTableOptions moptions_; const ImmutableMemTableOptions moptions_;
int refs_; int refs_;
const size_t kArenaBlockSize; const size_t kArenaBlockSize;
AllocTracker mem_tracker_; AllocTracker mem_tracker_;
@ -378,6 +390,9 @@ class MemTable {
std::atomic<uint64_t> num_entries_; std::atomic<uint64_t> num_entries_;
std::atomic<uint64_t> num_deletes_; std::atomic<uint64_t> num_deletes_;
// Dynamically changeable memtable option
std::atomic<size_t> write_buffer_size_;
// These are used to manage memtable flushes to storage // These are used to manage memtable flushes to storage
bool flush_in_progress_; // started the flush bool flush_in_progress_; // started the flush
bool flush_completed_; // finished the flush bool flush_completed_; // finished the flush

View File

@ -1035,7 +1035,7 @@ class MemTableInserter : public WriteBatch::Handler {
} }
MemTable* mem = cf_mems_->GetMemTable(); MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions(); auto* moptions = mem->GetImmutableMemTableOptions();
if (!moptions->inplace_update_support) { if (!moptions->inplace_update_support) {
mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_, mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_,
get_post_process_info(mem)); get_post_process_info(mem));
@ -1196,7 +1196,7 @@ class MemTableInserter : public WriteBatch::Handler {
} }
MemTable* mem = cf_mems_->GetMemTable(); MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions(); auto* moptions = mem->GetImmutableMemTableOptions();
bool perform_merge = false; bool perform_merge = false;
// If we pass DB through and options.max_successive_merges is hit // If we pass DB through and options.max_successive_merges is hit