2014-01-24 23:30:28 +01:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
#include "db/column_family.h"
|
2014-01-24 23:30:28 +01:00
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
2014-01-24 23:30:28 +01:00
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
|
|
|
#include <algorithm>
|
2014-05-21 20:43:35 +02:00
|
|
|
#include <limits>
|
2014-01-24 23:30:28 +01:00
|
|
|
|
2014-12-02 19:53:39 +01:00
|
|
|
#include "db/compaction_picker.h"
|
2014-02-11 02:04:44 +01:00
|
|
|
#include "db/db_impl.h"
|
2014-12-02 21:09:20 +01:00
|
|
|
#include "db/job_context.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "db/writebuffer.h"
|
2014-02-05 02:45:19 +01:00
|
|
|
#include "db/internal_stats.h"
|
2014-02-05 01:31:18 +01:00
|
|
|
#include "db/table_properties_collector.h"
|
2014-12-02 19:53:39 +01:00
|
|
|
#include "db/version_set.h"
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
#include "db/write_controller.h"
|
2014-02-07 00:42:16 +01:00
|
|
|
#include "util/autovector.h"
|
2015-06-18 23:55:05 +02:00
|
|
|
#include "util/compression.h"
|
2014-02-05 01:31:18 +01:00
|
|
|
#include "util/hash_skiplist_rep.h"
|
2014-09-17 21:49:13 +02:00
|
|
|
#include "util/options_helper.h"
|
2015-06-03 02:07:16 +02:00
|
|
|
#include "util/thread_status_util.h"
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
#include "util/xfunc.h"
|
2014-01-22 20:44:53 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
2014-11-06 20:14:28 +01:00
|
|
|
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
|
2015-02-05 06:39:45 +01:00
|
|
|
ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
|
2014-11-06 20:14:28 +01:00
|
|
|
: cfd_(column_family_data), db_(db), mutex_(mutex) {
|
2014-02-11 02:04:44 +01:00
|
|
|
if (cfd_ != nullptr) {
|
|
|
|
cfd_->Ref();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
|
|
|
|
if (cfd_ != nullptr) {
|
2015-02-12 18:54:48 +01:00
|
|
|
// Job id == 0 means that this is not our background process, but rather
|
|
|
|
// user thread
|
|
|
|
JobContext job_context(0);
|
2014-02-11 02:04:44 +01:00
|
|
|
mutex_->Lock();
|
|
|
|
if (cfd_->Unref()) {
|
|
|
|
delete cfd_;
|
|
|
|
}
|
2014-10-28 19:54:33 +01:00
|
|
|
db_->FindObsoleteFiles(&job_context, false, true);
|
2014-02-11 02:04:44 +01:00
|
|
|
mutex_->Unlock();
|
2014-10-28 19:54:33 +01:00
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
db_->PurgeObsoleteFiles(job_context);
|
2014-03-11 01:25:10 +01:00
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-26 02:30:54 +01:00
|
|
|
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
const std::string& ColumnFamilyHandleImpl::GetName() const {
|
|
|
|
return cfd()->GetName();
|
|
|
|
}
|
|
|
|
|
2014-09-22 20:37:35 +02:00
|
|
|
const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
|
|
|
|
return cfd()->user_comparator();
|
|
|
|
}
|
|
|
|
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 19:04:30 +02:00
|
|
|
void GetIntTblPropCollectorFactory(
|
|
|
|
const ColumnFamilyOptions& cf_options,
|
|
|
|
std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
|
|
|
|
int_tbl_prop_collector_factories) {
|
|
|
|
auto& collector_factories = cf_options.table_properties_collector_factories;
|
|
|
|
for (size_t i = 0; i < cf_options.table_properties_collector_factories.size();
|
|
|
|
++i) {
|
|
|
|
assert(collector_factories[i]);
|
|
|
|
int_tbl_prop_collector_factories->emplace_back(
|
|
|
|
new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
|
|
|
|
}
|
|
|
|
// Add collector to collect internal key statistics
|
|
|
|
int_tbl_prop_collector_factories->emplace_back(
|
|
|
|
new InternalKeyPropertiesCollectorFactory);
|
|
|
|
}
|
|
|
|
|
2015-06-18 23:55:05 +02:00
|
|
|
Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
|
|
|
|
if (!cf_options.compression_per_level.empty()) {
|
|
|
|
for (size_t level = 0; level < cf_options.compression_per_level.size();
|
|
|
|
++level) {
|
|
|
|
if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Compression type " +
|
|
|
|
CompressionTypeToString(cf_options.compression_per_level[level]) +
|
|
|
|
" is not linked with the binary.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (!CompressionTypeSupported(cf_options.compression)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Compression type " +
|
|
|
|
CompressionTypeToString(cf_options.compression) +
|
|
|
|
" is not linked with the binary.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
|
|
|
|
const InternalKeyComparator* icmp,
|
|
|
|
const ColumnFamilyOptions& src) {
|
2014-02-05 01:31:18 +01:00
|
|
|
ColumnFamilyOptions result = src;
|
|
|
|
result.comparator = icmp;
|
2014-04-17 00:15:22 +02:00
|
|
|
#ifdef OS_MACOSX
|
|
|
|
// TODO(icanadi) make write_buffer_size uint64_t instead of size_t
|
|
|
|
ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
|
|
|
|
#else
|
2014-02-05 01:31:18 +01:00
|
|
|
ClipToRange(&result.write_buffer_size,
|
|
|
|
((size_t)64) << 10, ((size_t)64) << 30);
|
2014-04-17 00:15:22 +02:00
|
|
|
#endif
|
2014-02-05 01:31:18 +01:00
|
|
|
// if user sets arena_block_size, we trust user to use this value. Otherwise,
|
|
|
|
// calculate a proper value from writer_buffer_size;
|
|
|
|
if (result.arena_block_size <= 0) {
|
|
|
|
result.arena_block_size = result.write_buffer_size / 10;
|
|
|
|
}
|
|
|
|
result.min_write_buffer_number_to_merge =
|
|
|
|
std::min(result.min_write_buffer_number_to_merge,
|
|
|
|
result.max_write_buffer_number - 1);
|
2015-05-22 20:35:40 +02:00
|
|
|
if (result.num_levels < 1) {
|
|
|
|
result.num_levels = 1;
|
|
|
|
}
|
|
|
|
if (result.compaction_style == kCompactionStyleLevel &&
|
|
|
|
result.num_levels < 2) {
|
|
|
|
result.num_levels = 2;
|
|
|
|
}
|
2014-02-05 01:31:18 +01:00
|
|
|
if (result.max_mem_compaction_level >= result.num_levels) {
|
|
|
|
result.max_mem_compaction_level = result.num_levels - 1;
|
|
|
|
}
|
2014-06-17 01:26:46 +02:00
|
|
|
if (result.max_write_buffer_number < 2) {
|
|
|
|
result.max_write_buffer_number = 2;
|
|
|
|
}
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
if (result.max_write_buffer_number_to_maintain < 0) {
|
|
|
|
result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
|
|
|
|
}
|
|
|
|
XFUNC_TEST("memtablelist_history", "transaction_xftest_SanitizeOptions",
|
|
|
|
xf_transaction_set_memtable_history1,
|
|
|
|
xf_transaction_set_memtable_history,
|
|
|
|
&result.max_write_buffer_number_to_maintain);
|
|
|
|
XFUNC_TEST("memtablelist_history_clear", "transaction_xftest_SanitizeOptions",
|
|
|
|
xf_transaction_clear_memtable_history1,
|
|
|
|
xf_transaction_clear_memtable_history,
|
|
|
|
&result.max_write_buffer_number_to_maintain);
|
|
|
|
|
2014-03-11 01:25:10 +01:00
|
|
|
if (!result.prefix_extractor) {
|
|
|
|
assert(result.memtable_factory);
|
|
|
|
Slice name = result.memtable_factory->Name();
|
|
|
|
if (name.compare("HashSkipListRepFactory") == 0 ||
|
|
|
|
name.compare("HashLinkListRepFactory") == 0) {
|
2014-02-05 01:31:18 +01:00
|
|
|
result.memtable_factory = std::make_shared<SkipListFactory>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-21 20:43:35 +02:00
|
|
|
if (result.compaction_style == kCompactionStyleFIFO) {
|
|
|
|
result.num_levels = 1;
|
|
|
|
// since we delete level0 files in FIFO compaction when there are too many
|
|
|
|
// of them, these options don't really mean anything
|
|
|
|
result.level0_file_num_compaction_trigger = std::numeric_limits<int>::max();
|
|
|
|
result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
}
|
|
|
|
|
2015-02-24 01:08:27 +01:00
|
|
|
if (result.level0_stop_writes_trigger <
|
|
|
|
result.level0_slowdown_writes_trigger ||
|
|
|
|
result.level0_slowdown_writes_trigger <
|
|
|
|
result.level0_file_num_compaction_trigger) {
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
Warn(db_options.info_log.get(),
|
2015-02-24 01:08:27 +01:00
|
|
|
"This condition must be satisfied: "
|
|
|
|
"level0_stop_writes_trigger(%d) >= "
|
|
|
|
"level0_slowdown_writes_trigger(%d) >= "
|
|
|
|
"level0_file_num_compaction_trigger(%d)",
|
|
|
|
result.level0_stop_writes_trigger,
|
|
|
|
result.level0_slowdown_writes_trigger,
|
|
|
|
result.level0_file_num_compaction_trigger);
|
|
|
|
if (result.level0_slowdown_writes_trigger <
|
|
|
|
result.level0_file_num_compaction_trigger) {
|
|
|
|
result.level0_slowdown_writes_trigger =
|
|
|
|
result.level0_file_num_compaction_trigger;
|
|
|
|
}
|
|
|
|
if (result.level0_stop_writes_trigger <
|
|
|
|
result.level0_slowdown_writes_trigger) {
|
|
|
|
result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
|
|
|
|
}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
Warn(db_options.info_log.get(),
|
2015-02-24 01:08:27 +01:00
|
|
|
"Adjust the value to "
|
|
|
|
"level0_stop_writes_trigger(%d)"
|
|
|
|
"level0_slowdown_writes_trigger(%d)"
|
|
|
|
"level0_file_num_compaction_trigger(%d)",
|
|
|
|
result.level0_stop_writes_trigger,
|
|
|
|
result.level0_slowdown_writes_trigger,
|
|
|
|
result.level0_file_num_compaction_trigger);
|
|
|
|
}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
if (result.level_compaction_dynamic_level_bytes) {
|
|
|
|
if (result.compaction_style != kCompactionStyleLevel ||
|
|
|
|
db_options.db_paths.size() > 1U) {
|
|
|
|
// 1. level_compaction_dynamic_level_bytes only makes sense for
|
|
|
|
// level-based compaction.
|
|
|
|
// 2. we don't yet know how to make both of this feature and multiple
|
|
|
|
// DB path work.
|
|
|
|
result.level_compaction_dynamic_level_bytes = false;
|
|
|
|
}
|
|
|
|
}
|
2015-02-24 01:08:27 +01:00
|
|
|
|
2014-02-05 01:31:18 +01:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-03-08 01:59:47 +01:00
|
|
|
int SuperVersion::dummy = 0;
|
|
|
|
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
|
|
|
|
void* const SuperVersion::kSVObsolete = nullptr;
|
2014-02-05 01:31:18 +01:00
|
|
|
|
2014-01-24 23:30:28 +01:00
|
|
|
SuperVersion::~SuperVersion() {
|
|
|
|
for (auto td : to_delete) {
|
|
|
|
delete td;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* SuperVersion::Ref() {
|
|
|
|
refs.fetch_add(1, std::memory_order_relaxed);
|
|
|
|
return this;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SuperVersion::Unref() {
|
|
|
|
// fetch_sub returns the previous value of ref
|
2015-01-27 22:57:44 +01:00
|
|
|
uint32_t previous_refs = refs.fetch_sub(1);
|
2014-02-04 00:28:03 +01:00
|
|
|
assert(previous_refs > 0);
|
|
|
|
return previous_refs == 1;
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void SuperVersion::Cleanup() {
|
|
|
|
assert(refs.load(std::memory_order_relaxed) == 0);
|
|
|
|
imm->Unref(&to_delete);
|
|
|
|
MemTable* m = mem->Unref();
|
|
|
|
if (m != nullptr) {
|
|
|
|
to_delete.push_back(m);
|
|
|
|
}
|
|
|
|
current->Unref();
|
|
|
|
}
|
|
|
|
|
|
|
|
void SuperVersion::Init(MemTable* new_mem, MemTableListVersion* new_imm,
|
|
|
|
Version* new_current) {
|
|
|
|
mem = new_mem;
|
|
|
|
imm = new_imm;
|
|
|
|
current = new_current;
|
|
|
|
mem->Ref();
|
|
|
|
imm->Ref();
|
|
|
|
current->Ref();
|
|
|
|
refs.store(1, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
2014-03-04 02:54:04 +01:00
|
|
|
namespace {
|
|
|
|
void SuperVersionUnrefHandle(void* ptr) {
|
2014-03-08 01:59:47 +01:00
|
|
|
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
|
|
|
|
// destroyed. When former happens, the thread shouldn't see kSVInUse.
|
|
|
|
// When latter happens, we are in ~ColumnFamilyData(), no get should happen as
|
|
|
|
// well.
|
2014-03-04 02:54:04 +01:00
|
|
|
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv->Unref()) {
|
|
|
|
sv->db_mutex->Lock();
|
|
|
|
sv->Cleanup();
|
|
|
|
sv->db_mutex->Unlock();
|
|
|
|
delete sv;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
ColumnFamilyData::ColumnFamilyData(
|
|
|
|
uint32_t id, const std::string& name, Version* _dummy_versions,
|
|
|
|
Cache* _table_cache, WriteBuffer* write_buffer,
|
|
|
|
const ColumnFamilyOptions& cf_options, const DBOptions* db_options,
|
|
|
|
const EnvOptions& env_options, ColumnFamilySet* column_family_set)
|
2014-01-29 22:28:50 +01:00
|
|
|
: id_(id),
|
|
|
|
name_(name),
|
2014-11-06 20:14:28 +01:00
|
|
|
dummy_versions_(_dummy_versions),
|
2014-01-29 22:28:50 +01:00
|
|
|
current_(nullptr),
|
2014-02-11 02:04:44 +01:00
|
|
|
refs_(0),
|
|
|
|
dropped_(false),
|
2014-09-17 21:49:13 +02:00
|
|
|
internal_comparator_(cf_options.comparator),
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
options_(*db_options,
|
|
|
|
SanitizeOptions(*db_options, &internal_comparator_, cf_options)),
|
2014-09-05 01:18:36 +02:00
|
|
|
ioptions_(options_),
|
2014-10-02 01:19:16 +02:00
|
|
|
mutable_cf_options_(options_, ioptions_),
|
2014-12-02 21:09:20 +01:00
|
|
|
write_buffer_(write_buffer),
|
2014-01-29 22:28:50 +01:00
|
|
|
mem_(nullptr),
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
imm_(options_.min_write_buffer_number_to_merge,
|
|
|
|
options_.max_write_buffer_number_to_maintain),
|
2014-01-29 22:28:50 +01:00
|
|
|
super_version_(nullptr),
|
|
|
|
super_version_number_(0),
|
2014-03-04 02:54:04 +01:00
|
|
|
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
|
2014-01-31 01:49:46 +01:00
|
|
|
next_(nullptr),
|
|
|
|
prev_(nullptr),
|
2014-01-31 00:23:13 +01:00
|
|
|
log_number_(0),
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
column_family_set_(column_family_set),
|
|
|
|
pending_flush_(false),
|
|
|
|
pending_compaction_(false) {
|
2014-02-11 02:04:44 +01:00
|
|
|
Ref();
|
|
|
|
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 19:04:30 +02:00
|
|
|
// Convert user defined table properties collector factories to internal ones.
|
|
|
|
GetIntTblPropCollectorFactory(options_, &int_tbl_prop_collector_factories_);
|
|
|
|
|
2014-11-06 20:14:28 +01:00
|
|
|
// if _dummy_versions is nullptr, then this is a dummy column family.
|
|
|
|
if (_dummy_versions != nullptr) {
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
2014-07-21 21:57:29 +02:00
|
|
|
internal_stats_.reset(
|
2014-10-02 01:19:16 +02:00
|
|
|
new InternalStats(ioptions_.num_levels, db_options->env, this));
|
2014-11-06 20:14:28 +01:00
|
|
|
table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache));
|
2014-11-27 00:45:11 +01:00
|
|
|
if (ioptions_.compaction_style == kCompactionStyleLevel) {
|
2014-03-11 22:52:17 +01:00
|
|
|
compaction_picker_.reset(
|
2014-10-02 01:19:16 +02:00
|
|
|
new LevelCompactionPicker(ioptions_, &internal_comparator_));
|
2014-11-27 00:45:11 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new UniversalCompactionPicker(ioptions_, &internal_comparator_));
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
|
2014-05-21 20:43:35 +02:00
|
|
|
compaction_picker_.reset(
|
2014-10-02 01:19:16 +02:00
|
|
|
new FIFOCompactionPicker(ioptions_, &internal_comparator_));
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleNone) {
|
|
|
|
compaction_picker_.reset(new NullCompactionPicker(
|
|
|
|
ioptions_, &internal_comparator_));
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
|
|
|
|
"Column family %s does not use any background compaction. "
|
|
|
|
"Compactions can only be done via CompactFiles\n",
|
|
|
|
GetName().c_str());
|
2014-11-27 00:45:11 +01:00
|
|
|
#endif // !ROCKSDB_LITE
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
2014-11-07 23:45:18 +01:00
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log,
|
|
|
|
"Unable to recognize the specified compaction style %d. "
|
|
|
|
"Column family %s will use kCompactionStyleLevel.\n",
|
|
|
|
ioptions_.compaction_style, GetName().c_str());
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new LevelCompactionPicker(ioptions_, &internal_comparator_));
|
2014-02-05 02:45:19 +01:00
|
|
|
}
|
2014-02-07 06:39:20 +01:00
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
if (column_family_set_->NumberOfColumnFamilies() < 10) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
|
|
|
|
"--------------- Options for column family [%s]:\n", name.c_str());
|
2015-06-18 19:15:54 +02:00
|
|
|
options_.DumpCFOptions(ioptions_.info_log);
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
|
|
|
|
"\t(skipping printing options)\n");
|
|
|
|
}
|
2014-02-01 00:30:27 +01:00
|
|
|
}
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
RecalculateWriteStallConditions(mutable_cf_options_);
|
2014-02-01 00:30:27 +01:00
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
// DB mutex held
|
2014-01-22 20:44:53 +01:00
|
|
|
ColumnFamilyData::~ColumnFamilyData() {
|
2015-01-26 20:48:07 +01:00
|
|
|
assert(refs_.load(std::memory_order_relaxed) == 0);
|
2014-02-11 02:04:44 +01:00
|
|
|
// remove from linked list
|
|
|
|
auto prev = prev_;
|
|
|
|
auto next = next_;
|
|
|
|
prev->next_ = next;
|
|
|
|
next->prev_ = prev;
|
|
|
|
|
2015-01-06 21:44:21 +01:00
|
|
|
if (!dropped_ && column_family_set_ != nullptr) {
|
|
|
|
// If it's dropped, it's already removed from column family set
|
|
|
|
// If column_family_set_ == nullptr, this is dummy CFD and not in
|
|
|
|
// ColumnFamilySet
|
2014-03-11 22:52:17 +01:00
|
|
|
column_family_set_->RemoveColumnFamily(this);
|
2014-02-11 02:04:44 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (current_ != nullptr) {
|
|
|
|
current_->Unref();
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
// It would be wrong if this ColumnFamilyData is in flush_queue_ or
|
|
|
|
// compaction_queue_ and we destroyed it
|
|
|
|
assert(!pending_flush_);
|
|
|
|
assert(!pending_compaction_);
|
|
|
|
|
2014-03-11 22:52:17 +01:00
|
|
|
if (super_version_ != nullptr) {
|
|
|
|
// Release SuperVersion reference kept in ThreadLocalPtr.
|
|
|
|
// This must be done outside of mutex_ since unref handler can lock mutex.
|
|
|
|
super_version_->db_mutex->Unlock();
|
|
|
|
local_sv_.reset();
|
|
|
|
super_version_->db_mutex->Lock();
|
|
|
|
|
|
|
|
bool is_last_reference __attribute__((unused));
|
|
|
|
is_last_reference = super_version_->Unref();
|
|
|
|
assert(is_last_reference);
|
|
|
|
super_version_->Cleanup();
|
|
|
|
delete super_version_;
|
|
|
|
super_version_ = nullptr;
|
|
|
|
}
|
2014-03-04 18:03:56 +01:00
|
|
|
|
2014-01-31 01:49:46 +01:00
|
|
|
if (dummy_versions_ != nullptr) {
|
|
|
|
// List must be empty
|
2014-10-28 18:04:38 +01:00
|
|
|
assert(dummy_versions_->TEST_Next() == dummy_versions_);
|
|
|
|
bool deleted __attribute__((unused)) = dummy_versions_->Unref();
|
|
|
|
assert(deleted);
|
2014-01-31 01:49:46 +01:00
|
|
|
}
|
2014-01-24 23:30:28 +01:00
|
|
|
|
2014-01-29 22:28:50 +01:00
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
2014-02-07 00:42:16 +01:00
|
|
|
autovector<MemTable*> to_delete;
|
2014-01-29 22:28:50 +01:00
|
|
|
imm_.current()->Unref(&to_delete);
|
2014-01-24 23:30:28 +01:00
|
|
|
for (MemTable* m : to_delete) {
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-06 21:44:21 +01:00
|
|
|
void ColumnFamilyData::SetDropped() {
|
|
|
|
// can't drop default CF
|
|
|
|
assert(id_ != 0);
|
|
|
|
dropped_ = true;
|
|
|
|
write_controller_token_.reset();
|
|
|
|
|
|
|
|
// remove from column_family_set
|
|
|
|
column_family_set_->RemoveColumnFamily(this);
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
void ColumnFamilyData::RecalculateWriteStallConditions(
|
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
if (current_ != nullptr) {
|
2014-10-31 16:48:19 +01:00
|
|
|
auto* vstorage = current_->storage_info();
|
2014-11-04 02:45:55 +01:00
|
|
|
const double score = vstorage->max_compaction_score();
|
|
|
|
const int max_level = vstorage->max_compaction_score_level();
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
auto write_controller = column_family_set_->write_controller_;
|
|
|
|
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) {
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
write_controller_token_ = write_controller->GetStopToken();
|
|
|
|
internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
|
2014-10-29 23:11:32 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
"[%s] Stopping writes because we have %d immutable memtables "
|
2014-10-17 01:57:59 +02:00
|
|
|
"(waiting for flush), max_write_buffer_number is set to %d",
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
2015-05-29 01:34:24 +02:00
|
|
|
name_.c_str(), imm()->NumNotFlushed(),
|
2014-10-17 01:57:59 +02:00
|
|
|
mutable_cf_options.max_write_buffer_number);
|
2015-03-30 23:04:21 +02:00
|
|
|
} else if (vstorage->l0_delay_trigger_count() >=
|
2014-10-02 01:19:16 +02:00
|
|
|
mutable_cf_options.level0_stop_writes_trigger) {
|
2014-09-09 00:23:58 +02:00
|
|
|
write_controller_token_ = write_controller->GetStopToken();
|
|
|
|
internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
|
2014-10-29 23:11:32 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
|
2014-09-09 00:23:58 +02:00
|
|
|
"[%s] Stopping writes because we have %d level-0 files",
|
2015-03-30 23:04:21 +02:00
|
|
|
name_.c_str(), vstorage->l0_delay_trigger_count());
|
2014-10-02 01:19:16 +02:00
|
|
|
} else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
|
2015-03-30 23:04:21 +02:00
|
|
|
vstorage->l0_delay_trigger_count() >=
|
2014-10-02 01:19:16 +02:00
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger) {
|
2015-05-16 00:52:51 +02:00
|
|
|
write_controller_token_ = write_controller->GetDelayToken();
|
|
|
|
internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, 1);
|
2014-10-29 23:11:32 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
|
2015-05-16 00:52:51 +02:00
|
|
|
"[%s] Stalling writes because we have %d level-0 files",
|
|
|
|
name_.c_str(), vstorage->l0_delay_trigger_count());
|
2014-10-17 02:21:31 +02:00
|
|
|
} else if (mutable_cf_options.soft_rate_limit > 0.0 &&
|
|
|
|
score > mutable_cf_options.soft_rate_limit) {
|
2015-05-16 00:52:51 +02:00
|
|
|
write_controller_token_ = write_controller->GetDelayToken();
|
2015-03-14 23:01:43 +01:00
|
|
|
internal_stats_->RecordLevelNSlowdown(max_level, true);
|
2014-10-29 23:11:32 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
|
2015-05-16 00:52:51 +02:00
|
|
|
"[%s] Stalling writes because we hit soft limit on level %d",
|
|
|
|
name_.c_str(), max_level);
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
} else {
|
|
|
|
write_controller_token_.reset();
|
|
|
|
}
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-14 19:48:01 +02:00
|
|
|
const EnvOptions* ColumnFamilyData::soptions() const {
|
2014-09-05 01:18:36 +02:00
|
|
|
return &(column_family_set_->env_options_);
|
2014-04-14 19:48:01 +02:00
|
|
|
}
|
|
|
|
|
2014-11-06 20:14:28 +01:00
|
|
|
void ColumnFamilyData::SetCurrent(Version* current_version) {
|
|
|
|
current_ = current_version;
|
|
|
|
}
|
2014-01-31 00:23:13 +01:00
|
|
|
|
2015-02-12 02:10:43 +01:00
|
|
|
uint64_t ColumnFamilyData::GetNumLiveVersions() const {
|
|
|
|
return VersionSet::GetNumLiveVersions(dummy_versions_);
|
|
|
|
}
|
|
|
|
|
2014-12-02 21:09:20 +01:00
|
|
|
MemTable* ColumnFamilyData::ConstructNewMemtable(
|
2015-05-29 23:36:35 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
|
2014-01-29 22:28:50 +01:00
|
|
|
assert(current_ != nullptr);
|
2015-05-29 23:36:35 +02:00
|
|
|
return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
|
|
|
|
write_buffer_, earliest_seq);
|
2014-12-02 21:09:20 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::CreateNewMemtable(
|
2015-05-29 23:36:35 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
|
2014-01-29 22:28:50 +01:00
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
2014-01-24 23:30:28 +01:00
|
|
|
}
|
2015-05-29 23:36:35 +02:00
|
|
|
SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
|
2014-01-29 22:28:50 +01:00
|
|
|
mem_->Ref();
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
2014-12-19 20:38:12 +01:00
|
|
|
bool ColumnFamilyData::NeedsCompaction() const {
|
|
|
|
return compaction_picker_->NeedsCompaction(current_->storage_info());
|
|
|
|
}
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
Compaction* ColumnFamilyData::PickCompaction(
|
|
|
|
const MutableCFOptions& mutable_options, LogBuffer* log_buffer) {
|
2014-10-27 23:49:46 +01:00
|
|
|
auto* result = compaction_picker_->PickCompaction(
|
2014-10-31 16:48:19 +01:00
|
|
|
GetName(), mutable_options, current_->storage_info(), log_buffer);
|
2014-10-27 23:49:46 +01:00
|
|
|
if (result != nullptr) {
|
|
|
|
result->SetInputVersion(current_);
|
|
|
|
}
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
return result;
|
2014-02-01 00:30:27 +01:00
|
|
|
}
|
|
|
|
|
2015-04-23 01:55:22 +02:00
|
|
|
const int ColumnFamilyData::kCompactAllLevels = -1;
|
2015-04-15 06:45:20 +02:00
|
|
|
const int ColumnFamilyData::kCompactToBaseLevel = -2;
|
2015-04-23 01:55:22 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
Compaction* ColumnFamilyData::CompactRange(
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
int input_level, int output_level, uint32_t output_path_id,
|
|
|
|
const InternalKey* begin, const InternalKey* end,
|
|
|
|
InternalKey** compaction_end) {
|
2014-10-27 23:49:46 +01:00
|
|
|
auto* result = compaction_picker_->CompactRange(
|
2014-10-31 16:48:19 +01:00
|
|
|
GetName(), mutable_cf_options, current_->storage_info(), input_level,
|
2014-10-27 23:49:46 +01:00
|
|
|
output_level, output_path_id, begin, end, compaction_end);
|
|
|
|
if (result != nullptr) {
|
|
|
|
result->SetInputVersion(current_);
|
|
|
|
}
|
|
|
|
return result;
|
2014-02-01 00:30:27 +01:00
|
|
|
}
|
|
|
|
|
2014-04-14 18:34:59 +02:00
|
|
|
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutex* db_mutex) {
|
2014-04-14 18:34:59 +02:00
|
|
|
SuperVersion* sv = nullptr;
|
2014-09-24 22:12:16 +02:00
|
|
|
sv = GetThreadLocalSuperVersion(db_mutex);
|
|
|
|
sv->Ref();
|
|
|
|
if (!ReturnThreadLocalSuperVersion(sv)) {
|
|
|
|
sv->Unref();
|
2014-04-14 18:34:59 +02:00
|
|
|
}
|
|
|
|
return sv;
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(
|
2015-02-05 06:39:45 +01:00
|
|
|
InstrumentedMutex* db_mutex) {
|
2014-04-14 18:34:59 +02:00
|
|
|
SuperVersion* sv = nullptr;
|
|
|
|
// The SuperVersion is cached in thread local storage to avoid acquiring
|
|
|
|
// mutex when SuperVersion does not change since the last use. When a new
|
|
|
|
// SuperVersion is installed, the compaction or flush thread cleans up
|
|
|
|
// cached SuperVersion in all existing thread local storage. To avoid
|
|
|
|
// acquiring mutex for this operation, we use atomic Swap() on the thread
|
|
|
|
// local pointer to guarantee exclusive access. If the thread local pointer
|
|
|
|
// is being used while a new SuperVersion is installed, the cached
|
|
|
|
// SuperVersion can become stale. In that case, the background thread would
|
|
|
|
// have swapped in kSVObsolete. We re-check the value at when returning
|
|
|
|
// SuperVersion back to thread local, with an atomic compare and swap.
|
|
|
|
// The superversion will need to be released if detected to be stale.
|
|
|
|
void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
|
|
|
|
// Invariant:
|
|
|
|
// (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
|
|
|
|
// (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
|
|
|
|
// should only keep kSVInUse before ReturnThreadLocalSuperVersion call
|
|
|
|
// (if no Scrape happens).
|
|
|
|
assert(ptr != SuperVersion::kSVInUse);
|
|
|
|
sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv == SuperVersion::kSVObsolete ||
|
|
|
|
sv->version_number != super_version_number_.load()) {
|
2014-10-02 01:19:16 +02:00
|
|
|
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES);
|
2014-04-14 18:34:59 +02:00
|
|
|
SuperVersion* sv_to_delete = nullptr;
|
|
|
|
|
|
|
|
if (sv && sv->Unref()) {
|
2014-10-02 01:19:16 +02:00
|
|
|
RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS);
|
2014-04-14 18:34:59 +02:00
|
|
|
db_mutex->Lock();
|
|
|
|
// NOTE: underlying resources held by superversion (sst files) might
|
|
|
|
// not be released until the next background job.
|
|
|
|
sv->Cleanup();
|
|
|
|
sv_to_delete = sv;
|
|
|
|
} else {
|
|
|
|
db_mutex->Lock();
|
|
|
|
}
|
|
|
|
sv = super_version_->Ref();
|
|
|
|
db_mutex->Unlock();
|
|
|
|
|
|
|
|
delete sv_to_delete;
|
|
|
|
}
|
|
|
|
assert(sv != nullptr);
|
|
|
|
return sv;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
|
|
|
|
assert(sv != nullptr);
|
|
|
|
// Put the SuperVersion back
|
|
|
|
void* expected = SuperVersion::kSVInUse;
|
|
|
|
if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
|
|
|
|
// When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
|
2015-04-25 11:14:27 +02:00
|
|
|
// storage has not been altered and no Scrape has happened. The
|
2014-04-14 18:34:59 +02:00
|
|
|
// SuperVersion is still current.
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
// ThreadLocal scrape happened in the process of this GetImpl call (after
|
|
|
|
// thread local Swap() at the beginning and before CompareAndSwap()).
|
|
|
|
// This means the SuperVersion it holds is obsolete.
|
|
|
|
assert(expected == SuperVersion::kSVObsolete);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-01-29 22:28:50 +01:00
|
|
|
SuperVersion* ColumnFamilyData::InstallSuperVersion(
|
2015-02-05 06:39:45 +01:00
|
|
|
SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
|
2014-09-17 21:49:13 +02:00
|
|
|
db_mutex->AssertHeld();
|
|
|
|
return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* ColumnFamilyData::InstallSuperVersion(
|
2015-02-05 06:39:45 +01:00
|
|
|
SuperVersion* new_superversion, InstrumentedMutex* db_mutex,
|
2014-09-17 21:49:13 +02:00
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
2014-03-31 21:44:54 +02:00
|
|
|
new_superversion->db_mutex = db_mutex;
|
2014-09-17 21:49:13 +02:00
|
|
|
new_superversion->mutable_cf_options = mutable_cf_options;
|
2014-01-29 22:28:50 +01:00
|
|
|
new_superversion->Init(mem_, imm_.current(), current_);
|
|
|
|
SuperVersion* old_superversion = super_version_;
|
|
|
|
super_version_ = new_superversion;
|
|
|
|
++super_version_number_;
|
2014-03-04 02:54:04 +01:00
|
|
|
super_version_->version_number = super_version_number_;
|
2014-04-14 18:34:59 +02:00
|
|
|
// Reset SuperVersions cached in thread local storage
|
2014-09-24 22:12:16 +02:00
|
|
|
ResetThreadLocalSuperVersions();
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
RecalculateWriteStallConditions(mutable_cf_options);
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
2014-06-27 01:45:27 +02:00
|
|
|
|
2014-01-29 22:28:50 +01:00
|
|
|
if (old_superversion != nullptr && old_superversion->Unref()) {
|
|
|
|
old_superversion->Cleanup();
|
|
|
|
return old_superversion; // will let caller delete outside of mutex
|
|
|
|
}
|
|
|
|
return nullptr;
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
2014-03-04 02:54:04 +01:00
|
|
|
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
|
|
|
|
autovector<void*> sv_ptrs;
|
2014-03-08 01:59:47 +01:00
|
|
|
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
|
2014-03-04 02:54:04 +01:00
|
|
|
for (auto ptr : sv_ptrs) {
|
|
|
|
assert(ptr);
|
2014-03-08 01:59:47 +01:00
|
|
|
if (ptr == SuperVersion::kSVInUse) {
|
|
|
|
continue;
|
|
|
|
}
|
2014-03-04 02:54:04 +01:00
|
|
|
auto sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv->Unref()) {
|
|
|
|
sv->Cleanup();
|
|
|
|
delete sv;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-11-13 22:45:33 +01:00
|
|
|
#ifndef ROCKSDB_LITE
|
2014-11-05 01:23:05 +01:00
|
|
|
Status ColumnFamilyData::SetOptions(
|
2014-09-17 21:49:13 +02:00
|
|
|
const std::unordered_map<std::string, std::string>& options_map) {
|
|
|
|
MutableCFOptions new_mutable_cf_options;
|
2014-11-05 01:23:05 +01:00
|
|
|
Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
|
|
|
|
&new_mutable_cf_options);
|
|
|
|
if (s.ok()) {
|
2014-09-17 21:49:13 +02:00
|
|
|
mutable_cf_options_ = new_mutable_cf_options;
|
2014-10-02 01:19:16 +02:00
|
|
|
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
|
2014-09-17 21:49:13 +02:00
|
|
|
}
|
2014-11-05 01:23:05 +01:00
|
|
|
return s;
|
2014-09-17 21:49:13 +02:00
|
|
|
}
|
2014-11-13 22:45:33 +01:00
|
|
|
#endif // ROCKSDB_LITE
|
2014-09-17 21:49:13 +02:00
|
|
|
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
|
2014-02-05 22:12:23 +01:00
|
|
|
const DBOptions* db_options,
|
2014-09-05 01:18:36 +02:00
|
|
|
const EnvOptions& env_options,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
Cache* table_cache,
|
2014-12-02 21:09:20 +01:00
|
|
|
WriteBuffer* write_buffer,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
WriteController* write_controller)
|
2014-01-31 01:49:46 +01:00
|
|
|
: max_column_family_(0),
|
2014-12-02 21:09:20 +01:00
|
|
|
dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr,
|
2014-02-05 21:20:40 +01:00
|
|
|
ColumnFamilyOptions(), db_options,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
2014-09-08 20:20:25 +02:00
|
|
|
env_options, nullptr)),
|
2014-03-11 22:52:17 +01:00
|
|
|
default_cfd_cache_(nullptr),
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
2014-02-05 18:07:55 +01:00
|
|
|
db_name_(dbname),
|
|
|
|
db_options_(db_options),
|
2014-09-05 01:18:36 +02:00
|
|
|
env_options_(env_options),
|
2014-02-06 20:44:50 +01:00
|
|
|
table_cache_(table_cache),
|
2014-12-02 21:09:20 +01:00
|
|
|
write_buffer_(write_buffer),
|
2015-01-06 21:44:21 +01:00
|
|
|
write_controller_(write_controller) {
|
2014-01-31 01:49:46 +01:00
|
|
|
// initialize linked list
|
2014-02-11 02:04:44 +01:00
|
|
|
dummy_cfd_->prev_ = dummy_cfd_;
|
|
|
|
dummy_cfd_->next_ = dummy_cfd_;
|
2014-01-31 01:49:46 +01:00
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
|
|
|
|
ColumnFamilySet::~ColumnFamilySet() {
|
2014-02-11 02:04:44 +01:00
|
|
|
while (column_family_data_.size() > 0) {
|
|
|
|
// cfd destructor will delete itself from column_family_data_
|
|
|
|
auto cfd = column_family_data_.begin()->second;
|
|
|
|
cfd->Unref();
|
2014-01-22 20:44:53 +01:00
|
|
|
delete cfd;
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
dummy_cfd_->Unref();
|
2014-01-31 01:49:46 +01:00
|
|
|
delete dummy_cfd_;
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
|
2014-03-11 22:52:17 +01:00
|
|
|
assert(default_cfd_cache_ != nullptr);
|
|
|
|
return default_cfd_cache_;
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
|
|
|
|
auto cfd_iter = column_family_data_.find(id);
|
|
|
|
if (cfd_iter != column_family_data_.end()) {
|
|
|
|
return cfd_iter->second;
|
|
|
|
} else {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-28 23:05:11 +01:00
|
|
|
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name)
|
|
|
|
const {
|
|
|
|
auto cfd_iter = column_families_.find(name);
|
2014-03-11 22:52:17 +01:00
|
|
|
if (cfd_iter != column_families_.end()) {
|
|
|
|
auto cfd = GetColumnFamily(cfd_iter->second);
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
return cfd;
|
|
|
|
} else {
|
2014-02-28 23:05:11 +01:00
|
|
|
return nullptr;
|
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
|
|
|
|
return ++max_column_family_;
|
|
|
|
}
|
|
|
|
|
2014-03-05 21:13:44 +01:00
|
|
|
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
|
|
|
|
|
|
|
|
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
|
|
|
|
max_column_family_ = std::max(new_max_column_family, max_column_family_);
|
|
|
|
}
|
|
|
|
|
2014-06-03 00:33:54 +02:00
|
|
|
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
|
|
|
|
return column_families_.size();
|
|
|
|
}
|
|
|
|
|
2015-01-06 21:44:21 +01:00
|
|
|
// under a DB mutex AND write thread
|
2014-01-22 20:44:53 +01:00
|
|
|
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
|
|
|
const std::string& name, uint32_t id, Version* dummy_versions,
|
|
|
|
const ColumnFamilyOptions& options) {
|
|
|
|
assert(column_families_.find(name) == column_families_.end());
|
|
|
|
ColumnFamilyData* new_cfd =
|
2014-12-02 21:09:20 +01:00
|
|
|
new ColumnFamilyData(id, name, dummy_versions, table_cache_,
|
|
|
|
write_buffer_, options, db_options_,
|
|
|
|
env_options_, this);
|
2014-02-11 02:04:44 +01:00
|
|
|
column_families_.insert({name, id});
|
2014-01-22 20:44:53 +01:00
|
|
|
column_family_data_.insert({id, new_cfd});
|
|
|
|
max_column_family_ = std::max(max_column_family_, id);
|
2014-01-31 01:49:46 +01:00
|
|
|
// add to linked list
|
2014-02-11 02:04:44 +01:00
|
|
|
new_cfd->next_ = dummy_cfd_;
|
|
|
|
auto prev = dummy_cfd_->prev_;
|
|
|
|
new_cfd->prev_ = prev;
|
|
|
|
prev->next_ = new_cfd;
|
|
|
|
dummy_cfd_->prev_ = new_cfd;
|
2014-03-11 22:52:17 +01:00
|
|
|
if (id == 0) {
|
|
|
|
default_cfd_cache_ = new_cfd;
|
|
|
|
}
|
2014-01-22 20:44:53 +01:00
|
|
|
return new_cfd;
|
|
|
|
}
|
|
|
|
|
2014-04-07 23:21:25 +02:00
|
|
|
// REQUIRES: DB mutex held
|
|
|
|
void ColumnFamilySet::FreeDeadColumnFamilies() {
|
|
|
|
autovector<ColumnFamilyData*> to_delete;
|
|
|
|
for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) {
|
2015-01-26 20:48:07 +01:00
|
|
|
if (cfd->refs_.load(std::memory_order_relaxed) == 0) {
|
2014-04-07 23:21:25 +02:00
|
|
|
to_delete.push_back(cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (auto cfd : to_delete) {
|
|
|
|
// this is very rare, so it's not a problem that we do it under a mutex
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-06 21:44:21 +01:00
|
|
|
// under a DB mutex AND from a write thread
|
2014-03-11 22:52:17 +01:00
|
|
|
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
2014-02-11 02:04:44 +01:00
|
|
|
auto cfd_iter = column_family_data_.find(cfd->GetID());
|
2014-01-31 01:49:46 +01:00
|
|
|
assert(cfd_iter != column_family_data_.end());
|
|
|
|
column_family_data_.erase(cfd_iter);
|
2014-02-11 02:04:44 +01:00
|
|
|
column_families_.erase(cfd->GetName());
|
2014-01-22 20:44:53 +01:00
|
|
|
}
|
|
|
|
|
2015-01-06 21:44:21 +01:00
|
|
|
// under a DB mutex OR from a write thread
|
2014-02-06 01:02:48 +01:00
|
|
|
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
|
2014-03-11 22:52:17 +01:00
|
|
|
if (column_family_id == 0) {
|
|
|
|
// optimization for common case
|
|
|
|
current_ = column_family_set_->GetDefault();
|
|
|
|
} else {
|
|
|
|
current_ = column_family_set_->GetColumnFamily(column_family_id);
|
|
|
|
}
|
2014-02-11 02:04:44 +01:00
|
|
|
handle_.SetCFD(current_);
|
2014-02-06 01:02:48 +01:00
|
|
|
return current_ != nullptr;
|
|
|
|
}
|
2014-01-28 20:05:04 +01:00
|
|
|
|
2014-02-06 01:02:48 +01:00
|
|
|
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
|
|
|
|
assert(current_ != nullptr);
|
|
|
|
return current_->GetLogNumber();
|
|
|
|
}
|
|
|
|
|
|
|
|
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
|
|
|
|
assert(current_ != nullptr);
|
|
|
|
return current_->mem();
|
|
|
|
}
|
|
|
|
|
2014-02-11 02:04:44 +01:00
|
|
|
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
|
2014-02-06 01:02:48 +01:00
|
|
|
assert(current_ != nullptr);
|
2014-02-11 02:04:44 +01:00
|
|
|
return &handle_;
|
2014-01-28 20:05:04 +01:00
|
|
|
}
|
|
|
|
|
2014-09-11 03:46:09 +02:00
|
|
|
void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
|
|
|
|
if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) {
|
|
|
|
flush_scheduler_->ScheduleFlush(current_);
|
|
|
|
current_->mem()->MarkFlushScheduled();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-08-19 00:19:17 +02:00
|
|
|
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
|
|
|
|
uint32_t column_family_id = 0;
|
|
|
|
if (column_family != nullptr) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
column_family_id = cfh->GetID();
|
|
|
|
}
|
|
|
|
return column_family_id;
|
|
|
|
}
|
|
|
|
|
2014-09-22 20:37:35 +02:00
|
|
|
const Comparator* GetColumnFamilyUserComparator(
|
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
if (column_family != nullptr) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
return cfh->user_comparator();
|
|
|
|
}
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2014-01-22 20:44:53 +01:00
|
|
|
} // namespace rocksdb
|