2014-09-17 21:49:13 +02:00
|
|
|
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
#include <vector>
|
2014-09-17 21:49:13 +02:00
|
|
|
#include "rocksdb/options.h"
|
2014-10-02 01:19:16 +02:00
|
|
|
#include "rocksdb/immutable_options.h"
|
2014-09-17 21:49:13 +02:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
struct MutableCFOptions {
|
2014-10-02 01:19:16 +02:00
|
|
|
MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions)
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
: write_buffer_size(options.write_buffer_size),
|
|
|
|
max_write_buffer_number(options.max_write_buffer_number),
|
|
|
|
arena_block_size(options.arena_block_size),
|
|
|
|
memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
|
|
|
|
memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
|
|
|
|
memtable_prefix_bloom_huge_page_tlb_size(
|
|
|
|
options.memtable_prefix_bloom_huge_page_tlb_size),
|
|
|
|
max_successive_merges(options.max_successive_merges),
|
|
|
|
filter_deletes(options.filter_deletes),
|
|
|
|
inplace_update_num_locks(options.inplace_update_num_locks),
|
|
|
|
disable_auto_compactions(options.disable_auto_compactions),
|
|
|
|
soft_rate_limit(options.soft_rate_limit),
|
|
|
|
hard_rate_limit(options.hard_rate_limit),
|
|
|
|
level0_file_num_compaction_trigger(
|
|
|
|
options.level0_file_num_compaction_trigger),
|
|
|
|
level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
|
|
|
|
level0_stop_writes_trigger(options.level0_stop_writes_trigger),
|
|
|
|
max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
|
|
|
|
expanded_compaction_factor(options.expanded_compaction_factor),
|
|
|
|
source_compaction_factor(options.source_compaction_factor),
|
|
|
|
target_file_size_base(options.target_file_size_base),
|
|
|
|
target_file_size_multiplier(options.target_file_size_multiplier),
|
|
|
|
max_bytes_for_level_base(options.max_bytes_for_level_base),
|
|
|
|
max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
|
|
|
|
max_bytes_for_level_multiplier_additional(
|
|
|
|
options.max_bytes_for_level_multiplier_additional),
|
|
|
|
verify_checksums_in_compaction(options.verify_checksums_in_compaction),
|
|
|
|
num_subcompactions(options.num_subcompactions),
|
|
|
|
max_sequential_skip_in_iterations(
|
|
|
|
options.max_sequential_skip_in_iterations),
|
|
|
|
paranoid_file_checks(options.paranoid_file_checks),
|
|
|
|
compaction_measure_io_stats(options.compaction_measure_io_stats)
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
{
|
|
|
|
RefreshDerivedOptions(ioptions);
|
2014-09-17 21:49:13 +02:00
|
|
|
}
|
|
|
|
MutableCFOptions()
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
: write_buffer_size(0),
|
|
|
|
max_write_buffer_number(0),
|
|
|
|
arena_block_size(0),
|
|
|
|
memtable_prefix_bloom_bits(0),
|
|
|
|
memtable_prefix_bloom_probes(0),
|
|
|
|
memtable_prefix_bloom_huge_page_tlb_size(0),
|
|
|
|
max_successive_merges(0),
|
|
|
|
filter_deletes(false),
|
|
|
|
inplace_update_num_locks(0),
|
|
|
|
disable_auto_compactions(false),
|
|
|
|
soft_rate_limit(0),
|
|
|
|
hard_rate_limit(0),
|
|
|
|
level0_file_num_compaction_trigger(0),
|
|
|
|
level0_slowdown_writes_trigger(0),
|
|
|
|
level0_stop_writes_trigger(0),
|
|
|
|
max_grandparent_overlap_factor(0),
|
|
|
|
expanded_compaction_factor(0),
|
|
|
|
source_compaction_factor(0),
|
|
|
|
target_file_size_base(0),
|
|
|
|
target_file_size_multiplier(0),
|
|
|
|
max_bytes_for_level_base(0),
|
|
|
|
max_bytes_for_level_multiplier(0),
|
|
|
|
verify_checksums_in_compaction(false),
|
|
|
|
num_subcompactions(1),
|
|
|
|
max_sequential_skip_in_iterations(0),
|
|
|
|
paranoid_file_checks(false),
|
|
|
|
compaction_measure_io_stats(false) {}
|
2014-09-17 21:49:13 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
// Must be called after any change to MutableCFOptions
|
|
|
|
void RefreshDerivedOptions(const ImmutableCFOptions& ioptions);
|
|
|
|
|
|
|
|
// Get the max file size in a given level.
|
|
|
|
uint64_t MaxFileSizeForLevel(int level) const;
|
|
|
|
// Returns maximum total overlap bytes with grandparent
|
|
|
|
// level (i.e., level+2) before we stop building a single
|
|
|
|
// file in level->level+1 compaction.
|
|
|
|
uint64_t MaxGrandParentOverlapBytes(int level) const;
|
|
|
|
uint64_t ExpandedCompactionByteSizeLimit(int level) const;
|
2015-03-30 23:04:21 +02:00
|
|
|
int MaxBytesMultiplerAdditional(int level) const {
|
|
|
|
if (level >=
|
|
|
|
static_cast<int>(max_bytes_for_level_multiplier_additional.size())) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return max_bytes_for_level_multiplier_additional[level];
|
|
|
|
}
|
2014-10-02 01:19:16 +02:00
|
|
|
|
2014-10-17 02:22:28 +02:00
|
|
|
void Dump(Logger* log) const;
|
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
// Memtable related options
|
2014-09-17 21:49:13 +02:00
|
|
|
size_t write_buffer_size;
|
2014-10-17 01:57:59 +02:00
|
|
|
int max_write_buffer_number;
|
2014-09-17 21:49:13 +02:00
|
|
|
size_t arena_block_size;
|
|
|
|
uint32_t memtable_prefix_bloom_bits;
|
|
|
|
uint32_t memtable_prefix_bloom_probes;
|
|
|
|
size_t memtable_prefix_bloom_huge_page_tlb_size;
|
|
|
|
size_t max_successive_merges;
|
|
|
|
bool filter_deletes;
|
2014-10-27 20:10:13 +01:00
|
|
|
size_t inplace_update_num_locks;
|
2014-10-02 01:19:16 +02:00
|
|
|
|
|
|
|
// Compaction related options
|
2014-10-17 02:14:17 +02:00
|
|
|
bool disable_auto_compactions;
|
2014-10-17 02:21:31 +02:00
|
|
|
double soft_rate_limit;
|
|
|
|
double hard_rate_limit;
|
2014-10-02 01:19:16 +02:00
|
|
|
int level0_file_num_compaction_trigger;
|
|
|
|
int level0_slowdown_writes_trigger;
|
|
|
|
int level0_stop_writes_trigger;
|
|
|
|
int max_grandparent_overlap_factor;
|
|
|
|
int expanded_compaction_factor;
|
|
|
|
int source_compaction_factor;
|
2014-11-11 22:47:22 +01:00
|
|
|
uint64_t target_file_size_base;
|
2014-10-02 01:19:16 +02:00
|
|
|
int target_file_size_multiplier;
|
|
|
|
uint64_t max_bytes_for_level_base;
|
|
|
|
int max_bytes_for_level_multiplier;
|
|
|
|
std::vector<int> max_bytes_for_level_multiplier_additional;
|
2014-11-18 19:20:10 +01:00
|
|
|
bool verify_checksums_in_compaction;
|
Parallelize L0-L1 Compaction: Restructure Compaction Job
Summary:
As of now compactions involving files from Level 0 and Level 1 are single
threaded because the files in L0, although sorted, are not range partitioned like
the other levels. This means that during L0-L1 compaction each file from L1
needs to be merged with potentially all the files from L0.
This attempt to parallelize the L0-L1 compaction assigns a thread and a
corresponding iterator to each L1 file that then considers only the key range
found in that L1 file and only the L0 files that have those keys (and only the
specific portion of those L0 files in which those keys are found). In this way
the overlap is minimized and potentially eliminated between different iterators
focusing on the same files.
The first step is to restructure the compaction logic to break L0-L1 compactions
into multiple, smaller, sequential compactions. Eventually each of these smaller
jobs will be run simultaneously. Areas to pay extra attention to are
# Correct aggregation of compaction job statistics across multiple threads
# Proper opening/closing of output files (make sure each thread's is unique)
# Keys that span multiple L1 files
# Skewed distributions of keys within L0 files
Test Plan: Make and run db_test (newer version has separate compaction tests) and compaction_job_stats_test
Reviewers: igor, noetzli, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D42699
2015-08-03 20:32:14 +02:00
|
|
|
int num_subcompactions;
|
2014-10-02 01:19:16 +02:00
|
|
|
|
2014-10-24 00:34:21 +02:00
|
|
|
// Misc options
|
|
|
|
uint64_t max_sequential_skip_in_iterations;
|
2015-04-18 00:26:50 +02:00
|
|
|
bool paranoid_file_checks;
|
Add options.compaction_measure_io_stats to print write I/O stats in compactions
Summary:
Add options.compaction_measure_io_stats to print out / pass to listener accumulated time spent on write calls. Example outputs in info logs:
2015/08/12-16:27:59.463944 7fd428bff700 (Original Log Time 2015/08/12-16:27:59.463922) EVENT_LOG_v1 {"time_micros": 1439422079463897, "job": 6, "event": "compaction_finished", "output_level": 1, "num_output_files": 4, "total_output_size": 6900525, "num_input_records": 111483, "num_output_records": 106877, "file_write_nanos": 15663206, "file_range_sync_nanos": 649588, "file_fsync_nanos": 349614797, "file_prepare_write_nanos": 1505812, "lsm_state": [2, 4, 0, 0, 0, 0, 0]}
Add two more counters in iostats_context.
Also add a parameter of db_bench.
Test Plan: Add a unit test. Also manually verify LOG outputs in db_bench
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D44115
2015-08-13 02:24:45 +02:00
|
|
|
bool compaction_measure_io_stats;
|
2014-10-24 00:34:21 +02:00
|
|
|
|
2014-10-02 01:19:16 +02:00
|
|
|
// Derived options
|
|
|
|
// Per-level target file size.
|
|
|
|
std::vector<uint64_t> max_file_size;
|
2014-09-17 21:49:13 +02:00
|
|
|
};
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 20:44:17 +01:00
|
|
|
// NOTE(review): declared here, defined elsewhere. Presumably multiplies op1 by
// op2 while guarding against uint64_t overflow -- confirm what is returned on
// overflow against the implementation.
uint64_t MultiplyCheckOverflow(uint64_t op1, int op2);
|
|
|
|
|
2014-09-17 21:49:13 +02:00
|
|
|
} // namespace rocksdb
|