rocksdb/db/compaction.cc
Andrew Kryczka 9b18cc2363 single-file bottom-level compaction when snapshot released
Summary:
When snapshots are held for a long time, files may reach the bottom level containing overwritten/deleted keys. We previously had no mechanism to trigger compaction on such files. This particularly impacted DBs that write to different parts of the keyspace over time, as such files would never be naturally compacted due to second-last level files moving down. This PR introduces a mechanism for bottommost files to be recompacted upon releasing all snapshots that prevent them from dropping their deleted/overwritten keys.

- Changed `CompactionPicker` to compact files in `BottommostFilesMarkedForCompaction()`. These are the last choice when picking. Each file will be compacted alone and output to the same level in which it originated. The goal of this type of compaction is to rewrite the data excluding deleted/overwritten keys.
- Changed `ReleaseSnapshot()` to recompute the bottom files marked for compaction when the oldest existing snapshot changes, and schedule a compaction if needed. We cache the value that oldest existing snapshot needs to exceed in order for another file to be marked in `bottommost_files_mark_threshold_`, which allows us to avoid recomputing marked files for most snapshot releases.
- Changed `VersionStorageInfo` to track the list of bottommost files, which is recomputed every time the version changes by `UpdateBottommostFiles()`. The list of marked bottommost files is first computed in `ComputeBottommostFilesMarkedForCompaction()` when the version changes, but may also be recomputed when `ReleaseSnapshot()` is called.
- Extracted core logic of `Compaction::IsBottommostLevel()` into `VersionStorageInfo::RangeMightExistAfterSortedRun()` since logic to check whether a file is bottommost is now necessary outside of compaction.
Closes https://github.com/facebook/rocksdb/pull/3009

Differential Revision: D6062044

Pulled By: ajkr

fbshipit-source-id: 123d201cf140715a7d5928e8b3cb4f9cd9f7ad21
2017-10-25 16:30:37 -07:00

469 lines
16 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <vector>
#include "db/column_family.h"
#include "rocksdb/compaction_filter.h"
#include "util/string_util.h"
#include "util/sync_point.h"
namespace rocksdb {
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
uint64_t sum = 0;
for (size_t i = 0; i < files.size() && files[i]; i++) {
sum += files[i]->fd.GetFileSize();
}
return sum;
}
void Compaction::SetInputVersion(Version* _input_version) {
input_version_ = _input_version;
cfd_ = input_version_->cfd();
cfd_->Ref();
input_version_->Ref();
edit_.SetColumnFamily(cfd_->GetID());
}
void Compaction::GetBoundaryKeys(
VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
Slice* largest_user_key) {
bool initialized = false;
const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].files.empty()) {
continue;
}
if (inputs[i].level == 0) {
// we need to consider all files on level 0
for (const auto* f : inputs[i].files) {
const Slice& start_user_key = f->smallest.user_key();
if (!initialized ||
ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
*smallest_user_key = start_user_key;
}
const Slice& end_user_key = f->largest.user_key();
if (!initialized ||
ucmp->Compare(end_user_key, *largest_user_key) > 0) {
*largest_user_key = end_user_key;
}
initialized = true;
}
} else {
// we only need to consider the first and last file
const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
if (!initialized ||
ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
*smallest_user_key = start_user_key;
}
const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
*largest_user_key = end_user_key;
}
initialized = true;
}
}
}
// helper function to determine if compaction is creating files at the
// bottommost level
bool Compaction::IsBottommostLevel(
int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs) {
int output_l0_idx;
if (output_level == 0) {
output_l0_idx = 0;
for (const auto* file : vstorage->LevelFiles(0)) {
if (inputs[0].files.back() == file) {
break;
}
++output_l0_idx;
}
assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
} else {
output_l0_idx = -1;
}
Slice smallest_key, largest_key;
GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
output_level, output_l0_idx);
}
// test function to validate the functionality of IsBottommostLevel()
// function -- determines if compaction with inputs and storage is bottommost
bool Compaction::TEST_IsBottommostLevel(
int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs) {
return IsBottommostLevel(output_level, vstorage, inputs);
}
bool Compaction::IsFullCompaction(
VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs) {
size_t num_files_in_compaction = 0;
size_t total_num_files = 0;
for (int l = 0; l < vstorage->num_levels(); l++) {
total_num_files += vstorage->NumLevelFiles(l);
}
for (size_t i = 0; i < inputs.size(); i++) {
num_files_in_compaction += inputs[i].size();
}
return num_files_in_compaction == total_num_files;
}
Compaction::Compaction(VersionStorageInfo* vstorage,
const ImmutableCFOptions& _immutable_cf_options,
const MutableCFOptions& _mutable_cf_options,
std::vector<CompactionInputFiles> _inputs,
int _output_level, uint64_t _target_file_size,
uint64_t _max_compaction_bytes, uint32_t _output_path_id,
CompressionType _compression,
std::vector<FileMetaData*> _grandparents,
bool _manual_compaction, double _score,
bool _deletion_compaction,
CompactionReason _compaction_reason)
: input_vstorage_(vstorage),
start_level_(_inputs[0].level),
output_level_(_output_level),
max_output_file_size_(_target_file_size),
max_compaction_bytes_(_max_compaction_bytes),
immutable_cf_options_(_immutable_cf_options),
mutable_cf_options_(_mutable_cf_options),
input_version_(nullptr),
number_levels_(vstorage->num_levels()),
cfd_(nullptr),
output_path_id_(_output_path_id),
output_compression_(_compression),
deletion_compaction_(_deletion_compaction),
inputs_(std::move(_inputs)),
grandparents_(std::move(_grandparents)),
score_(_score),
bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
is_manual_compaction_(_manual_compaction),
is_trivial_move_(false),
compaction_reason_(_compaction_reason) {
MarkFilesBeingCompacted(true);
if (is_manual_compaction_) {
compaction_reason_ = CompactionReason::kManualCompaction;
}
#ifndef NDEBUG
for (size_t i = 1; i < inputs_.size(); ++i) {
assert(inputs_[i].level > inputs_[i - 1].level);
}
#endif
// setup input_levels_
{
input_levels_.resize(num_input_levels());
for (size_t which = 0; which < num_input_levels(); which++) {
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
&arena_);
}
}
GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
}
Compaction::~Compaction() {
if (input_version_ != nullptr) {
input_version_->Unref();
}
if (cfd_ != nullptr) {
if (cfd_->Unref()) {
delete cfd_;
}
}
}
bool Compaction::InputCompressionMatchesOutput() const {
int base_level = input_vstorage_->base_level();
bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_,
mutable_cf_options_, start_level_,
base_level) == output_compression_);
if (matches) {
TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
return true;
}
TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
return matches;
}
bool Compaction::IsTrivialMove() const {
// Avoid a move if there is lots of overlapping grandparent data.
// Otherwise, the move could create a parent file that will require
// a very expensive merge later on.
// If start_level_== output_level_, the purpose is to force compaction
// filter to be applied to that level, and thus cannot be a trivial move.
// Check if start level have files with overlapping ranges
if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false) {
// We cannot move files from L0 to L1 if the files are overlapping
return false;
}
if (is_manual_compaction_ &&
(immutable_cf_options_.compaction_filter != nullptr ||
immutable_cf_options_.compaction_filter_factory != nullptr)) {
// This is a manual compaction and we have a compaction filter that should
// be executed, we cannot do a trivial move
return false;
}
// Used in universal compaction, where trivial move can be done if the
// input files are non overlapping
if ((immutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
(output_level_ != 0)) {
return is_trivial_move_;
}
if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
input(0, 0)->fd.GetPathId() == output_path_id() &&
InputCompressionMatchesOutput())) {
return false;
}
// assert inputs_.size() == 1
for (const auto& file : inputs_.front().files) {
std::vector<FileMetaData*> file_grand_parents;
if (output_level_ + 1 >= number_levels_) {
continue;
}
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
&file->largest, &file_grand_parents);
const auto compaction_size =
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
if (compaction_size > max_compaction_bytes_) {
return false;
}
}
return true;
}
void Compaction::AddInputDeletions(VersionEdit* out_edit) {
for (size_t which = 0; which < num_input_levels(); which++) {
for (size_t i = 0; i < inputs_[which].size(); i++) {
out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
}
}
}
bool Compaction::KeyNotExistsBeyondOutputLevel(
const Slice& user_key, std::vector<size_t>* level_ptrs) const {
assert(input_version_ != nullptr);
assert(level_ptrs != nullptr);
assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
if (output_level_ == 0) {
return false;
}
// Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files =
input_vstorage_->LevelFiles(lvl);
for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
auto* f = files[level_ptrs->at(lvl)];
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
// We've advanced far enough
if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
// Key falls in this file's range, so definitely
// exists beyond output level
return false;
}
break;
}
}
}
return true;
} else {
return bottommost_level_;
}
}
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
for (size_t i = 0; i < num_input_levels(); i++) {
for (size_t j = 0; j < inputs_[i].size(); j++) {
assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
: inputs_[i][j]->being_compacted);
inputs_[i][j]->being_compacted = mark_as_compacted;
}
}
}
// Sample output:
// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
// print: "3@0 + 2@3 + 1@4 files to L5"
const char* Compaction::InputLevelSummary(
InputLevelSummaryBuffer* scratch) const {
int len = 0;
bool is_first = true;
for (auto& input_level : inputs_) {
if (input_level.empty()) {
continue;
}
if (!is_first) {
len +=
snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
} else {
is_first = false;
}
len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
"%" ROCKSDB_PRIszt "@%d", input_level.size(),
input_level.level);
}
snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
" files to L%d", output_level());
return scratch->buffer;
}
uint64_t Compaction::CalculateTotalInputSize() const {
uint64_t size = 0;
for (auto& input_level : inputs_) {
for (auto f : input_level.files) {
size += f->fd.GetFileSize();
}
}
return size;
}
void Compaction::ReleaseCompactionFiles(Status status) {
MarkFilesBeingCompacted(false);
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
}
void Compaction::ResetNextCompactionIndex() {
assert(input_version_ != nullptr);
input_vstorage_->ResetNextCompactionIndex(start_level_);
}
namespace {
int InputSummary(const std::vector<FileMetaData*>& files, char* output,
int len) {
*output = '\0';
int write = 0;
for (size_t i = 0; i < files.size(); i++) {
int sz = len - write;
int ret;
char sztxt[16];
AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
files.at(i)->fd.GetNumber(), sztxt);
if (ret < 0 || ret >= sz) break;
write += ret;
}
// if files.size() is non-zero, overwrite the last space
return write - !!files.size();
}
} // namespace
void Compaction::Summary(char* output, int len) {
int write =
snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
input_version_->GetVersionNumber(), start_level_);
if (write < 0 || write >= len) {
return;
}
for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
if (level_iter > 0) {
write += snprintf(output + write, len - write, "], [");
if (write < 0 || write >= len) {
return;
}
}
write +=
InputSummary(inputs_[level_iter].files, output + write, len - write);
if (write < 0 || write >= len) {
return;
}
}
snprintf(output + write, len - write, "]");
}
uint64_t Compaction::OutputFilePreallocationSize() const {
uint64_t preallocation_size = 0;
if (max_output_file_size_ != port::kMaxUint64 &&
(cfd_->ioptions()->compaction_style == kCompactionStyleLevel ||
output_level() > 0)) {
preallocation_size = max_output_file_size_;
} else {
for (const auto& level_files : inputs_) {
for (const auto& file : level_files.files) {
preallocation_size += file->fd.GetFileSize();
}
}
}
// Over-estimate slightly so we don't end up just barely crossing
// the threshold
return preallocation_size + (preallocation_size / 10);
}
std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
if (!cfd_->ioptions()->compaction_filter_factory) {
return nullptr;
}
CompactionFilter::Context context;
context.is_full_compaction = is_full_compaction_;
context.is_manual_compaction = is_manual_compaction_;
context.column_family_id = cfd_->GetID();
return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
context);
}
bool Compaction::IsOutputLevelEmpty() const {
return inputs_.back().level != output_level_ || inputs_.back().empty();
}
bool Compaction::ShouldFormSubcompactions() const {
if (immutable_cf_options_.max_subcompactions <= 1 || cfd_ == nullptr) {
return false;
}
if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
return start_level_ == 0 && output_level_ > 0 && !IsOutputLevelEmpty();
} else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
return number_levels_ > 1 && output_level_ > 0;
} else {
return false;
}
}
uint64_t Compaction::MaxInputFileCreationTime() const {
uint64_t max_creation_time = 0;
for (const auto& file : inputs_[0].files) {
if (file->fd.table_reader != nullptr &&
file->fd.table_reader->GetTableProperties() != nullptr) {
uint64_t creation_time =
file->fd.table_reader->GetTableProperties()->creation_time;
max_creation_time = std::max(max_creation_time, creation_time);
}
}
return max_creation_time;
}
} // namespace rocksdb