2014-11-01 00:31:25 +01:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/compaction_job.h"
|
|
|
|
|
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <vector>
|
|
|
|
#include <memory>
|
|
|
|
#include <list>
|
|
|
|
|
|
|
|
#include "db/builder.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/merge_helper.h"
|
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "db/merge_context.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/likely.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/block.h"
|
|
|
|
#include "table/block_based_table_factory.h"
|
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/logging.h"
|
|
|
|
#include "util/log_buffer.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/perf_context_imp.h"
|
|
|
|
#include "util/iostats_context_imp.h"
|
|
|
|
#include "util/stop_watch.h"
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
#include "util/string_util.h"
|
2014-11-01 00:31:25 +01:00
|
|
|
#include "util/sync_point.h"
|
2015-01-13 09:04:08 +01:00
|
|
|
#include "util/thread_status_util.h"
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
// Mutable working state of one compaction: the output files produced so far,
// the output file currently being built, record counters, and the scratch
// buffers used when the compaction is driven through a compaction filter V2.
struct CompactionJob::CompactionState {
  Compaction* const compaction;

  // Files produced by compaction
  struct Output {
    uint64_t number;
    uint32_t path_id;
    uint64_t file_size;
    InternalKey smallest, largest;
    SequenceNumber smallest_seqno, largest_seqno;
  };
  std::vector<Output> outputs;

  // State kept for output being generated
  std::unique_ptr<WritableFile> outfile;
  std::unique_ptr<TableBuilder> builder;

  uint64_t total_bytes;

  // Returns the most recently added output.
  // Precondition: `outputs` is not empty.
  Output* current_output() { return &outputs.back(); }

  explicit CompactionState(Compaction* c)
      : compaction(c),
        total_bytes(0),
        num_input_records(0),
        num_output_records(0) {}

  // Create a client visible context of this compaction
  CompactionFilter::Context GetFilterContextV1() {
    CompactionFilter::Context context;
    context.is_full_compaction = compaction->IsFullCompaction();
    context.is_manual_compaction = compaction->IsManualCompaction();
    return context;
  }

  // Create a client visible context of this compaction
  CompactionFilterContext GetFilterContext() {
    CompactionFilterContext context;
    context.is_full_compaction = compaction->IsFullCompaction();
    context.is_manual_compaction = compaction->IsManualCompaction();
    return context;
  }

  // Keys/values queued for compaction filter V2 (see BufferKeyValueSlices).
  std::vector<std::string> key_str_buf_;
  std::vector<std::string> existing_value_str_buf_;
  // new_value_buf_ will only be appended if a value changes
  std::vector<std::string> new_value_buf_;
  // if value_changed_buf_[i] is true
  // new_value_buf_ will add a new entry with the changed value
  std::vector<bool> value_changed_buf_;
  // to_delete_buf_[i] is true iff key_str_buf_[i] is deleted
  std::vector<bool> to_delete_buf_;

  // Keys/values that bypass compaction filter V2
  // (see BufferOtherKeyValueSlices).
  std::vector<std::string> other_key_str_buf_;
  std::vector<std::string> other_value_str_buf_;

  // Merged view over the two batches above. These are Slices: the bytes they
  // point at are owned by the string buffers, so the string buffers must
  // outlive any use of the combined buffers.
  std::vector<Slice> combined_key_buf_;
  std::vector<Slice> combined_value_buf_;

  std::string cur_prefix_;

  uint64_t num_input_records;
  uint64_t num_output_records;

  // Buffers the kv-pair that will be run through compaction filter V2
  // in the future.
  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
    key_str_buf_.emplace_back(key.ToString());
    existing_value_str_buf_.emplace_back(value.ToString());
  }

  // Buffers the kv-pair that will not be run through compaction filter V2
  // in the future.
  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
    other_key_str_buf_.emplace_back(key.ToString());
    other_value_str_buf_.emplace_back(value.ToString());
  }

  // Add a kv-pair to the combined buffer
  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
    // The real strings are stored in the batch buffers
    combined_key_buf_.emplace_back(key);
    combined_value_buf_.emplace_back(value);
  }

  // Merges the filter batch (key_str_buf_ / existing_value_str_buf_) and the
  // bypass batch (other_key_str_buf_ / other_value_str_buf_) into the
  // combined buffers, ordered by `comparator`.
  //
  // An entry from the bypass batch is taken only when it compares strictly
  // smaller; on a tie the filter-batch entry goes first. Every iteration
  // consumes exactly one entry, so the merge terminates even if the two
  // batches contain keys that compare equal (the previous formulation
  // advanced neither index in that case and never returned).
  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
    const size_t num_filtered = key_str_buf_.size();
    const size_t num_other = other_key_str_buf_.size();
    const size_t total_size = num_filtered + num_other;
    combined_key_buf_.reserve(total_size);
    combined_value_buf_.reserve(total_size);

    size_t i = 0;
    size_t j = 0;
    while (i < num_filtered && j < num_other) {
      if (comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]) > 0) {
        AddToCombinedKeyValueSlices(other_key_str_buf_[j],
                                    other_value_str_buf_[j]);
        ++j;
      } else {
        AddToCombinedKeyValueSlices(key_str_buf_[i],
                                    existing_value_str_buf_[i]);
        ++i;
      }
    }
    // At most one of the two batches still has entries; append them as-is.
    for (; i < num_filtered; ++i) {
      AddToCombinedKeyValueSlices(key_str_buf_[i], existing_value_str_buf_[i]);
    }
    for (; j < num_other; ++j) {
      AddToCombinedKeyValueSlices(other_key_str_buf_[j],
                                  other_value_str_buf_[j]);
    }
  }

  // Empties both batches and asks the containers to give back their capacity.
  void CleanupBatchBuffer() {
    to_delete_buf_.clear();
    key_str_buf_.clear();
    existing_value_str_buf_.clear();
    new_value_buf_.clear();
    value_changed_buf_.clear();

    to_delete_buf_.shrink_to_fit();
    key_str_buf_.shrink_to_fit();
    existing_value_str_buf_.shrink_to_fit();
    new_value_buf_.shrink_to_fit();
    value_changed_buf_.shrink_to_fit();

    other_key_str_buf_.clear();
    other_value_str_buf_.clear();
    other_key_str_buf_.shrink_to_fit();
    other_value_str_buf_.shrink_to_fit();
  }

  // Empties the combined (merged) buffers and asks them to give back their
  // capacity.
  void CleanupMergedBuffer() {
    combined_key_buf_.clear();
    combined_value_buf_.clear();
    combined_key_buf_.shrink_to_fit();
    combined_value_buf_.shrink_to_fit();
  }
};
|
|
|
|
|
|
|
|
// Builds a job for `compaction`. A fresh CompactionState is allocated for it,
// the collaborators are stored (by-value arguments are moved into place), and
// the calling thread's status is updated to describe the compaction.
CompactionJob::CompactionJob(
    int job_id, Compaction* compaction, const DBOptions& db_options,
    const EnvOptions& env_options, VersionSet* versions,
    std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
    Directory* db_directory, Directory* output_directory, Statistics* stats,
    std::vector<SequenceNumber> existing_snapshots,
    std::shared_ptr<Cache> table_cache,
    std::function<uint64_t()> yield_callback, EventLogger* event_logger,
    bool paranoid_file_checks)
    : job_id_(job_id),
      compact_(new CompactionState(compaction)),
      compaction_stats_(1),
      db_options_(db_options),
      env_options_(env_options),
      env_(db_options.env),
      versions_(versions),
      shutting_down_(shutting_down),
      log_buffer_(log_buffer),
      db_directory_(db_directory),
      output_directory_(output_directory),
      stats_(stats),
      existing_snapshots_(std::move(existing_snapshots)),
      table_cache_(std::move(table_cache)),
      yield_callback_(std::move(yield_callback)),
      event_logger_(event_logger),
      paranoid_file_checks_(paranoid_file_checks) {
  // Tag this thread with the column family and operation it is working on,
  // then record the per-compaction thread-status properties.
  auto* cfd = compact_->compaction->column_family_data();
  ThreadStatusUtil::SetColumnFamily(cfd);
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
  ReportStartedCompaction(compaction);
}
|
|
|
|
|
|
|
|
// The compaction state must already have been released (compact_ is null)
// by the time the job is destroyed; the thread status is reset on the way
// out.
CompactionJob::~CompactionJob() {
  assert(!compact_);
  ThreadStatusUtil::ResetThreadStatus();
}
|
2014-11-01 00:31:25 +01:00
|
|
|
|
2015-05-07 07:50:35 +02:00
|
|
|
// Publishes the properties of the starting compaction (job id, input/output
// levels, flags, total input size) to the thread-status facility and resets
// the per-thread IO byte counters.
void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
  ThreadStatusUtil::SetColumnFamily(
      compact_->compaction->column_family_data());

  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
                                               job_id_);

  // Start level goes in the upper 32 bits; the output level is added on top.
  const uint64_t start_level_bits =
      static_cast<uint64_t>(compact_->compaction->start_level()) << 32;
  ThreadStatusUtil::SetThreadOperationProperty(
      ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
      start_level_bits + compact_->compaction->output_level());

  // bit 0: manual compaction, bit 1: deletion compaction, bit 2: trivial move.
  const auto prop_flags = compaction->IsManualCompaction() +
                          (compaction->IsDeletionCompaction() << 1) +
                          (compaction->IsTrivialMove() << 2);
  ThreadStatusUtil::SetThreadOperationProperty(
      ThreadStatus::COMPACTION_PROP_FLAGS, prop_flags);

  ThreadStatusUtil::SetThreadOperationProperty(
      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
      compaction->CalculateTotalInputSize());

  // Byte counters start from zero for this compaction.
  IOSTATS_RESET(bytes_written);
  IOSTATS_RESET(bytes_read);
  ThreadStatusUtil::SetThreadOperationProperty(
      ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
  ThreadStatusUtil::SetThreadOperationProperty(
      ThreadStatus::COMPACTION_BYTES_READ, 0);

  // Set the thread operation after operation properties
  // to ensure GetThreadList() can always show them all together.
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
}
|
|
|
|
|
2014-11-01 00:31:25 +01:00
|
|
|
void CompactionJob::Prepare() {
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_updater(
|
|
|
|
ThreadStatus::STAGE_COMPACTION_PREPARE);
|
2014-11-01 00:31:25 +01:00
|
|
|
compact_->CleanupBatchBuffer();
|
|
|
|
compact_->CleanupMergedBuffer();
|
|
|
|
|
|
|
|
// Generate file_levels_ for compaction berfore making Iterator
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
ColumnFamilyData* cfd __attribute__((unused)) =
|
|
|
|
compact_->compaction->column_family_data();
|
2014-11-14 20:35:48 +01:00
|
|
|
assert(cfd != nullptr);
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
assert(cfd->current()->storage_info()->NumLevelFiles(
|
|
|
|
compact_->compaction->level()) > 0);
|
|
|
|
assert(compact_->builder == nullptr);
|
|
|
|
assert(!compact_->outfile);
|
|
|
|
|
|
|
|
visible_at_tip_ = 0;
|
|
|
|
latest_snapshot_ = 0;
|
2015-05-06 04:01:12 +02:00
|
|
|
if (existing_snapshots_.size() == 0) {
|
2014-11-01 00:31:25 +01:00
|
|
|
// optimize for fast path if there are no snapshots
|
|
|
|
visible_at_tip_ = versions_->LastSequence();
|
|
|
|
earliest_snapshot_ = visible_at_tip_;
|
|
|
|
} else {
|
2015-05-06 04:01:12 +02:00
|
|
|
latest_snapshot_ = existing_snapshots_.back();
|
2014-11-01 00:31:25 +01:00
|
|
|
// Add the current seqno as the 'latest' virtual
|
|
|
|
// snapshot to the end of this list.
|
2015-05-06 04:01:12 +02:00
|
|
|
existing_snapshots_.push_back(versions_->LastSequence());
|
|
|
|
earliest_snapshot_ = existing_snapshots_[0];
|
2014-11-01 00:31:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Is this compaction producing files at the bottommost level?
|
|
|
|
bottommost_level_ = compact_->compaction->BottomMostLevel();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Executes the compaction: logs the start event, iterates over the input,
// writes the output files (optionally routing eligible values through a
// compaction filter V2 in per-prefix batches), and fills compaction_stats_.
//
// Returns the first non-OK status encountered, ShutdownInProgress if the DB
// is shutting down or the column family was dropped, or otherwise the input
// iterator's final status.
Status CompactionJob::Run() {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_RUN);
  TEST_SYNC_POINT("CompactionJob::Run():Start");
  log_buffer_->FlushBufferToLog();
  ColumnFamilyData* cfd = compact_->compaction->column_family_data();

  auto* compaction = compact_->compaction;
  // Let's check if anything will get logged. Don't prepare all the info if
  // we're not logging
  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
    Compaction::InputLevelSummaryBuffer inputs_summary;
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[%s] [JOB %d] Compacting %s, score %.2f", cfd->GetName().c_str(),
        job_id_, compaction->InputLevelSummary(&inputs_summary),
        compaction->score());
    char scratch[2345];
    compact_->compaction->Summary(scratch, sizeof(scratch));
    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
        "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch);
    // build event logger report
    auto stream = event_logger_->Log();
    stream << "job" << job_id_ << "event"
           << "compaction_started";
    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
      stream << ("files_L" + ToString(compaction->level(i)));
      stream.StartArray();
      for (auto f : *compaction->inputs(i)) {
        stream << f->fd.GetNumber();
      }
      stream.EndArray();
    }
    stream << "score" << compaction->score() << "input_data_size"
           << compaction->CalculateTotalInputSize();
  }

  const uint64_t start_micros = env_->NowMicros();
  std::unique_ptr<Iterator> input(
      versions_->MakeInputIterator(compact_->compaction));
  input->SeekToFirst();

  Status status;
  ParsedInternalKey ikey;
  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2 =
      nullptr;
  auto context = compact_->GetFilterContext();
  compaction_filter_from_factory_v2 =
      cfd->ioptions()->compaction_filter_factory_v2->CreateCompactionFilterV2(
          context);
  auto compaction_filter_v2 = compaction_filter_from_factory_v2.get();

  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
  if (!compaction_filter_v2) {
    status = ProcessKeyValueCompaction(&imm_micros, input.get(), false);
  } else {
    // Compaction filter V2 path. A second iterator (backup_input) walks the
    // input and, for each run of keys sharing a prefix:
    // 1) buffers filter-eligible values and all other entries into two
    //    separate buffers;
    // 2) sends the eligible buffer to the compaction filter;
    // 3) merges the two buffers back into one ordered batch;
    // 4) runs the regular key/value processing over that batch.
    bool prefix_initialized = false;
    std::shared_ptr<Iterator> backup_input(
        versions_->MakeInputIterator(compact_->compaction));
    backup_input->SeekToFirst();
    uint64_t total_filter_time = 0;
    while (backup_input->Valid() &&
           !shutting_down_->load(std::memory_order_acquire) &&
           !cfd->IsDropped()) {
      // FLUSH preempts compaction
      // TODO(icanadi) this currently only checks if flush is necessary on
      // compacting column family. we should also check if flush is necessary on
      // other column families, too

      imm_micros += yield_callback_();

      Slice key = backup_input->key();
      Slice value = backup_input->value();

      if (!ParseInternalKey(key, &ikey)) {
        // log error
        Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
            "[%s] [JOB %d] Failed to parse key: %s", cfd->GetName().c_str(),
            job_id_, key.ToString().c_str());
        // Step past the unparsable entry. Without this the iterator never
        // moves, so the loop would re-read and re-log the same key until
        // shutdown or column family drop.
        // NOTE(review): the entry is not buffered, so presumably it is not
        // emitted by the V2 path -- confirm this is the desired handling of
        // corrupt keys.
        backup_input->Next();
        continue;
      } else {
        const SliceTransform* transformer =
            cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor();
        const auto key_prefix = transformer->Transform(ikey.user_key);
        if (!prefix_initialized) {
          compact_->cur_prefix_ = key_prefix.ToString();
          prefix_initialized = true;
        }
        // If the prefix remains the same, keep buffering
        if (key_prefix.compare(Slice(compact_->cur_prefix_)) == 0) {
          // Apply the compaction filter V2 to all the kv pairs sharing
          // the same prefix
          if (ikey.type == kTypeValue &&
              (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
            // Buffer all keys sharing the same prefix for CompactionFilterV2
            compact_->BufferKeyValueSlices(key, value);
          } else {
            // buffer ineligible keys
            compact_->BufferOtherKeyValueSlices(key, value);
          }
          backup_input->Next();
          continue;
        } else {
          // Now prefix changes, this batch is done.
          // Call compaction filter on the buffered values to change the value
          if (compact_->key_str_buf_.size() > 0) {
            uint64_t time = 0;
            CallCompactionFilterV2(compaction_filter_v2, &time);
            total_filter_time += time;
          }
          compact_->cur_prefix_ = key_prefix.ToString();
        }
      }

      // Merge this batch of data (values + ineligible keys)
      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());

      // Done buffering for the current prefix. Spit it out to disk
      // Now just iterate through all the kv-pairs
      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);

      if (!status.ok()) {
        break;
      }

      // After writing the kv-pairs, we can safely remove the reference
      // to the string buffer and clean them up
      compact_->CleanupBatchBuffer();
      compact_->CleanupMergedBuffer();
      // Buffer the key that triggers the mismatch in prefix
      if (ikey.type == kTypeValue &&
          (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
        compact_->BufferKeyValueSlices(key, value);
      } else {
        compact_->BufferOtherKeyValueSlices(key, value);
      }
      backup_input->Next();
      if (!backup_input->Valid()) {
        // If this is the single last value, we need to merge it.
        if (compact_->key_str_buf_.size() > 0) {
          uint64_t time = 0;
          CallCompactionFilterV2(compaction_filter_v2, &time);
          total_filter_time += time;
        }
        compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());

        status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
        if (!status.ok()) {
          break;
        }

        compact_->CleanupBatchBuffer();
        compact_->CleanupMergedBuffer();
      }
    }  // done processing all prefix batches
    // finish the last batch
    if (status.ok()) {
      if (compact_->key_str_buf_.size() > 0) {
        uint64_t time = 0;
        CallCompactionFilterV2(compaction_filter_v2, &time);
        total_filter_time += time;
      }
      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
    }
    RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
  }  // checking for compaction filter v2

  if (status.ok() &&
      (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) {
    status = Status::ShutdownInProgress(
        "Database shutdown or Column family drop during compaction");
  }
  if (status.ok() && compact_->builder != nullptr) {
    status = FinishCompactionOutputFile(input.get());
  }
  if (status.ok()) {
    status = input->status();
  }
  input.reset();

  if (output_directory_ && !db_options_.disableDataSync) {
    output_directory_->Fsync();
  }

  // Time spent in yield_callback_ (imm_micros) is not charged to the
  // compaction.
  compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros;
  compaction_stats_.files_in_leveln =
      static_cast<int>(compact_->compaction->num_input_files(0));
  compaction_stats_.files_in_levelnp1 =
      static_cast<int>(compact_->compaction->num_input_files(1));
  MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros);

  size_t num_output_files = compact_->outputs.size();
  if (compact_->builder != nullptr) {
    // An error occurred so ignore the last output.
    assert(num_output_files > 0);
    --num_output_files;
  }
  compaction_stats_.files_out_levelnp1 = static_cast<int>(num_output_files);

  for (size_t i = 0; i < compact_->compaction->num_input_files(0); i++) {
    compaction_stats_.bytes_readn +=
        compact_->compaction->input(0, i)->fd.GetFileSize();
    compaction_stats_.num_input_records +=
        static_cast<uint64_t>(compact_->compaction->input(0, i)->num_entries);
  }

  for (size_t i = 0; i < compact_->compaction->num_input_files(1); i++) {
    compaction_stats_.bytes_readnp1 +=
        compact_->compaction->input(1, i)->fd.GetFileSize();
  }

  for (size_t i = 0; i < num_output_files; i++) {
    compaction_stats_.bytes_written += compact_->outputs[i].file_size;
  }
  // Guarded so the unsigned subtraction cannot wrap.
  if (compact_->num_input_records > compact_->num_output_records) {
    compaction_stats_.num_dropped_records +=
        compact_->num_input_records - compact_->num_output_records;
  }

  RecordCompactionIOStats();

  LogFlush(db_options_.info_log);
  TEST_SYNC_POINT("CompactionJob::Run():End");
  return status;
}
|
|
|
|
|
2015-05-06 04:01:12 +02:00
|
|
|
void CompactionJob::Install(Status* status,
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
InstrumentedMutex* db_mutex) {
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_updater(
|
|
|
|
ThreadStatus::STAGE_COMPACTION_INSTALL);
|
2014-11-08 00:44:12 +01:00
|
|
|
db_mutex->AssertHeld();
|
2014-11-01 00:31:25 +01:00
|
|
|
ColumnFamilyData* cfd = compact_->compaction->column_family_data();
|
|
|
|
cfd->internal_stats()->AddCompactionStats(
|
|
|
|
compact_->compaction->output_level(), compaction_stats_);
|
|
|
|
|
2014-11-10 20:57:58 +01:00
|
|
|
if (status->ok()) {
|
2015-05-06 04:01:12 +02:00
|
|
|
*status = InstallCompactionResults(db_mutex, mutable_cf_options);
|
2014-11-01 00:31:25 +01:00
|
|
|
}
|
|
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
auto vstorage = cfd->current()->storage_info();
|
2014-11-01 00:31:25 +01:00
|
|
|
const auto& stats = compaction_stats_;
|
|
|
|
LogToBuffer(log_buffer_,
|
|
|
|
"[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
|
|
|
|
"files in(%d, %d) out(%d) "
|
|
|
|
"MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
|
|
|
|
"write-amplify(%.1f) %s, records in: %d, records dropped: %d\n",
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
cfd->GetName().c_str(), vstorage->LevelSummary(&tmp),
|
2014-11-01 00:31:25 +01:00
|
|
|
(stats.bytes_readn + stats.bytes_readnp1) /
|
|
|
|
static_cast<double>(stats.micros),
|
|
|
|
stats.bytes_written / static_cast<double>(stats.micros),
|
|
|
|
compact_->compaction->output_level(), stats.files_in_leveln,
|
|
|
|
stats.files_in_levelnp1, stats.files_out_levelnp1,
|
|
|
|
stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
|
|
|
|
stats.bytes_written / 1048576.0,
|
|
|
|
(stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
|
|
|
|
static_cast<double>(stats.bytes_readn),
|
|
|
|
stats.bytes_written / static_cast<double>(stats.bytes_readn),
|
2014-11-10 20:57:58 +01:00
|
|
|
status->ToString().c_str(), stats.num_input_records,
|
2014-11-01 00:31:25 +01:00
|
|
|
stats.num_dropped_records);
|
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
auto stream = event_logger_->LogToBuffer(log_buffer_);
|
|
|
|
stream << "job" << job_id_ << "event"
|
|
|
|
<< "compaction_finished"
|
|
|
|
<< "output_level" << compact_->compaction->output_level()
|
|
|
|
<< "num_output_files" << compact_->outputs.size()
|
|
|
|
<< "total_output_size" << compact_->total_bytes << "num_input_records"
|
|
|
|
<< compact_->num_input_records << "num_output_records"
|
|
|
|
<< compact_->num_output_records;
|
|
|
|
stream << "lsm_state";
|
|
|
|
stream.StartArray();
|
|
|
|
for (int level = 0; level < vstorage->num_levels(); ++level) {
|
|
|
|
stream << vstorage->NumLevelFiles(level);
|
|
|
|
}
|
|
|
|
stream.EndArray();
|
|
|
|
|
2014-11-10 20:57:58 +01:00
|
|
|
CleanupCompaction(*status);
|
2014-11-01 00:31:25 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Core compaction loop: walks the input entries, decides for each one whether
// it is dropped or kept, and appends the kept entries to the current output
// table, opening and finishing output files as needed.
//
// imm_micros        accumulates the value returned by yield_callback_() on
//                   every loop iteration.
// input             iterator over the entries being compacted. It is advanced
//                   here, or by MergeHelper::MergeUntil for merge entries.
// is_compaction_v2  when true, key/value pairs are taken from
//                   compact_->combined_key_buf_ / combined_value_buf_ instead
//                   of from input->key() / input->value(), and the V1
//                   compaction filter is not invoked here.
//
// Returns the first non-OK status hit while opening/finishing an output file
// or while handling a merge entry without a merge operator; OK otherwise.
Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros,
                                                Iterator* input,
                                                bool is_compaction_v2) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
  size_t combined_idx = 0;
  Status status;
  std::string compaction_filter_value;
  ParsedInternalKey ikey;
  IterKey current_user_key;
  bool has_current_user_key = false;
  IterKey delete_key;
  // Only read inside assert(), hence the unused attribute.
  SequenceNumber last_sequence_for_key __attribute__((unused)) =
      kMaxSequenceNumber;
  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
  MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator,
                    db_options_.info_log.get(),
                    cfd->ioptions()->min_partial_merge_operands,
                    false /* internal key corruption is expected */);
  // Prefer the filter configured directly in the options; otherwise ask the
  // factory for one and keep it alive for the duration of this call.
  auto compaction_filter = cfd->ioptions()->compaction_filter;
  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
  if (!compaction_filter) {
    auto context = compact_->GetFilterContextV1();
    compaction_filter_from_factory =
        cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter(
            context);
    compaction_filter = compaction_filter_from_factory.get();
  }

  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");

  // Drop counters are batched locally and flushed to the statistics object
  // every 1000 iterations (and once more after the loop).
  int64_t key_drop_user = 0;
  int64_t key_drop_newer_entry = 0;
  int64_t key_drop_obsolete = 0;
  int64_t loop_cnt = 0;

  StopWatchNano timer(env_, stats_ != nullptr);
  uint64_t total_filter_time = 0;
  while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) &&
         !cfd->IsDropped() && status.ok()) {
    compact_->num_input_records++;
    if (++loop_cnt > 1000) {
      if (key_drop_user > 0) {
        RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
        key_drop_user = 0;
      }
      if (key_drop_newer_entry > 0) {
        RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
                   key_drop_newer_entry);
        key_drop_newer_entry = 0;
      }
      if (key_drop_obsolete > 0) {
        RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
        key_drop_obsolete = 0;
      }
      RecordCompactionIOStats();
      loop_cnt = 0;
    }
    // FLUSH preempts compaction
    // TODO(icanadi) this currently only checks if flush is necessary on
    // compacting column family. we should also check if flush is necessary on
    // other column families, too
    (*imm_micros) += yield_callback_();

    Slice key;
    Slice value;
    // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
    // This prefix batch should contain results after calling
    // compaction_filter_v2.
    //
    // If is_compaction_v2 is off, this function will go through all the
    // kv-pairs in input.
    if (!is_compaction_v2) {
      key = input->key();
      value = input->value();
    } else {
      if (combined_idx >= compact_->combined_key_buf_.size()) {
        break;
      }
      assert(combined_idx < compact_->combined_key_buf_.size());
      key = compact_->combined_key_buf_[combined_idx];
      value = compact_->combined_value_buf_[combined_idx];

      ++combined_idx;
    }

    // Cut the current output file before this key if the compaction asks for
    // it and a file is currently open.
    if (compact_->compaction->ShouldStopBefore(key) &&
        compact_->builder != nullptr) {
      status = FinishCompactionOutputFile(input);
      if (!status.ok()) {
        break;
      }
    }

    // Handle key/value, add to state, etc.
    bool drop = false;
    bool current_entry_is_merging = false;
    if (!ParseInternalKey(key, &ikey)) {
      // Do not hide error keys
      // TODO: error key stays in db forever? Figure out the intention/rationale
      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
      current_user_key.Clear();
      has_current_user_key = false;
      last_sequence_for_key = kMaxSequenceNumber;
      visible_in_snapshot = kMaxSequenceNumber;
    } else {
      if (!has_current_user_key ||
          cfd->user_comparator()->Compare(ikey.user_key,
                                          current_user_key.GetKey()) != 0) {
        // First occurrence of this user key
        current_user_key.SetKey(ikey.user_key);
        has_current_user_key = true;
        last_sequence_for_key = kMaxSequenceNumber;
        visible_in_snapshot = kMaxSequenceNumber;
        // apply the compaction filter to the first occurrence of the user key
        if (compaction_filter && !is_compaction_v2 && ikey.type == kTypeValue &&
            (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
          // If the user has specified a compaction filter and the sequence
          // number is greater than any external snapshot, then invoke the
          // filter.
          // If the return value of the compaction filter is true, replace
          // the entry with a delete marker.
          bool value_changed = false;
          compaction_filter_value.clear();
          if (stats_ != nullptr) {
            timer.Start();
          }
          bool to_delete = compaction_filter->Filter(
              compact_->compaction->level(), ikey.user_key, value,
              &compaction_filter_value, &value_changed);
          total_filter_time += timer.ElapsedNanos();
          if (to_delete) {
            // make a copy of the original key and convert it to a delete
            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
                                      kTypeDeletion);
            // anchor the key again
            key = delete_key.GetKey();
            // needed because ikey is backed by key
            ParseInternalKey(key, &ikey);
            // no value associated with delete
            value.clear();
            ++key_drop_user;
          } else if (value_changed) {
            value = compaction_filter_value;
          }
        }
      }

      // If there are no snapshots, then this kv affects visibility at tip.
      // Otherwise, search through all existing snapshots to find
      // the earliest snapshot that is affected by this kv.
      SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
      SequenceNumber visible =
          visible_at_tip_
              ? visible_at_tip_
              : findEarliestVisibleSnapshot(ikey.sequence, existing_snapshots_,
                                            &prev_snapshot);

      if (visible_in_snapshot == visible) {
        // If the earliest snapshot in which this key is visible
        // is the same as the visibility of a previous instance of the
        // same key, then this kv is not visible in any snapshot.
        // Hidden by a newer entry for same user key
        // TODO: why not > ?
        assert(last_sequence_for_key >= ikey.sequence);
        drop = true;  // (A)
        ++key_drop_newer_entry;
      } else if (ikey.type == kTypeDeletion &&
                 ikey.sequence <= earliest_snapshot_ &&
                 compact_->compaction->KeyNotExistsBeyondOutputLevel(
                     ikey.user_key)) {
        // For this user key:
        // (1) there is no data in higher levels
        // (2) data in lower levels will have larger sequence numbers
        // (3) data in layers that are being compacted here and have
        //     smaller sequence numbers will be dropped in the next
        //     few iterations of this loop (by rule (A) above).
        // Therefore this deletion marker is obsolete and can be dropped.
        drop = true;
        ++key_drop_obsolete;
      } else if (ikey.type == kTypeMerge) {
        if (!merge.HasOperator()) {
          LogToBuffer(log_buffer_, "Options::merge_operator is null.");
          status = Status::InvalidArgument(
              "merge_operator is not properly initialized.");
          break;
        }
        // We know the merge type entry is not hidden, otherwise we would
        // have hit (A)
        // We encapsulate the merge related state machine in a different
        // object to minimize change to the existing flow. Turn out this
        // logic could also be nicely re-used for memtable flush purge
        // optimization in BuildTable.
        int steps = 0;
        merge.MergeUntil(input, prev_snapshot, bottommost_level_,
                         db_options_.statistics.get(), &steps, env_);
        // Skip the Merge ops
        combined_idx = combined_idx - 1 + steps;

        current_entry_is_merging = true;
        if (merge.IsSuccess()) {
          // Successfully found Put/Delete/(end-of-key-range) while merging
          // Get the merge result
          key = merge.key();
          ParseInternalKey(key, &ikey);
          value = merge.value();
        } else {
          // Did not find a Put/Delete/(end-of-key-range) while merging
          // We now have some stack of merge operands to write out.
          // NOTE: key,value, and ikey are now referring to old entries.
          //       These will be correctly set below.
          assert(!merge.keys().empty());
          assert(merge.keys().size() == merge.values().size());

          // Hack to make sure last_sequence_for_key is correct
          ParseInternalKey(merge.keys().front(), &ikey);
        }
      }

      last_sequence_for_key = ikey.sequence;
      visible_in_snapshot = visible;
    }

    if (!drop) {
      // We may write a single key (e.g.: for Put/Delete or successful merge).
      // Or we may instead have to write a sequence/list of keys.
      // We have to write a sequence iff we have an unsuccessful merge
      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
      const std::deque<std::string>* keys = nullptr;
      const std::deque<std::string>* values = nullptr;
      std::deque<std::string>::const_reverse_iterator key_iter;
      std::deque<std::string>::const_reverse_iterator value_iter;
      if (has_merge_list) {
        keys = &merge.keys();
        values = &merge.values();
        key_iter = keys->rbegin();  // The back (*rbegin()) is the first key
        value_iter = values->rbegin();

        key = Slice(*key_iter);
        value = Slice(*value_iter);
      }

      // If we have a list of keys to write, traverse the list.
      // If we have a single key to write, simply write that key.
      while (true) {
        // Invariant: key,value,ikey will always be the next entry to write
        // kptr is only written through after it has been redirected to the
        // private copy in kstr below; the const_cast itself never mutates
        // the buffer behind 'key'.
        char* kptr = const_cast<char*>(key.data());
        std::string kstr;

        // Zeroing out the sequence number leads to better compression.
        // If this is the bottommost level (no files in lower levels)
        // and the earliest snapshot is larger than this seqno
        // then we can squash the seqno to zero.
        if (bottommost_level_ && ikey.sequence < earliest_snapshot_ &&
            ikey.type != kTypeMerge) {
          assert(ikey.type != kTypeDeletion);
          // make a copy because updating in place would cause problems
          // with the priority queue that is managing the input key iterator
          kstr.assign(key.data(), key.size());
          kptr = &kstr[0];
          UpdateInternalKey(kptr, key.size(), static_cast<uint64_t>(0),
                            ikey.type);
        }

        Slice newkey(kptr, key.size());
        assert((key.clear(), 1));  // we do not need 'key' anymore

        // Open output file if necessary
        if (compact_->builder == nullptr) {
          status = OpenCompactionOutputFile();
          if (!status.ok()) {
            break;
          }
        }

        // Maintain the key range and sequence-number range of the output
        // file as entries are appended.
        SequenceNumber seqno = GetInternalKeySeqno(newkey);
        if (compact_->builder->NumEntries() == 0) {
          compact_->current_output()->smallest.DecodeFrom(newkey);
          compact_->current_output()->smallest_seqno = seqno;
        } else {
          compact_->current_output()->smallest_seqno =
              std::min(compact_->current_output()->smallest_seqno, seqno);
        }
        compact_->current_output()->largest.DecodeFrom(newkey);
        compact_->builder->Add(newkey, value);
        compact_->num_output_records++;
        compact_->current_output()->largest_seqno =
            std::max(compact_->current_output()->largest_seqno, seqno);

        // Close output file if it is big enough
        if (compact_->builder->FileSize() >=
            compact_->compaction->MaxOutputFileSize()) {
          status = FinishCompactionOutputFile(input);
          if (!status.ok()) {
            break;
          }
        }

        // If we have a list of entries, move to next element
        // If we only had one entry, then break the loop.
        if (has_merge_list) {
          ++key_iter;
          ++value_iter;

          // If at end of list
          if (key_iter == keys->rend() || value_iter == values->rend()) {
            // Sanity Check: if one ends, then both end
            assert(key_iter == keys->rend() && value_iter == values->rend());
            break;
          }

          // Otherwise not at end of list. Update key, value, and ikey.
          key = Slice(*key_iter);
          value = Slice(*value_iter);
          ParseInternalKey(key, &ikey);

        } else {
          // Only had one item to begin with (Put/Delete)
          break;
        }
      }  // while (true)
    }    // if (!drop)

    // MergeUntil has moved input to the next entry
    if (!current_entry_is_merging) {
      input->Next();
    }
  }

  // Flush whatever is left in the locally batched counters.
  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
  if (key_drop_user > 0) {
    RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
  }
  if (key_drop_newer_entry > 0) {
    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry);
  }
  if (key_drop_obsolete > 0) {
    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
  }
  RecordCompactionIOStats();

  return status;
}
|
|
|
|
|
|
|
|
void CompactionJob::CallCompactionFilterV2(
|
2015-03-03 19:59:36 +01:00
|
|
|
CompactionFilterV2* compaction_filter_v2, uint64_t* time) {
|
2014-11-01 00:31:25 +01:00
|
|
|
if (compact_ == nullptr || compaction_filter_v2 == nullptr) {
|
|
|
|
return;
|
|
|
|
}
|
2015-03-13 18:45:40 +01:00
|
|
|
AutoThreadOperationStageUpdater stage_updater(
|
|
|
|
ThreadStatus::STAGE_COMPACTION_FILTER_V2);
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
// Assemble slice vectors for user keys and existing values.
|
|
|
|
// We also keep track of our parsed internal key structs because
|
|
|
|
// we may need to access the sequence number in the event that
|
|
|
|
// keys are garbage collected during the filter process.
|
|
|
|
std::vector<ParsedInternalKey> ikey_buf;
|
|
|
|
std::vector<Slice> user_key_buf;
|
|
|
|
std::vector<Slice> existing_value_buf;
|
|
|
|
|
|
|
|
for (const auto& key : compact_->key_str_buf_) {
|
|
|
|
ParsedInternalKey ikey;
|
|
|
|
ParseInternalKey(Slice(key), &ikey);
|
|
|
|
ikey_buf.emplace_back(ikey);
|
|
|
|
user_key_buf.emplace_back(ikey.user_key);
|
|
|
|
}
|
|
|
|
for (const auto& value : compact_->existing_value_str_buf_) {
|
|
|
|
existing_value_buf.emplace_back(Slice(value));
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the user has specified a compaction filter and the sequence
|
|
|
|
// number is greater than any external snapshot, then invoke the
|
|
|
|
// filter.
|
|
|
|
// If the return value of the compaction filter is true, replace
|
|
|
|
// the entry with a delete marker.
|
2015-03-03 19:59:36 +01:00
|
|
|
StopWatchNano timer(env_, stats_ != nullptr);
|
2014-11-01 00:31:25 +01:00
|
|
|
compact_->to_delete_buf_ = compaction_filter_v2->Filter(
|
|
|
|
compact_->compaction->level(), user_key_buf, existing_value_buf,
|
|
|
|
&compact_->new_value_buf_, &compact_->value_changed_buf_);
|
2015-03-03 19:59:36 +01:00
|
|
|
*time = timer.ElapsedNanos();
|
2014-11-01 00:31:25 +01:00
|
|
|
// new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all
|
|
|
|
// kv-pairs in this compaction run needs to be deleted.
|
|
|
|
assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size());
|
|
|
|
assert(compact_->to_delete_buf_.size() ==
|
|
|
|
compact_->existing_value_str_buf_.size());
|
2014-11-05 06:31:11 +01:00
|
|
|
assert(compact_->value_changed_buf_.empty() ||
|
|
|
|
compact_->to_delete_buf_.size() ==
|
2014-11-01 00:31:25 +01:00
|
|
|
compact_->value_changed_buf_.size());
|
|
|
|
|
|
|
|
int new_value_idx = 0;
|
|
|
|
for (unsigned int i = 0; i < compact_->to_delete_buf_.size(); ++i) {
|
|
|
|
if (compact_->to_delete_buf_[i]) {
|
|
|
|
// update the string buffer directly
|
|
|
|
// the Slice buffer points to the updated buffer
|
|
|
|
UpdateInternalKey(&compact_->key_str_buf_[i][0],
|
|
|
|
compact_->key_str_buf_[i].size(), ikey_buf[i].sequence,
|
|
|
|
kTypeDeletion);
|
|
|
|
|
|
|
|
// no value associated with delete
|
|
|
|
compact_->existing_value_str_buf_[i].clear();
|
|
|
|
RecordTick(stats_, COMPACTION_KEY_DROP_USER);
|
2014-11-05 06:31:11 +01:00
|
|
|
} else if (!compact_->value_changed_buf_.empty() &&
|
|
|
|
compact_->value_changed_buf_[i]) {
|
2014-11-01 00:31:25 +01:00
|
|
|
compact_->existing_value_str_buf_[i] =
|
|
|
|
compact_->new_value_buf_[new_value_idx++];
|
|
|
|
}
|
|
|
|
} // for
|
|
|
|
}
|
|
|
|
|
|
|
|
// Completes the output table currently being built: finishes (or abandons)
// the builder, records the file size, syncs and closes the file, and, when
// the file holds at least one entry, re-opens it through the table cache to
// check that it is usable.
//
// input  compaction input iterator; only its status() is consulted. A non-OK
//        status causes the builder to be abandoned instead of finished.
//
// Returns the first error encountered, or OK. compact_->builder and
// compact_->outfile are reset regardless of the outcome.
Status CompactionJob::FinishCompactionOutputFile(Iterator* input) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
  assert(compact_ != nullptr);
  assert(compact_->outfile);
  assert(compact_->builder != nullptr);

  const uint64_t output_number = compact_->current_output()->number;
  const uint32_t output_path_id = compact_->current_output()->path_id;
  assert(output_number != 0);

  // Check for iterator errors
  Status s = input->status();
  const uint64_t current_entries = compact_->builder->NumEntries();
  if (s.ok()) {
    s = compact_->builder->Finish();
  } else {
    compact_->builder->Abandon();
  }
  const uint64_t current_bytes = compact_->builder->FileSize();
  compact_->current_output()->file_size = current_bytes;
  compact_->total_bytes += current_bytes;
  compact_->builder.reset();

  // Finish and check for file errors
  if (s.ok() && !db_options_.disableDataSync) {
    // One stopwatch covers whichever sync flavour is configured.
    StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
    if (db_options_.use_fsync) {
      s = compact_->outfile->Fsync();
    } else {
      s = compact_->outfile->Sync();
    }
  }
  if (s.ok()) {
    s = compact_->outfile->Close();
  }
  compact_->outfile.reset();

  if (s.ok() && current_entries > 0) {
    // Verify that the table is usable
    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
    FileDescriptor fd(output_number, output_path_id, current_bytes);
    {
      // Scoped so the iterator is released before the result is logged.
      std::unique_ptr<Iterator> iter(cfd->table_cache()->NewIterator(
          ReadOptions(), env_options_, cfd->internal_comparator(), fd));
      s = iter->status();

      if (s.ok() && paranoid_file_checks_) {
        // Walk every entry so that any read error surfaces in status().
        for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        }
        s = iter->status();
      }
    }

    if (s.ok()) {
      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
          "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
          " keys, %" PRIu64 " bytes",
          cfd->GetName().c_str(), job_id_, output_number, current_entries,
          current_bytes);
      event_logger_->Log() << "job" << job_id_ << "event"
                           << "table_file_creation"
                           << "file_number" << output_number << "file_size"
                           << current_bytes;
    }
  }
  return s;
}
|
|
|
|
|
2015-05-06 04:01:12 +02:00
|
|
|
// Records the outcome of the compaction in its version edit (input files
// deleted, output files added) and applies that edit through
// versions_->LogAndApply.
//
// db_mutex            must be held by the caller (asserted).
// mutable_cf_options  forwarded unchanged to LogAndApply.
//
// Returns Corruption if the input files are no longer consistent with the
// current version; otherwise whatever LogAndApply returns.
Status CompactionJob::InstallCompactionResults(
    InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) {
  db_mutex->AssertHeld();

  auto* compaction = compact_->compaction;
  Compaction::InputLevelSummaryBuffer inputs_summary;

  // paranoia: verify that the files that we started with
  // still exist in the current version and in the same original level.
  // This ensures that a concurrent compaction did not erroneously
  // pick the same files to compact_.
  const bool inputs_consistent =
      versions_->VerifyCompactionFileConsistency(compaction);
  if (!inputs_consistent) {
    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
        "[%s] [JOB %d] Compaction %s aborted",
        compaction->column_family_data()->GetName().c_str(), job_id_,
        compaction->InputLevelSummary(&inputs_summary));
    return Status::Corruption("Compaction input files inconsistent");
  }

  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
      "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
      compaction->column_family_data()->GetName().c_str(), job_id_,
      compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);

  // Build the edit: drop every input file, then add every output file at the
  // compaction's output level.
  auto* edit = compaction->edit();
  compaction->AddInputDeletions(edit);
  for (const auto& out : compact_->outputs) {
    edit->AddFile(compaction->output_level(), out.number, out.path_id,
                  out.file_size, out.smallest, out.largest,
                  out.smallest_seqno, out.largest_seqno);
  }

  return versions_->LogAndApply(compaction->column_family_data(),
                                mutable_cf_options, edit, db_mutex,
                                db_directory_);
}
|
|
|
|
|
|
|
|
// Given a sequence number, return the sequence number of the
|
|
|
|
// earliest snapshot that this sequence number is visible in.
|
|
|
|
// The snapshots themselves are arranged in ascending order of
|
|
|
|
// sequence numbers.
|
|
|
|
// Employ a sequential search because the total number of
|
|
|
|
// snapshots are typically small.
|
|
|
|
inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot(
|
|
|
|
SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
|
|
|
|
SequenceNumber* prev_snapshot) {
|
2014-11-14 20:35:48 +01:00
|
|
|
assert(snapshots.size());
|
2014-11-01 00:31:25 +01:00
|
|
|
SequenceNumber prev __attribute__((unused)) = 0;
|
|
|
|
for (const auto cur : snapshots) {
|
|
|
|
assert(prev <= cur);
|
|
|
|
if (cur >= in) {
|
|
|
|
*prev_snapshot = prev;
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
prev = cur; // assignment
|
|
|
|
assert(prev);
|
|
|
|
}
|
2014-11-04 20:07:11 +01:00
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"CompactionJob is not able to find snapshot"
|
|
|
|
" with SeqId later than %" PRIu64
|
|
|
|
": current MaxSeqId is %" PRIu64 "",
|
|
|
|
in, snapshots[snapshots.size() - 1]);
|
2014-11-01 00:31:25 +01:00
|
|
|
assert(0);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void CompactionJob::RecordCompactionIOStats() {
|
|
|
|
RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
|
2015-05-07 07:50:35 +02:00
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
|
|
|
ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
|
2014-11-01 00:31:25 +01:00
|
|
|
IOSTATS_RESET(bytes_read);
|
|
|
|
RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
|
2015-05-07 07:50:35 +02:00
|
|
|
ThreadStatusUtil::IncreaseThreadOperationProperty(
|
|
|
|
ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
|
2014-11-01 00:31:25 +01:00
|
|
|
IOSTATS_RESET(bytes_written);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Opens the next output table file for the current compaction.
//
// Steps, in order:
//   1. obtain a new file number from the version set,
//   2. create the writable file at the compaction's output path,
//   3. append a fresh Output record for it to compact_->outputs,
//   4. set the file's I/O priority and preallocation block size,
//   5. create the table builder that writes into the file.
//
// Returns the status of NewWritableFile. On failure the error is logged and
// returned before any Output record or builder is created.
Status CompactionJob::OpenCompactionOutputFile() {
  assert(compact_ != nullptr);
  assert(compact_->builder == nullptr);

  // no need to lock because VersionSet::next_file_number_ is atomic
  const uint64_t file_number = versions_->NewFileNumber();

  auto* const compaction = compact_->compaction;
  const auto output_path_id = compaction->GetOutputPathId();

  // Make the output file
  const std::string fname =
      TableFileName(db_options_.db_paths, file_number, output_path_id);
  Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_);
  if (!s.ok()) {
    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
        "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
        " fails at NewWritableFile with status %s",
        compaction->column_family_data()->GetName().c_str(), job_id_,
        file_number, s.ToString().c_str());
    LogFlush(db_options_.info_log);
    return s;
  }

  // Register the new file with empty key bounds and zeroed sequence numbers.
  CompactionState::Output pending;
  pending.number = file_number;
  pending.path_id = output_path_id;
  pending.smallest.Clear();
  pending.largest.Clear();
  pending.smallest_seqno = 0;
  pending.largest_seqno = 0;
  compact_->outputs.push_back(pending);

  compact_->outfile->SetIOPriority(Env::IO_LOW);
  compact_->outfile->SetPreallocationBlockSize(
      static_cast<size_t>(compaction->OutputFilePreallocationSize()));

  ColumnFamilyData* cfd = compaction->column_family_data();

  // When the column family only optimizes filters for hits, filter creation
  // can be skipped if this is the bottommost level, which is where the data
  // is going to be found.
  const bool skip_filters =
      cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;

  compact_->builder.reset(NewTableBuilder(
      *cfd->ioptions(), cfd->internal_comparator(),
      cfd->int_tbl_prop_collector_factories(), compact_->outfile.get(),
      compaction->OutputCompressionType(), cfd->ioptions()->compression_opts,
      skip_filters));
  LogFlush(db_options_.info_log);
  return s;
}
|
|
|
|
|
2014-11-10 20:57:58 +01:00
|
|
|
void CompactionJob::CleanupCompaction(const Status& status) {
|
2014-11-01 00:31:25 +01:00
|
|
|
if (compact_->builder != nullptr) {
|
|
|
|
// May happen if we get a shutdown call in the middle of compaction
|
|
|
|
compact_->builder->Abandon();
|
|
|
|
compact_->builder.reset();
|
|
|
|
} else {
|
|
|
|
assert(!status.ok() || compact_->outfile == nullptr);
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < compact_->outputs.size(); i++) {
|
|
|
|
const CompactionState::Output& out = compact_->outputs[i];
|
|
|
|
|
|
|
|
// If this file was inserted into the table cache then remove
|
|
|
|
// them here because this compaction was not committed.
|
|
|
|
if (!status.ok()) {
|
|
|
|
TableCache::Evict(table_cache_.get(), out.number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
delete compact_;
|
|
|
|
compact_ = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|