2014-11-01 00:31:25 +01:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <atomic>
|
|
|
|
#include <deque>
|
|
|
|
#include <limits>
|
|
|
|
#include <set>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
|
|
|
#include <string>
|
|
|
|
#include <functional>
|
|
|
|
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/column_family.h"
|
|
|
|
#include "db/version_edit.h"
|
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
|
|
|
#include "rocksdb/compaction_filter.h"
|
2015-06-03 02:07:16 +02:00
|
|
|
#include "rocksdb/compaction_job_stats.h"
|
2014-11-01 00:31:25 +01:00
|
|
|
#include "rocksdb/transaction_log.h"
|
|
|
|
#include "util/autovector.h"
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
#include "util/event_logger.h"
|
2014-11-01 00:31:25 +01:00
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/thread_local.h"
|
|
|
|
#include "util/scoped_arena_iterator.h"
|
|
|
|
#include "db/internal_stats.h"
|
|
|
|
#include "db/write_controller.h"
|
|
|
|
#include "db/flush_scheduler.h"
|
|
|
|
#include "db/write_thread.h"
|
|
|
|
#include "db/job_context.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
class MemTable;
|
|
|
|
class TableCache;
|
|
|
|
class Version;
|
|
|
|
class VersionEdit;
|
|
|
|
class VersionSet;
|
|
|
|
class Arena;
|
|
|
|
|
|
|
|
class CompactionJob {
|
|
|
|
public:
|
2015-02-12 18:54:48 +01:00
|
|
|
CompactionJob(int job_id, Compaction* compaction, const DBOptions& db_options,
|
2014-11-01 00:31:25 +01:00
|
|
|
const EnvOptions& env_options, VersionSet* versions,
|
2014-11-08 00:44:12 +01:00
|
|
|
std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
|
2015-01-26 22:59:38 +01:00
|
|
|
Directory* db_directory, Directory* output_directory,
|
2015-05-06 04:01:12 +02:00
|
|
|
Statistics* stats,
|
|
|
|
std::vector<SequenceNumber> existing_snapshots,
|
|
|
|
std::shared_ptr<Cache> table_cache,
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
std::function<uint64_t()> yield_callback,
|
2015-06-02 23:12:23 +02:00
|
|
|
EventLogger* event_logger, bool paranoid_file_checks,
|
2015-06-03 02:07:16 +02:00
|
|
|
const std::string& dbname,
|
|
|
|
CompactionJobStats* compaction_job_stats);
|
2014-11-01 00:31:25 +01:00
|
|
|
|
2015-03-13 18:45:40 +01:00
|
|
|
~CompactionJob();
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
// no copy/move
|
|
|
|
CompactionJob(CompactionJob&& job) = delete;
|
|
|
|
CompactionJob(const CompactionJob& job) = delete;
|
|
|
|
CompactionJob& operator=(const CompactionJob& job) = delete;
|
|
|
|
|
|
|
|
// REQUIRED: mutex held
|
|
|
|
void Prepare();
|
|
|
|
// REQUIRED mutex not held
|
|
|
|
Status Run();
|
|
|
|
// REQUIRED: mutex held
|
|
|
|
// status is the return of Run()
|
2015-05-06 04:01:12 +02:00
|
|
|
void Install(Status* status, const MutableCFOptions& mutable_cf_options,
|
|
|
|
InstrumentedMutex* db_mutex);
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
private:
|
2015-05-07 07:50:35 +02:00
|
|
|
// update the thread status for starting a compaction.
|
|
|
|
void ReportStartedCompaction(Compaction* compaction);
|
2014-11-01 00:31:25 +01:00
|
|
|
void AllocateCompactionOutputFileNumbers();
|
2015-07-14 09:09:20 +02:00
|
|
|
// Processes batches of keys with the same prefixes. This is used for
|
|
|
|
// CompactionFilterV2.
|
|
|
|
Status ProcessPrefixBatches(ColumnFamilyData* cfd,
|
|
|
|
int64_t* imm_micros,
|
|
|
|
Iterator* input,
|
|
|
|
CompactionFilterV2* compaction_filter_v2);
|
2014-11-01 00:31:25 +01:00
|
|
|
// Call compaction filter if is_compaction_v2 is not true. Then iterate
|
|
|
|
// through input and compact the kv-pairs
|
|
|
|
Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input,
|
2014-11-01 02:36:07 +01:00
|
|
|
bool is_compaction_v2);
|
2015-07-15 18:55:45 +02:00
|
|
|
|
|
|
|
Status WriteKeyValue(const Slice& key, const Slice& value,
|
|
|
|
const ParsedInternalKey& ikey,
|
|
|
|
const Status& input_status);
|
2014-11-01 00:31:25 +01:00
|
|
|
// Call compaction_filter_v2->Filter() on kv-pairs in compact
|
2015-03-03 19:59:36 +01:00
|
|
|
void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2,
|
|
|
|
uint64_t* time);
|
2015-07-15 18:55:45 +02:00
|
|
|
Status FinishCompactionOutputFile(const Status& input_status);
|
2015-05-06 04:01:12 +02:00
|
|
|
Status InstallCompactionResults(InstrumentedMutex* db_mutex,
|
|
|
|
const MutableCFOptions& mutable_cf_options);
|
2014-11-01 00:31:25 +01:00
|
|
|
SequenceNumber findEarliestVisibleSnapshot(
|
|
|
|
SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
|
|
|
|
SequenceNumber* prev_snapshot);
|
|
|
|
void RecordCompactionIOStats();
|
|
|
|
Status OpenCompactionOutputFile();
|
2014-11-10 20:57:58 +01:00
|
|
|
void CleanupCompaction(const Status& status);
|
2015-06-03 02:07:16 +02:00
|
|
|
void UpdateCompactionJobStats(
|
|
|
|
const InternalStats::CompactionStats& stats) const;
|
|
|
|
void RecordDroppedKeys(int64_t* key_drop_user,
|
|
|
|
int64_t* key_drop_newer_entry,
|
|
|
|
int64_t* key_drop_obsolete);
|
2014-11-01 00:31:25 +01:00
|
|
|
|
2015-07-14 09:09:20 +02:00
|
|
|
void UpdateCompactionStats();
|
2015-06-18 08:40:34 +02:00
|
|
|
void UpdateCompactionInputStatsHelper(
|
|
|
|
int* num_files, uint64_t* bytes_read, int input_level);
|
|
|
|
|
2015-07-14 09:09:20 +02:00
|
|
|
void LogCompaction(ColumnFamilyData* cfd, Compaction* compaction);
|
|
|
|
|
2015-02-12 18:54:48 +01:00
|
|
|
int job_id_;
|
|
|
|
|
2014-11-01 00:31:25 +01:00
|
|
|
// CompactionJob state
|
|
|
|
struct CompactionState;
|
|
|
|
CompactionState* compact_;
|
2015-06-03 02:07:16 +02:00
|
|
|
CompactionJobStats* compaction_job_stats_;
|
2014-11-01 00:31:25 +01:00
|
|
|
|
|
|
|
bool bottommost_level_;
|
|
|
|
SequenceNumber earliest_snapshot_;
|
|
|
|
SequenceNumber visible_at_tip_;
|
|
|
|
SequenceNumber latest_snapshot_;
|
|
|
|
|
|
|
|
InternalStats::CompactionStats compaction_stats_;
|
|
|
|
|
|
|
|
// DBImpl state
|
2015-06-02 23:12:23 +02:00
|
|
|
const std::string& dbname_;
|
2014-11-01 00:31:25 +01:00
|
|
|
const DBOptions& db_options_;
|
|
|
|
const EnvOptions& env_options_;
|
|
|
|
Env* env_;
|
|
|
|
VersionSet* versions_;
|
|
|
|
std::atomic<bool>* shutting_down_;
|
|
|
|
LogBuffer* log_buffer_;
|
|
|
|
Directory* db_directory_;
|
2015-01-26 22:59:38 +01:00
|
|
|
Directory* output_directory_;
|
2014-11-01 00:31:25 +01:00
|
|
|
Statistics* stats_;
|
2015-05-06 04:01:12 +02:00
|
|
|
// If there were two snapshots with seq numbers s1 and
|
|
|
|
// s2 and s1 < s2, and if we find two instances of a key k1 then lies
|
|
|
|
// entirely within s1 and s2, then the earlier version of k1 can be safely
|
|
|
|
// deleted because that version is not visible in any snapshot.
|
|
|
|
std::vector<SequenceNumber> existing_snapshots_;
|
2014-11-01 00:31:25 +01:00
|
|
|
std::shared_ptr<Cache> table_cache_;
|
|
|
|
|
|
|
|
// yield callback
|
|
|
|
std::function<uint64_t()> yield_callback_;
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
2015-04-28 00:20:02 +02:00
|
|
|
|
|
|
|
EventLogger* event_logger_;
|
2015-05-06 04:01:12 +02:00
|
|
|
|
|
|
|
bool paranoid_file_checks_;
|
2014-11-01 00:31:25 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace rocksdb
|