rocksdb/include/rocksdb/listener.h
sdong e7c24168d8 Move old files to warm tier in FIFO compactions (#8310)
Summary:
Some FIFO users want to keep the data for longer, but the old data is rarely accessed. This feature allows users to configure FIFO compaction so that data older than a threshold is moved to a warm storage tier.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8310

Test Plan: Add several unit tests.

Reviewed By: ajkr

Differential Revision: D28493792

fbshipit-source-id: c14824ea634814dee5278b449ab5c98b6e0b5501
2021-08-09 12:51:14 -07:00

569 lines
22 KiB
C++

// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/compression_type.h"
#include "rocksdb/customizable.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/types.h"
namespace ROCKSDB_NAMESPACE {
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
TablePropertiesCollection;
class DB;
class ColumnFamilyHandle;
class Status;
struct CompactionJobStats;
struct TableFileCreationBriefInfo {
// the name of the database where the file was created
std::string db_name;
// the name of the column family where the file was created.
std::string cf_name;
// the path to the created file.
std::string file_path;
// the id of the job (which could be flush or compaction) that
// created the file.
int job_id;
// reason of creating the table.
TableFileCreationReason reason;
};
struct TableFileCreationInfo : public TableFileCreationBriefInfo {
TableFileCreationInfo() = default;
explicit TableFileCreationInfo(TableProperties&& prop)
: table_properties(prop) {}
// the size of the file.
uint64_t file_size;
// Detailed properties of the created file.
TableProperties table_properties;
// The status indicating whether the creation was successful or not.
Status status;
// The checksum of the table file being created
std::string file_checksum;
// The checksum function name of checksum generator used for this table file
std::string file_checksum_func_name;
};
enum class CompactionReason : int {
kUnknown = 0,
// [Level] number of L0 files > level0_file_num_compaction_trigger
kLevelL0FilesNum,
// [Level] total size of level > MaxBytesForLevel()
kLevelMaxLevelSize,
// [Universal] Compacting for size amplification
kUniversalSizeAmplification,
// [Universal] Compacting for size ratio
kUniversalSizeRatio,
// [Universal] number of sorted runs > level0_file_num_compaction_trigger
kUniversalSortedRunNum,
// [FIFO] total size > max_table_files_size
kFIFOMaxSize,
// [FIFO] reduce number of files.
kFIFOReduceNumFiles,
// [FIFO] files with creation time < (current_time - interval)
kFIFOTtl,
// Manual compaction
kManualCompaction,
// DB::SuggestCompactRange() marked files for compaction
kFilesMarkedForCompaction,
// [Level] Automatic compaction within bottommost level to cleanup duplicate
// versions of same user key, usually due to a released snapshot.
kBottommostFiles,
// Compaction based on TTL
kTtl,
// According to the comments in flush_job.cc, RocksDB treats flush as
// a level 0 compaction in internal stats.
kFlush,
// Compaction caused by external sst file ingestion
kExternalSstIngestion,
// Compaction due to SST file being too old
kPeriodicCompaction,
// Compaction in order to move files to temperature
kChangeTemperature,
// total number of compaction reasons, new reasons must be added above this.
kNumOfReasons,
};
enum class FlushReason : int {
kOthers = 0x00,
kGetLiveFiles = 0x01,
kShutDown = 0x02,
kExternalFileIngestion = 0x03,
kManualCompaction = 0x04,
kWriteBufferManager = 0x05,
kWriteBufferFull = 0x06,
kTest = 0x07,
kDeleteFiles = 0x08,
kAutoCompaction = 0x09,
kManualFlush = 0x0a,
kErrorRecovery = 0xb,
// When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable
// will not be called to avoid many small immutable memtables.
kErrorRecoveryRetryFlush = 0xc,
kWalFull = 0xd,
};
// TODO: In the future, BackgroundErrorReason will only be used to indicate
// why the BG Error is happening (e.g., flush, compaction). We may introduce
// other data structure to indicate other essential information such as
// the file type (e.g., Manifest, SST) and special context.
enum class BackgroundErrorReason {
kFlush,
kCompaction,
kWriteCallback,
kMemTable,
kManifestWrite,
kFlushNoWAL,
kManifestWriteNoWAL,
};
enum class WriteStallCondition {
kNormal,
kDelayed,
kStopped,
};
struct WriteStallInfo {
// the name of the column family
std::string cf_name;
// state of the write controller
struct {
WriteStallCondition cur;
WriteStallCondition prev;
} condition;
};
#ifndef ROCKSDB_LITE
struct TableFileDeletionInfo {
// The name of the database where the file was deleted.
std::string db_name;
// The path to the deleted file.
std::string file_path;
// The id of the job which deleted the file.
int job_id;
// The status indicating whether the deletion was successful or not.
Status status;
};
enum class FileOperationType {
kRead,
kWrite,
kTruncate,
kClose,
kFlush,
kSync,
kFsync,
kRangeSync
};
struct FileOperationInfo {
using Duration = std::chrono::nanoseconds;
using SteadyTimePoint =
std::chrono::time_point<std::chrono::steady_clock, Duration>;
using SystemTimePoint =
std::chrono::time_point<std::chrono::system_clock, Duration>;
using StartTimePoint = std::pair<SystemTimePoint, SteadyTimePoint>;
using FinishTimePoint = SteadyTimePoint;
FileOperationType type;
const std::string& path;
uint64_t offset;
size_t length;
const Duration duration;
const SystemTimePoint& start_ts;
Status status;
FileOperationInfo(const FileOperationType _type, const std::string& _path,
const StartTimePoint& _start_ts,
const FinishTimePoint& _finish_ts, const Status& _status)
: type(_type),
path(_path),
duration(std::chrono::duration_cast<std::chrono::nanoseconds>(
_finish_ts - _start_ts.second)),
start_ts(_start_ts.first),
status(_status) {}
static StartTimePoint StartNow() {
return std::make_pair<SystemTimePoint, SteadyTimePoint>(
std::chrono::system_clock::now(), std::chrono::steady_clock::now());
}
static FinishTimePoint FinishNow() {
return std::chrono::steady_clock::now();
}
};
struct FlushJobInfo {
// the id of the column family
uint32_t cf_id;
// the name of the column family
std::string cf_name;
// the path to the newly created file
std::string file_path;
// the file number of the newly created file
uint64_t file_number;
// the oldest blob file referenced by the newly created file
uint64_t oldest_blob_file_number;
// the id of the thread that completed this flush job.
uint64_t thread_id;
// the job id, which is unique in the same thread.
int job_id;
// If true, then rocksdb is currently slowing-down all writes to prevent
// creating too many Level 0 files as compaction seems not able to
// catch up the write request speed. This indicates that there are
// too many files in Level 0.
bool triggered_writes_slowdown;
// If true, then rocksdb is currently blocking any writes to prevent
// creating more L0 files. This indicates that there are too many
// files in level 0. Compactions should try to compact L0 files down
// to lower levels as soon as possible.
bool triggered_writes_stop;
// The smallest sequence number in the newly created file
SequenceNumber smallest_seqno;
// The largest sequence number in the newly created file
SequenceNumber largest_seqno;
// Table properties of the table being flushed
TableProperties table_properties;
FlushReason flush_reason;
};
struct CompactionFileInfo {
// The level of the file.
int level;
// The file number of the file.
uint64_t file_number;
// The file number of the oldest blob file this SST file references.
uint64_t oldest_blob_file_number;
};
struct CompactionJobInfo {
~CompactionJobInfo() { status.PermitUncheckedError(); }
// the id of the column family where the compaction happened.
uint32_t cf_id;
// the name of the column family where the compaction happened.
std::string cf_name;
// the status indicating whether the compaction was successful or not.
Status status;
// the id of the thread that completed this compaction job.
uint64_t thread_id;
// the job id, which is unique in the same thread.
int job_id;
// the smallest input level of the compaction.
int base_input_level;
// the output level of the compaction.
int output_level;
// The following variables contain information about compaction inputs
// and outputs. A file may appear in both the input and output lists
// if it was simply moved to a different level. The order of elements
// is the same across input_files and input_file_infos; similarly, it is
// the same across output_files and output_file_infos.
// The names of the compaction input files.
std::vector<std::string> input_files;
// Additional information about the compaction input files.
std::vector<CompactionFileInfo> input_file_infos;
// The names of the compaction output files.
std::vector<std::string> output_files;
// Additional information about the compaction output files.
std::vector<CompactionFileInfo> output_file_infos;
// Table properties for input and output tables.
// The map is keyed by values from input_files and output_files.
TablePropertiesCollection table_properties;
// Reason to run the compaction
CompactionReason compaction_reason;
// Compression algorithm used for output files
CompressionType compression;
// Statistics and other additional details on the compaction
CompactionJobStats stats;
};
struct MemTableInfo {
// the name of the column family to which memtable belongs
std::string cf_name;
// Sequence number of the first element that was inserted
// into the memtable.
SequenceNumber first_seqno;
// Sequence number that is guaranteed to be smaller than or equal
// to the sequence number of any key that could be inserted into this
// memtable. It can then be assumed that any write with a larger(or equal)
// sequence number will be present in this memtable or a later memtable.
SequenceNumber earliest_seqno;
// Total number of entries in memtable
uint64_t num_entries;
// Total number of deletes in memtable
uint64_t num_deletes;
};
struct ExternalFileIngestionInfo {
// the name of the column family
std::string cf_name;
// Path of the file outside the DB
std::string external_file_path;
// Path of the file inside the DB
std::string internal_file_path;
// The global sequence number assigned to keys in this file
SequenceNumber global_seqno;
// Table properties of the table being flushed
TableProperties table_properties;
};
// EventListener class contains a set of callback functions that will
// be called when specific RocksDB event happens such as flush. It can
// be used as a building block for developing custom features such as
// stats-collector or external compaction algorithm.
//
// IMPORTANT
// Because compaction is needed to resolve a "writes stopped" condition,
// calling or waiting for any blocking DB write function (no_slowdown=false)
// from a compaction-related listener callback can hang RocksDB. For DB
// writes from a callback we recommend a WriteBatch and no_slowdown=true,
// because the WriteBatch can accumulate writes for later in case DB::Write
// returns Status::Incomplete. Similarly, calling CompactRange or similar
// could hang by waiting for a background worker that is occupied until the
// callback returns.
//
// Otherwise, callback functions should not run for an extended period of
// time before the function returns, because this will slow RocksDB.
//
// [Threading] All EventListener callback will be called using the
// actual thread that involves in that specific event. For example, it
// is the RocksDB background flush thread that does the actual flush to
// call EventListener::OnFlushCompleted().
//
// [Locking] All EventListener callbacks are designed to be called without
// the current thread holding any DB mutex. This is to prevent potential
// deadlock and performance issue when using EventListener callback
// in a complex way.
class EventListener : public Customizable {
public:
static const char* Type() { return "EventListener"; }
static Status CreateFromString(const ConfigOptions& options,
const std::string& id,
std::shared_ptr<EventListener>* result);
const char* Name() const override {
// Since EventListeners did not have a name previously, we will assume
// an empty name. Instances should override this method.
return "";
}
// A callback function to RocksDB which will be called whenever a
// registered RocksDB flushes a file. The default implementation is
// no-op.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
virtual void OnFlushCompleted(DB* /*db*/,
const FlushJobInfo& /*flush_job_info*/) {}
// A callback function to RocksDB which will be called before a
// RocksDB starts to flush memtables. The default implementation is
// no-op.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
virtual void OnFlushBegin(DB* /*db*/,
const FlushJobInfo& /*flush_job_info*/) {}
// A callback function for RocksDB which will be called whenever
// a SST file is deleted. Different from OnCompactionCompleted and
// OnFlushCompleted, this callback is designed for external logging
// service and thus only provide string parameters instead
// of a pointer to DB. Applications that build logic basic based
// on file creations and deletions is suggested to implement
// OnFlushCompleted and OnCompactionCompleted.
//
// Note that if applications would like to use the passed reference
// outside this function call, they should make copies from the
// returned value.
virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {}
// A callback function to RocksDB which will be called before a
// RocksDB starts to compact. The default implementation is
// no-op.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {}
// A callback function for RocksDB which will be called whenever
// a registered RocksDB compacts a file. The default implementation
// is a no-op.
//
// Note that this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
//
// @param db a pointer to the rocksdb instance which just compacted
// a file.
// @param ci a reference to a CompactionJobInfo struct. 'ci' is released
// after this function is returned, and must be copied if it is needed
// outside of this function.
virtual void OnCompactionCompleted(DB* /*db*/,
const CompactionJobInfo& /*ci*/) {}
// A callback function for RocksDB which will be called whenever
// a SST file is created. Different from OnCompactionCompleted and
// OnFlushCompleted, this callback is designed for external logging
// service and thus only provide string parameters instead
// of a pointer to DB. Applications that build logic basic based
// on file creations and deletions is suggested to implement
// OnFlushCompleted and OnCompactionCompleted.
//
// Historically it will only be called if the file is successfully created.
// Now it will also be called on failure case. User can check info.status
// to see if it succeeded or not.
//
// Note that if applications would like to use the passed reference
// outside this function call, they should make copies from these
// returned value.
virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {}
// A callback function for RocksDB which will be called before
// a SST file is being created. It will follow by OnTableFileCreated after
// the creation finishes.
//
// Note that if applications would like to use the passed reference
// outside this function call, they should make copies from these
// returned value.
virtual void OnTableFileCreationStarted(
const TableFileCreationBriefInfo& /*info*/) {}
// A callback function for RocksDB which will be called before
// a memtable is made immutable.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
//
// Note that if applications would like to use the passed reference
// outside this function call, they should make copies from these
// returned value.
virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {}
// A callback function for RocksDB which will be called before
// a column family handle is deleted.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
// @param handle is a pointer to the column family handle to be deleted
// which will become a dangling pointer after the deletion.
virtual void OnColumnFamilyHandleDeletionStarted(
ColumnFamilyHandle* /*handle*/) {}
// A callback function for RocksDB which will be called after an external
// file is ingested using IngestExternalFile.
//
// Note that the this function will run on the same thread as
// IngestExternalFile(), if this function is blocked, IngestExternalFile()
// will be blocked from finishing.
virtual void OnExternalFileIngested(
DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {}
// A callback function for RocksDB which will be called before setting the
// background error status to a non-OK value. The new background error status
// is provided in `bg_error` and can be modified by the callback. E.g., a
// callback can suppress errors by resetting it to Status::OK(), thus
// preventing the database from entering read-only mode. We do not provide any
// guarantee when failed flushes/compactions will be rescheduled if the user
// suppresses an error.
//
// Note that this function can run on the same threads as flush, compaction,
// and user writes. So, it is extremely important not to perform heavy
// computations or blocking calls in this function.
virtual void OnBackgroundError(BackgroundErrorReason /* reason */,
Status* /* bg_error */) {}
// A callback function for RocksDB which will be called whenever a change
// of superversion triggers a change of the stall conditions.
//
// Note that the this function must be implemented in a way such that
// it should not run for an extended period of time before the function
// returns. Otherwise, RocksDB may be blocked.
virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {}
// A callback function for RocksDB which will be called whenever a file read
// operation finishes.
virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file write
// operation finishes.
virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file flush
// operation finishes.
virtual void OnFileFlushFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file sync
// operation finishes.
virtual void OnFileSyncFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file
// rangeSync operation finishes.
virtual void OnFileRangeSyncFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file
// truncate operation finishes.
virtual void OnFileTruncateFinish(const FileOperationInfo& /* info */) {}
// A callback function for RocksDB which will be called whenever a file close
// operation finishes.
virtual void OnFileCloseFinish(const FileOperationInfo& /* info */) {}
// If true, the OnFile*Finish functions will be called. If
// false, then they won't be called.
virtual bool ShouldBeNotifiedOnFileIO() { return false; }
// A callback function for RocksDB which will be called just before
// starting the automatic recovery process for recoverable background
// errors, such as NoSpace(). The callback can suppress the automatic
// recovery by setting *auto_recovery to false. The database will then
// have to be transitioned out of read-only mode by calling DB::Resume()
virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */,
Status /* bg_error */,
bool* /* auto_recovery */) {}
// A callback function for RocksDB which will be called once the database
// is recovered from read-only mode after an error. When this is called, it
// means normal writes to the database can be issued and the user can
// initiate any further recovery actions needed
virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {}
virtual ~EventListener() {}
};
#else
class EventListener {};
struct FlushJobInfo {};
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE