2014-01-15 16:22:34 -08:00
|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
#include "db/version_set.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
class Version;
|
2014-01-31 16:45:20 -08:00
|
|
|
class ColumnFamilyData;
|
2014-01-15 16:22:34 -08:00
|
|
|
|
|
|
|
// A Compaction encapsulates information about a compaction.
|
|
|
|
class Compaction {
|
|
|
|
public:
|
|
|
|
~Compaction();
|
|
|
|
|
|
|
|
// Return the level that is being compacted. Inputs from "level"
|
|
|
|
// will be merged.
|
|
|
|
int level() const { return level_; }
|
|
|
|
|
|
|
|
// Outputs will go to this level
|
|
|
|
int output_level() const { return out_level_; }
|
|
|
|
|
|
|
|
// Return the object that holds the edits to the descriptor done
|
|
|
|
// by this compaction.
|
|
|
|
VersionEdit* edit() { return edit_; }
|
|
|
|
|
|
|
|
// "which" must be either 0 or 1
|
|
|
|
int num_input_files(int which) const { return inputs_[which].size(); }
|
|
|
|
|
2014-01-22 10:55:16 -08:00
|
|
|
// Returns input version of the compaction
|
|
|
|
Version* input_version() const { return input_version_; }
|
|
|
|
|
2014-01-31 16:45:20 -08:00
|
|
|
ColumnFamilyData* column_family_data() const { return cfd_; }
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
|
|
|
|
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
|
|
|
|
|
2014-01-22 10:55:16 -08:00
|
|
|
std::vector<FileMetaData*>* inputs(int which) { return &inputs_[which]; }
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
// Maximum size of files to build during this compaction.
|
|
|
|
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
|
|
|
|
|
2014-07-02 20:40:57 +02:00
|
|
|
// What compression for output
|
|
|
|
CompressionType OutputCompressionType() const { return output_compression_; }
|
2014-01-15 16:22:34 -08:00
|
|
|
|
2014-07-02 09:54:20 -07:00
|
|
|
// Whether need to write output file to second DB path.
|
|
|
|
uint32_t GetOutputPathId() const { return output_path_id_; }
|
|
|
|
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-09 22:14:39 -07:00
|
|
|
// Is this a trivial compaction that can be implemented by just
|
2014-01-15 16:22:34 -08:00
|
|
|
// moving a single input file to the next level (no merging or splitting)
|
|
|
|
bool IsTrivialMove() const;
|
|
|
|
|
2014-05-21 11:43:35 -07:00
|
|
|
// If true, just delete all files in inputs_[0]
|
|
|
|
bool IsDeletionCompaction() const;
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
// Add all inputs to this compaction as delete operations to *edit.
|
|
|
|
void AddInputDeletions(VersionEdit* edit);
|
|
|
|
|
|
|
|
// Returns true if the information we have available guarantees that
|
|
|
|
// the compaction is producing data in "level+1" for which no data exists
|
|
|
|
// in levels greater than "level+1".
|
|
|
|
bool IsBaseLevelForKey(const Slice& user_key);
|
|
|
|
|
|
|
|
// Returns true iff we should stop building the current output
|
|
|
|
// before processing "internal_key".
|
|
|
|
bool ShouldStopBefore(const Slice& internal_key);
|
|
|
|
|
|
|
|
// Release the input version for the compaction, once the compaction
|
|
|
|
// is successful.
|
|
|
|
void ReleaseInputs();
|
|
|
|
|
2014-01-31 16:45:20 -08:00
|
|
|
// Clear all files to indicate that they are not being compacted
|
|
|
|
// Delete this compaction from the list of running compactions.
|
|
|
|
void ReleaseCompactionFiles(Status status);
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
void Summary(char* output, int len);
|
|
|
|
|
|
|
|
// Return the score that was used to pick this compaction run.
|
|
|
|
double score() const { return score_; }
|
|
|
|
|
|
|
|
// Is this compaction creating a file in the bottom most level?
|
|
|
|
bool BottomMostLevel() { return bottommost_level_; }
|
|
|
|
|
|
|
|
// Does this compaction include all sst files?
|
|
|
|
bool IsFullCompaction() { return is_full_compaction_; }
|
|
|
|
|
2014-02-12 12:24:18 -08:00
|
|
|
// Was this compaction triggered manually by the client?
|
|
|
|
bool IsManualCompaction() { return is_manual_compaction_; }
|
|
|
|
|
2014-06-05 13:19:35 -07:00
|
|
|
// Returns a number of byte that the output file should be preallocated to
|
|
|
|
// In level compaction, that is max_file_size_. In universal compaction, that
|
|
|
|
// is the sum of all input file sizes
|
|
|
|
uint64_t OutputFilePreallocationSize();
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
private:
|
CompactionPicker
Summary:
This is a big one. This diff moves all the code related to picking compactions from VersionSet to new class CompactionPicker. Column families' compactions will be completely separate processes, so we need to have multiple CompactionPickers.
To make this easier to review, most of the code change is just copy/paste. There is also a small change not to use VersionSet::current_, but rather to take `Version* version` as a parameter. Most of the other code is exactly the same.
In future diffs, I will also make some improvements to CompactionPickers. I think the most important part will be encapsulating it better. Currently Version, VersionSet, Compaction and CompactionPicker are all friend classes, which makes it harder to change the implementation.
This diff depends on D15171, D15183, D15189 and D15201
Test Plan: `make check`
Reviewers: kailiu, sdong, dhruba, haobo
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15207
2014-01-16 13:03:52 -08:00
|
|
|
friend class CompactionPicker;
|
|
|
|
friend class UniversalCompactionPicker;
|
2014-05-21 11:43:35 -07:00
|
|
|
friend class FIFOCompactionPicker;
|
CompactionPicker
Summary:
This is a big one. This diff moves all the code related to picking compactions from VersionSet to new class CompactionPicker. Column families' compactions will be completely separate processes, so we need to have multiple CompactionPickers.
To make this easier to review, most of the code change is just copy/paste. There is also a small change not to use VersionSet::current_, but rather to take `Version* version` as a parameter. Most of the other code is exactly the same.
In future diffs, I will also make some improvements to CompactionPickers. I think the most important part will be encapsulating it better. Currently Version, VersionSet, Compaction and CompactionPicker are all friend classes, which makes it harder to change the implementation.
This diff depends on D15171, D15183, D15189 and D15201
Test Plan: `make check`
Reviewers: kailiu, sdong, dhruba, haobo
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15207
2014-01-16 13:03:52 -08:00
|
|
|
friend class LevelCompactionPicker;
|
2014-01-15 16:22:34 -08:00
|
|
|
|
|
|
|
Compaction(Version* input_version, int level, int out_level,
|
|
|
|
uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
|
2014-07-02 09:54:20 -07:00
|
|
|
uint32_t output_path_id, CompressionType output_compression,
|
|
|
|
bool seek_compaction = false, bool deletion_compaction = false);
|
2014-01-15 16:22:34 -08:00
|
|
|
|
|
|
|
int level_;
|
|
|
|
int out_level_; // levels to which output files are stored
|
|
|
|
uint64_t max_output_file_size_;
|
2014-01-22 10:55:16 -08:00
|
|
|
uint64_t max_grandparent_overlap_bytes_;
|
2014-01-15 16:22:34 -08:00
|
|
|
Version* input_version_;
|
|
|
|
VersionEdit* edit_;
|
|
|
|
int number_levels_;
|
2014-01-31 16:45:20 -08:00
|
|
|
ColumnFamilyData* cfd_;
|
2014-01-15 16:22:34 -08:00
|
|
|
|
2014-07-02 09:54:20 -07:00
|
|
|
uint32_t output_path_id_;
|
2014-07-02 20:40:57 +02:00
|
|
|
CompressionType output_compression_;
|
2014-01-15 16:22:34 -08:00
|
|
|
bool seek_compaction_;
|
2014-05-21 11:43:35 -07:00
|
|
|
// if true, just delete files in inputs_[0]
|
|
|
|
bool deletion_compaction_;
|
2014-01-15 16:22:34 -08:00
|
|
|
|
|
|
|
// Each compaction reads inputs from "level_" and "level_+1"
|
|
|
|
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
|
|
|
|
|
|
|
// State used to check for number of of overlapping grandparent files
|
|
|
|
// (parent == level_ + 1, grandparent == level_ + 2)
|
|
|
|
std::vector<FileMetaData*> grandparents_;
|
|
|
|
size_t grandparent_index_; // Index in grandparent_starts_
|
|
|
|
bool seen_key_; // Some output key has been seen
|
|
|
|
uint64_t overlapped_bytes_; // Bytes of overlap between current output
|
|
|
|
// and grandparent files
|
|
|
|
int base_index_; // index of the file in files_[level_]
|
|
|
|
int parent_index_; // index of some file with same range in files_[level_+1]
|
|
|
|
double score_; // score that was used to pick this compaction.
|
|
|
|
|
|
|
|
// Is this compaction creating a file in the bottom most level?
|
|
|
|
bool bottommost_level_;
|
|
|
|
// Does this compaction include all sst files?
|
|
|
|
bool is_full_compaction_;
|
|
|
|
|
2014-02-12 12:24:18 -08:00
|
|
|
// Is this compaction requested by the client?
|
|
|
|
bool is_manual_compaction_;
|
|
|
|
|
2014-01-15 16:22:34 -08:00
|
|
|
// level_ptrs_ holds indices into input_version_->levels_: our state
|
|
|
|
// is that we are positioned at one of the file ranges for each
|
|
|
|
// higher level than the ones involved in this compaction (i.e. for
|
|
|
|
// all L >= level_ + 2).
|
|
|
|
std::vector<size_t> level_ptrs_;
|
|
|
|
|
|
|
|
// mark (or clear) all files that are being compacted
|
|
|
|
void MarkFilesBeingCompacted(bool);
|
|
|
|
|
|
|
|
// Initialize whether compaction producing files at the bottommost level
|
|
|
|
void SetupBottomMostLevel(bool isManual);
|
|
|
|
|
|
|
|
// In case of compaction error, reset the nextIndex that is used
|
|
|
|
// to pick up the next file to be compacted from files_by_size_
|
|
|
|
void ResetNextCompactionIndex();
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace rocksdb
|