3dafdfb2c4
Summary: On some filesystems, pre-allocation can be a considerable amount of space. xfs in our production environment pre-allocates by 1GB, for instance. By using fallocate to inform the kernel of our expected file sizes, we eliminate this wasteage (that isn't recovered until the file is closed which, in the case of LOG files, can be a considerable amount of time). Test Plan: created an xfs loopback filesystem, mounted with allocsize=4M, and ran db_stress. LOG file without this change was 4M, and with it it was 128k then grew to normal size. Reviewers: dhruba Reviewed By: dhruba CC: adsharma, leveldb Differential Revision: https://reviews.facebook.net/D7953
567 lines
20 KiB
C++
567 lines
20 KiB
C++
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
// The representation of a DBImpl consists of a set of Versions. The
|
|
// newest version is called "current". Older versions may be kept
|
|
// around to provide a consistent view to live iterators.
|
|
//
|
|
// Each Version keeps track of a set of Table files per level. The
|
|
// entire set of versions is maintained in a VersionSet.
|
|
//
|
|
// Version,VersionSet are thread-compatible, but require external
|
|
// synchronization on all accesses.
|
|
|
|
#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
|
|
#define STORAGE_LEVELDB_DB_VERSION_SET_H_
|
|
|
|
#include <map>
|
|
#include <memory>
|
|
#include <set>
|
|
#include <vector>
|
|
#include <deque>
|
|
#include "db/dbformat.h"
|
|
#include "db/version_edit.h"
|
|
#include "port/port.h"
|
|
#include "db/table_cache.h"
|
|
|
|
namespace leveldb {
|
|
|
|
namespace log { class Writer; }
|
|
|
|
class Compaction;
|
|
class Iterator;
|
|
class MemTable;
|
|
class TableBuilder;
|
|
class TableCache;
|
|
class Version;
|
|
class VersionSet;
|
|
class WritableFile;
|
|
|
|
// Return the smallest index i such that files[i]->largest >= key.
|
|
// Return files.size() if there is no such file.
|
|
// REQUIRES: "files" contains a sorted list of non-overlapping files.
|
|
extern int FindFile(const InternalKeyComparator& icmp,
|
|
const std::vector<FileMetaData*>& files,
|
|
const Slice& key);
|
|
|
|
// Returns true iff some file in "files" overlaps the user key range
|
|
// [*smallest,*largest].
|
|
// smallest==NULL represents a key smaller than all keys in the DB.
|
|
// largest==NULL represents a key largest than all keys in the DB.
|
|
// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
|
|
// in sorted order.
|
|
extern bool SomeFileOverlapsRange(
|
|
const InternalKeyComparator& icmp,
|
|
bool disjoint_sorted_files,
|
|
const std::vector<FileMetaData*>& files,
|
|
const Slice* smallest_user_key,
|
|
const Slice* largest_user_key);
|
|
|
|
class Version {
|
|
public:
|
|
// Append to *iters a sequence of iterators that will
|
|
// yield the contents of this Version when merged together.
|
|
// REQUIRES: This version has been saved (see VersionSet::SaveTo)
|
|
void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
|
|
|
|
// Lookup the value for key. If found, store it in *val and
|
|
// return OK. Else return a non-OK status. Fills *stats.
|
|
// REQUIRES: lock is not held
|
|
struct GetStats {
|
|
FileMetaData* seek_file;
|
|
int seek_file_level;
|
|
};
|
|
Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
|
|
GetStats* stats);
|
|
|
|
// Adds "stats" into the current state. Returns true if a new
|
|
// compaction may need to be triggered, false otherwise.
|
|
// REQUIRES: lock is held
|
|
bool UpdateStats(const GetStats& stats);
|
|
|
|
// Reference count management (so Versions do not disappear out from
|
|
// under live iterators)
|
|
void Ref();
|
|
void Unref();
|
|
|
|
void GetOverlappingInputs(
|
|
int level,
|
|
const InternalKey* begin, // NULL means before all keys
|
|
const InternalKey* end, // NULL means after all keys
|
|
std::vector<FileMetaData*>* inputs,
|
|
int hint_index = -1, // index of overlap file
|
|
int* file_index = NULL); // return index of overlap file
|
|
|
|
void GetOverlappingInputsBinarySearch(
|
|
int level,
|
|
const Slice& begin, // NULL means before all keys
|
|
const Slice& end, // NULL means after all keys
|
|
std::vector<FileMetaData*>* inputs,
|
|
int hint_index, // index of overlap file
|
|
int* file_index); // return index of overlap file
|
|
|
|
void ExtendOverlappingInputs(
|
|
int level,
|
|
const Slice& begin, // NULL means before all keys
|
|
const Slice& end, // NULL means after all keys
|
|
std::vector<FileMetaData*>* inputs,
|
|
int index); // start extending from this index
|
|
|
|
// Returns true iff some file in the specified level overlaps
|
|
// some part of [*smallest_user_key,*largest_user_key].
|
|
// smallest_user_key==NULL represents a key smaller than all keys in the DB.
|
|
// largest_user_key==NULL represents a key largest than all keys in the DB.
|
|
bool OverlapInLevel(int level,
|
|
const Slice* smallest_user_key,
|
|
const Slice* largest_user_key);
|
|
|
|
// Return the level at which we should place a new memtable compaction
|
|
// result that covers the range [smallest_user_key,largest_user_key].
|
|
int PickLevelForMemTableOutput(const Slice& smallest_user_key,
|
|
const Slice& largest_user_key);
|
|
|
|
int NumFiles(int level) const { return files_[level].size(); }
|
|
|
|
// Return a human readable string that describes this version's contents.
|
|
std::string DebugString(bool hex = false) const;
|
|
|
|
// Returns the version nuber of this version
|
|
uint64_t GetVersionNumber() {
|
|
return version_number_;
|
|
}
|
|
|
|
private:
|
|
friend class Compaction;
|
|
friend class VersionSet;
|
|
|
|
class LevelFileNumIterator;
|
|
Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
|
|
|
|
VersionSet* vset_; // VersionSet to which this Version belongs
|
|
Version* next_; // Next version in linked list
|
|
Version* prev_; // Previous version in linked list
|
|
int refs_; // Number of live refs to this version
|
|
|
|
// List of files per level, files in each level are arranged
|
|
// in increasing order of keys
|
|
std::vector<FileMetaData*>* files_;
|
|
|
|
// A list for the same set of files that are stored in files_,
|
|
// but files in each level are now sorted based on file
|
|
// size. The file with the largest size is at the front.
|
|
// This vector stores the index of the file from files_.
|
|
std::vector< std::vector<int> > files_by_size_;
|
|
|
|
// An index into files_by_size_ that specifies the first
|
|
// file that is not yet compacted
|
|
std::vector<int> next_file_to_compact_by_size_;
|
|
|
|
// Only the first few entries of files_by_size_ are sorted.
|
|
// There is no need to sort all the files because it is likely
|
|
// that on a running system, we need to look at only the first
|
|
// few largest files because a new version is created every few
|
|
// seconds/minutes (because of concurrent compactions).
|
|
static const int number_of_files_to_sort_ = 50;
|
|
|
|
// Next file to compact based on seek stats.
|
|
FileMetaData* file_to_compact_;
|
|
int file_to_compact_level_;
|
|
|
|
// Level that should be compacted next and its compaction score.
|
|
// Score < 1 means compaction is not strictly needed. These fields
|
|
// are initialized by Finalize().
|
|
// The most critical level to be compacted is listed first
|
|
// These are used to pick the best compaction level
|
|
std::vector<double> compaction_score_;
|
|
std::vector<int> compaction_level_;
|
|
double max_compaction_score_; // max score in l1 to ln-1
|
|
|
|
// The offset in the manifest file where this version is stored.
|
|
uint64_t offset_manifest_file_;
|
|
|
|
// A version number that uniquely represents this version. This is
|
|
// used for debugging and logging purposes only.
|
|
uint64_t version_number_;
|
|
|
|
explicit Version(VersionSet* vset, uint64_t version_number = 0);
|
|
|
|
~Version();
|
|
|
|
// re-initializes the index that is used to offset into files_by_size_
|
|
// to find the next compaction candidate file.
|
|
void ResetNextCompactionIndex(int level) {
|
|
next_file_to_compact_by_size_[level] = 0;
|
|
}
|
|
|
|
// No copying allowed
|
|
Version(const Version&);
|
|
void operator=(const Version&);
|
|
};
|
|
|
|
class VersionSet {
|
|
public:
|
|
VersionSet(const std::string& dbname,
|
|
const Options* options,
|
|
TableCache* table_cache,
|
|
const InternalKeyComparator*);
|
|
~VersionSet();
|
|
|
|
// Apply *edit to the current version to form a new descriptor that
|
|
// is both saved to persistent state and installed as the new
|
|
// current version. Will release *mu while actually writing to the file.
|
|
// REQUIRES: *mu is held on entry.
|
|
// REQUIRES: no other thread concurrently calls LogAndApply()
|
|
Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
|
|
bool new_descriptor_log = false);
|
|
|
|
// Recover the last saved descriptor from persistent storage.
|
|
Status Recover();
|
|
|
|
// Try to reduce the number of levels. This call is valid when
|
|
// only one level from the new max level to the old
|
|
// max level containing files.
|
|
// For example, a db currently has 7 levels [0-6], and a call to
|
|
// to reduce to 5 [0-4] can only be executed when only one level
|
|
// among [4-6] contains files.
|
|
Status ReduceNumberOfLevels(int new_levels, port::Mutex* mu);
|
|
|
|
// Return the current version.
|
|
Version* current() const { return current_; }
|
|
|
|
// Return the current manifest file number
|
|
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
|
|
|
|
// Allocate and return a new file number
|
|
uint64_t NewFileNumber() { return next_file_number_++; }
|
|
|
|
// Arrange to reuse "file_number" unless a newer file number has
|
|
// already been allocated.
|
|
// REQUIRES: "file_number" was returned by a call to NewFileNumber().
|
|
void ReuseFileNumber(uint64_t file_number) {
|
|
if (next_file_number_ == file_number + 1) {
|
|
next_file_number_ = file_number;
|
|
}
|
|
}
|
|
|
|
// Return the number of Table files at the specified level.
|
|
int NumLevelFiles(int level) const;
|
|
|
|
// Return the combined file size of all files at the specified level.
|
|
int64_t NumLevelBytes(int level) const;
|
|
|
|
// Return the last sequence number.
|
|
uint64_t LastSequence() const { return last_sequence_; }
|
|
|
|
// Set the last sequence number to s.
|
|
void SetLastSequence(uint64_t s) {
|
|
assert(s >= last_sequence_);
|
|
last_sequence_ = s;
|
|
}
|
|
|
|
// Mark the specified file number as used.
|
|
void MarkFileNumberUsed(uint64_t number);
|
|
|
|
// Return the current log file number.
|
|
uint64_t LogNumber() const { return log_number_; }
|
|
|
|
// Return the log file number for the log file that is currently
|
|
// being compacted, or zero if there is no such log file.
|
|
uint64_t PrevLogNumber() const { return prev_log_number_; }
|
|
|
|
int NumberLevels() const { return num_levels_; }
|
|
|
|
// Pick level and inputs for a new compaction.
|
|
// Returns NULL if there is no compaction to be done.
|
|
// Otherwise returns a pointer to a heap-allocated object that
|
|
// describes the compaction. Caller should delete the result.
|
|
Compaction* PickCompaction();
|
|
|
|
// Return a compaction object for compacting the range [begin,end] in
|
|
// the specified level. Returns NULL if there is nothing in that
|
|
// level that overlaps the specified range. Caller should delete
|
|
// the result.
|
|
Compaction* CompactRange(
|
|
int level,
|
|
const InternalKey* begin,
|
|
const InternalKey* end);
|
|
|
|
// Return the maximum overlapping data (in bytes) at next level for any
|
|
// file at a level >= 1.
|
|
int64_t MaxNextLevelOverlappingBytes();
|
|
|
|
// Create an iterator that reads over the compaction inputs for "*c".
|
|
// The caller should delete the iterator when no longer needed.
|
|
Iterator* MakeInputIterator(Compaction* c);
|
|
|
|
// Returns true iff some level needs a compaction because it has
|
|
// exceeded its target size.
|
|
bool NeedsSizeCompaction() const {
|
|
for (int i = 0; i < NumberLevels()-1; i++) {
|
|
if (current_->compaction_score_[i] >= 1) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
// Returns true iff some level needs a compaction.
|
|
bool NeedsCompaction() const {
|
|
return ((current_->file_to_compact_ != NULL) ||
|
|
NeedsSizeCompaction());
|
|
}
|
|
|
|
// Returns the maxmimum compaction score for levels 1 to max
|
|
double MaxCompactionScore() const {
|
|
return current_->max_compaction_score_;
|
|
}
|
|
|
|
// Add all files listed in any live version to *live.
|
|
// May also mutate some internal state.
|
|
void AddLiveFiles(std::set<uint64_t>* live);
|
|
|
|
// Add all files listed in the current version to *live.
|
|
void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
|
|
|
|
// Return the approximate offset in the database of the data for
|
|
// "key" as of version "v".
|
|
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
|
|
|
|
// Return a human-readable short (single-line) summary of the number
|
|
// of files per level. Uses *scratch as backing store.
|
|
struct LevelSummaryStorage {
|
|
char buffer[100];
|
|
};
|
|
const char* LevelSummary(LevelSummaryStorage* scratch) const;
|
|
|
|
// printf contents (for debugging)
|
|
Status DumpManifest(Options& options, std::string& manifestFileName,
|
|
bool verbose, bool hex = false);
|
|
|
|
// Return a human-readable short (single-line) summary of the data size
|
|
// of files per level. Uses *scratch as backing store.
|
|
const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
|
|
|
|
// Return the size of the current manifest file
|
|
const uint64_t ManifestFileSize() { return current_->offset_manifest_file_; }
|
|
|
|
// For the specfied level, pick a compaction.
|
|
// Returns NULL if there is no compaction to be done.
|
|
// If level is 0 and there is already a compaction on that level, this
|
|
// function will return NULL.
|
|
Compaction* PickCompactionBySize(int level, double score);
|
|
|
|
// Free up the files that were participated in a compaction
|
|
void ReleaseCompactionFiles(Compaction* c, Status status);
|
|
|
|
// verify that the files that we started with for a compaction
|
|
// still exist in the current version and in the same original level.
|
|
// This ensures that a concurrent compaction did not erroneously
|
|
// pick the same files to compact.
|
|
bool VerifyCompactionFileConsistency(Compaction* c);
|
|
|
|
// used to sort files by size
|
|
typedef struct fsize {
|
|
int index;
|
|
FileMetaData* file;
|
|
} Fsize;
|
|
|
|
// Sort all files for this version based on their file size and
|
|
// record results in files_by_size_. The largest files are listed first.
|
|
void UpdateFilesBySize(Version *v);
|
|
|
|
// Get the max file size in a given level.
|
|
uint64_t MaxFileSizeForLevel(int level);
|
|
|
|
private:
|
|
class Builder;
|
|
struct ManifestWriter;
|
|
|
|
friend class Compaction;
|
|
friend class Version;
|
|
|
|
void Init(int num_levels);
|
|
|
|
void Finalize(Version* v);
|
|
|
|
void GetRange(const std::vector<FileMetaData*>& inputs,
|
|
InternalKey* smallest,
|
|
InternalKey* largest);
|
|
|
|
void GetRange2(const std::vector<FileMetaData*>& inputs1,
|
|
const std::vector<FileMetaData*>& inputs2,
|
|
InternalKey* smallest,
|
|
InternalKey* largest);
|
|
|
|
void SetupOtherInputs(Compaction* c);
|
|
|
|
// Save current contents to *log
|
|
Status WriteSnapshot(log::Writer* log);
|
|
|
|
void AppendVersion(Version* v);
|
|
|
|
bool ManifestContains(const std::string& record) const;
|
|
|
|
double MaxBytesForLevel(int level);
|
|
|
|
int64_t ExpandedCompactionByteSizeLimit(int level);
|
|
|
|
int64_t MaxGrandParentOverlapBytes(int level);
|
|
|
|
Env* const env_;
|
|
const std::string dbname_;
|
|
const Options* const options_;
|
|
TableCache* const table_cache_;
|
|
const InternalKeyComparator icmp_;
|
|
uint64_t next_file_number_;
|
|
uint64_t manifest_file_number_;
|
|
uint64_t last_sequence_;
|
|
uint64_t log_number_;
|
|
uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
|
|
|
|
int num_levels_;
|
|
|
|
// Opened lazily
|
|
unique_ptr<log::Writer> descriptor_log_;
|
|
Version dummy_versions_; // Head of circular doubly-linked list of versions.
|
|
Version* current_; // == dummy_versions_.prev_
|
|
|
|
// Per-level key at which the next compaction at that level should start.
|
|
// Either an empty string, or a valid InternalKey.
|
|
std::string* compact_pointer_;
|
|
|
|
// Per-level target file size.
|
|
uint64_t* max_file_size_;
|
|
|
|
// Per-level max bytes
|
|
uint64_t* level_max_bytes_;
|
|
|
|
// record all the ongoing compactions for all levels
|
|
std::vector<std::set<Compaction*> > compactions_in_progress_;
|
|
|
|
// generates a increasing version number for every new version
|
|
uint64_t current_version_number_;
|
|
|
|
// Queue of writers to the manifest file
|
|
std::deque<ManifestWriter*> manifest_writers_;
|
|
|
|
// Store the manifest file size when it is checked.
|
|
// Save us the cost of checking file size twice in LogAndApply
|
|
uint64_t last_observed_manifest_size_;
|
|
|
|
// No copying allowed
|
|
VersionSet(const VersionSet&);
|
|
void operator=(const VersionSet&);
|
|
|
|
// Return the total amount of data that is undergoing
|
|
// compactions at this level
|
|
uint64_t SizeBeingCompacted(int level);
|
|
|
|
// Returns true if any one of the parent files are being compacted
|
|
bool ParentRangeInCompaction(const InternalKey* smallest,
|
|
const InternalKey* largest, int level, int* index);
|
|
|
|
// Returns true if any one of the specified files are being compacted
|
|
bool FilesInCompaction(std::vector<FileMetaData*>& files);
|
|
|
|
void LogAndApplyHelper(Builder*b, Version* v,
|
|
VersionEdit* edit, port::Mutex* mu);
|
|
};
|
|
|
|
// A Compaction encapsulates information about a compaction.
|
|
class Compaction {
|
|
public:
|
|
~Compaction();
|
|
|
|
// Return the level that is being compacted. Inputs from "level"
|
|
// and "level+1" will be merged to produce a set of "level+1" files.
|
|
int level() const { return level_; }
|
|
|
|
// Return the object that holds the edits to the descriptor done
|
|
// by this compaction.
|
|
VersionEdit* edit() { return edit_; }
|
|
|
|
// "which" must be either 0 or 1
|
|
int num_input_files(int which) const { return inputs_[which].size(); }
|
|
|
|
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
|
|
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
|
|
|
|
// Maximum size of files to build during this compaction.
|
|
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
|
|
|
|
// Is this a trivial compaction that can be implemented by just
|
|
// moving a single input file to the next level (no merging or splitting)
|
|
bool IsTrivialMove() const;
|
|
|
|
// Add all inputs to this compaction as delete operations to *edit.
|
|
void AddInputDeletions(VersionEdit* edit);
|
|
|
|
// Returns true if the information we have available guarantees that
|
|
// the compaction is producing data in "level+1" for which no data exists
|
|
// in levels greater than "level+1".
|
|
bool IsBaseLevelForKey(const Slice& user_key);
|
|
|
|
// Returns true iff we should stop building the current output
|
|
// before processing "internal_key".
|
|
bool ShouldStopBefore(const Slice& internal_key);
|
|
|
|
// Release the input version for the compaction, once the compaction
|
|
// is successful.
|
|
void ReleaseInputs();
|
|
|
|
void Summary(char* output, int len);
|
|
|
|
// Return the score that was used to pick this compaction run.
|
|
double score() const { return score_; }
|
|
|
|
private:
|
|
friend class Version;
|
|
friend class VersionSet;
|
|
|
|
explicit Compaction(int level, uint64_t target_file_size,
|
|
uint64_t max_grandparent_overlap_bytes, int number_levels,
|
|
bool seek_compaction = false);
|
|
|
|
int level_;
|
|
uint64_t max_output_file_size_;
|
|
int64_t maxGrandParentOverlapBytes_;
|
|
Version* input_version_;
|
|
VersionEdit* edit_;
|
|
int number_levels_;
|
|
|
|
bool seek_compaction_;
|
|
|
|
// Each compaction reads inputs from "level_" and "level_+1"
|
|
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
|
|
|
|
// State used to check for number of of overlapping grandparent files
|
|
// (parent == level_ + 1, grandparent == level_ + 2)
|
|
std::vector<FileMetaData*> grandparents_;
|
|
size_t grandparent_index_; // Index in grandparent_starts_
|
|
bool seen_key_; // Some output key has been seen
|
|
int64_t overlapped_bytes_; // Bytes of overlap between current output
|
|
// and grandparent files
|
|
int base_index_; // index of the file in files_[level_]
|
|
int parent_index_; // index of some file with same range in files_[level_+1]
|
|
double score_; // score that was used to pick this compaction.
|
|
|
|
// State for implementing IsBaseLevelForKey
|
|
|
|
// level_ptrs_ holds indices into input_version_->levels_: our state
|
|
// is that we are positioned at one of the file ranges for each
|
|
// higher level than the ones involved in this compaction (i.e. for
|
|
// all L >= level_ + 2).
|
|
size_t* level_ptrs_;
|
|
|
|
// mark (or clear) all files that are being compacted
|
|
void MarkFilesBeingCompacted(bool);
|
|
|
|
// In case of compaction error, reset the nextIndex that is used
|
|
// to pick up the next file to be compacted from files_by_size_
|
|
void ResetNextCompactionIndex();
|
|
};
|
|
|
|
} // namespace leveldb
|
|
|
|
#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
|