0acc738810
Summary: There are two versions of FindObsoleteFiles(): * full scan, which is executed every 6 hours (and it's terribly slow) * no full scan, which is executed every time a background process finishes and iterator is deleted This diff is optimizing the second case (no full scan). Here's what we do before the diff: * Get the list of obsolete files (files with ref==0). Some files in obsolete_files set might actually be live. * Get the list of live files to avoid deleting files that are live. * Delete files that are in obsolete_files and not in live_files. After this diff: * The only files with ref==0 that are still live are files that have been part of move compaction. Don't include moved files in obsolete_files. * Get the list of obsolete files (which exclude moved files). * No need to get the list of live files, since all files in obsolete_files need to be deleted. I'll post the benchmark results, but you can get the feel of it here: https://reviews.facebook.net/D30123 This depends on D30123. P.S. We should do full scan only in failure scenarios, not every 6 hours. I'll do this in a follow-up diff. Test Plan: One new unit test. Made sure that unit test fails if we don't have a `if (!f->moved)` safeguard in ~Version. make check Big number of compactions and flushes: ./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000 Reviewers: yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D30249
263 lines
8.2 KiB
C++
263 lines
8.2 KiB
C++
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under the BSD-style license found in the
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
#include <algorithm>
|
|
#include <set>
|
|
#include <utility>
|
|
#include <vector>
|
|
#include <string>
|
|
#include "rocksdb/cache.h"
|
|
#include "db/dbformat.h"
|
|
#include "util/arena.h"
|
|
#include "util/autovector.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
class VersionSet;
|
|
|
|
// Mask with the low 62 bits set. FileDescriptor::GetNumber() applies it to a
// packed identifier to extract the file number, and FileDescriptor::GetPathId()
// divides by (mask + 1) to extract the path id stored above those bits.
const uint64_t kFileNumberMask = (static_cast<uint64_t>(1) << 62) - 1;

// Packs a file number and a path id into a single 64-bit value. Only the
// declaration lives in this header; the definition is not visible here.
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
|
|
|
|
// A copyable structure contains information needed to read data from an SST
|
|
// file. It can contains a pointer to a table reader opened for the file, or
|
|
// file number and size, which can be used to create a new table reader for it.
|
|
// The behavior is undefined when a copied of the structure is used when the
|
|
// file is not in any live version any more.
|
|
struct FileDescriptor {
|
|
// Table reader in table_reader_handle
|
|
TableReader* table_reader;
|
|
uint64_t packed_number_and_path_id;
|
|
uint64_t file_size; // File size in bytes
|
|
|
|
FileDescriptor() : FileDescriptor(0, 0, 0) {}
|
|
|
|
FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
|
|
: table_reader(nullptr),
|
|
packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
|
|
file_size(_file_size) {}
|
|
|
|
FileDescriptor& operator=(const FileDescriptor& fd) {
|
|
table_reader = fd.table_reader;
|
|
packed_number_and_path_id = fd.packed_number_and_path_id;
|
|
file_size = fd.file_size;
|
|
return *this;
|
|
}
|
|
|
|
uint64_t GetNumber() const {
|
|
return packed_number_and_path_id & kFileNumberMask;
|
|
}
|
|
uint32_t GetPathId() const {
|
|
return packed_number_and_path_id / (kFileNumberMask + 1);
|
|
}
|
|
uint64_t GetFileSize() const { return file_size; }
|
|
};
|
|
|
|
struct FileMetaData {
|
|
int refs;
|
|
FileDescriptor fd;
|
|
InternalKey smallest; // Smallest internal key served by table
|
|
InternalKey largest; // Largest internal key served by table
|
|
bool being_compacted; // Is this file undergoing compaction?
|
|
SequenceNumber smallest_seqno; // The smallest seqno in this file
|
|
SequenceNumber largest_seqno; // The largest seqno in this file
|
|
|
|
// Needs to be disposed when refs becomes 0.
|
|
Cache::Handle* table_reader_handle;
|
|
|
|
// Stats for compensating deletion entries during compaction
|
|
|
|
// File size compensated by deletion entry.
|
|
// This is updated in Version::UpdateAccumulatedStats() first time when the
|
|
// file is created or loaded. After it is updated, it is immutable.
|
|
uint64_t compensated_file_size;
|
|
uint64_t num_entries; // the number of entries.
|
|
uint64_t num_deletions; // the number of deletion entries.
|
|
uint64_t raw_key_size; // total uncompressed key size.
|
|
uint64_t raw_value_size; // total uncompressed value size.
|
|
bool init_stats_from_file; // true if the data-entry stats of this file
|
|
// has initialized from file.
|
|
|
|
// Always false for new files. Set to true if the file was part of move
|
|
// compaction. Can only be mutated from the compaction process, under DB mutex
|
|
bool moved;
|
|
|
|
FileMetaData()
|
|
: refs(0),
|
|
being_compacted(false),
|
|
table_reader_handle(nullptr),
|
|
compensated_file_size(0),
|
|
num_entries(0),
|
|
num_deletions(0),
|
|
raw_key_size(0),
|
|
raw_value_size(0),
|
|
init_stats_from_file(false),
|
|
moved(false) {}
|
|
};
|
|
|
|
// A compressed copy of file meta data that just contain
|
|
// smallest and largest key's slice
|
|
struct FdWithKeyRange {
|
|
FileDescriptor fd;
|
|
Slice smallest_key; // slice that contain smallest key
|
|
Slice largest_key; // slice that contain largest key
|
|
|
|
FdWithKeyRange()
|
|
: fd(),
|
|
smallest_key(),
|
|
largest_key() {
|
|
}
|
|
|
|
FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key)
|
|
: fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {}
|
|
};
|
|
|
|
// Data structure to store an array of FdWithKeyRange in one level
|
|
// Actual data is guaranteed to be stored closely
|
|
struct LevelFilesBrief {
|
|
size_t num_files;
|
|
FdWithKeyRange* files;
|
|
LevelFilesBrief() {
|
|
num_files = 0;
|
|
files = nullptr;
|
|
}
|
|
};
|
|
|
|
class VersionEdit {
|
|
public:
|
|
VersionEdit() { Clear(); }
|
|
~VersionEdit() { }
|
|
|
|
void Clear();
|
|
|
|
void SetComparatorName(const Slice& name) {
|
|
has_comparator_ = true;
|
|
comparator_ = name.ToString();
|
|
}
|
|
void SetLogNumber(uint64_t num) {
|
|
has_log_number_ = true;
|
|
log_number_ = num;
|
|
}
|
|
void SetPrevLogNumber(uint64_t num) {
|
|
has_prev_log_number_ = true;
|
|
prev_log_number_ = num;
|
|
}
|
|
void SetNextFile(uint64_t num) {
|
|
has_next_file_number_ = true;
|
|
next_file_number_ = num;
|
|
}
|
|
void SetLastSequence(SequenceNumber seq) {
|
|
has_last_sequence_ = true;
|
|
last_sequence_ = seq;
|
|
}
|
|
void SetMaxColumnFamily(uint32_t max_column_family) {
|
|
has_max_column_family_ = true;
|
|
max_column_family_ = max_column_family;
|
|
}
|
|
|
|
// Add the specified file at the specified number.
|
|
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
|
|
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
|
|
void AddFile(int level, uint64_t file, uint32_t file_path_id,
|
|
uint64_t file_size, const InternalKey& smallest,
|
|
const InternalKey& largest, const SequenceNumber& smallest_seqno,
|
|
const SequenceNumber& largest_seqno) {
|
|
assert(smallest_seqno <= largest_seqno);
|
|
FileMetaData f;
|
|
f.fd = FileDescriptor(file, file_path_id, file_size);
|
|
f.smallest = smallest;
|
|
f.largest = largest;
|
|
f.smallest_seqno = smallest_seqno;
|
|
f.largest_seqno = largest_seqno;
|
|
new_files_.push_back(std::make_pair(level, f));
|
|
}
|
|
|
|
// Delete the specified "file" from the specified "level".
|
|
void DeleteFile(int level, uint64_t file) {
|
|
deleted_files_.insert({level, file});
|
|
}
|
|
|
|
// Number of edits
|
|
size_t NumEntries() { return new_files_.size() + deleted_files_.size(); }
|
|
|
|
bool IsColumnFamilyManipulation() {
|
|
return is_column_family_add_ || is_column_family_drop_;
|
|
}
|
|
|
|
void SetColumnFamily(uint32_t column_family_id) {
|
|
column_family_ = column_family_id;
|
|
}
|
|
|
|
// set column family ID by calling SetColumnFamily()
|
|
void AddColumnFamily(const std::string& name) {
|
|
assert(!is_column_family_drop_);
|
|
assert(!is_column_family_add_);
|
|
assert(NumEntries() == 0);
|
|
is_column_family_add_ = true;
|
|
column_family_name_ = name;
|
|
}
|
|
|
|
// set column family ID by calling SetColumnFamily()
|
|
void DropColumnFamily() {
|
|
assert(!is_column_family_drop_);
|
|
assert(!is_column_family_add_);
|
|
assert(NumEntries() == 0);
|
|
is_column_family_drop_ = true;
|
|
}
|
|
|
|
// return true on success.
|
|
bool EncodeTo(std::string* dst) const;
|
|
Status DecodeFrom(const Slice& src);
|
|
|
|
typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
|
|
|
|
const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }
|
|
const std::vector<std::pair<int, FileMetaData>>& GetNewFiles() {
|
|
return new_files_;
|
|
}
|
|
|
|
std::string DebugString(bool hex_key = false) const;
|
|
|
|
private:
|
|
friend class VersionSet;
|
|
friend class Version;
|
|
|
|
bool GetLevel(Slice* input, int* level, const char** msg);
|
|
|
|
int max_level_;
|
|
std::string comparator_;
|
|
uint64_t log_number_;
|
|
uint64_t prev_log_number_;
|
|
uint64_t next_file_number_;
|
|
uint32_t max_column_family_;
|
|
SequenceNumber last_sequence_;
|
|
bool has_comparator_;
|
|
bool has_log_number_;
|
|
bool has_prev_log_number_;
|
|
bool has_next_file_number_;
|
|
bool has_last_sequence_;
|
|
bool has_max_column_family_;
|
|
|
|
DeletedFileSet deleted_files_;
|
|
std::vector<std::pair<int, FileMetaData>> new_files_;
|
|
|
|
// Each version edit record should have column_family_id set
|
|
// If it's not set, it is default (0)
|
|
uint32_t column_family_;
|
|
// a version edit can be either column_family add or
|
|
// column_family drop. If it's column family add,
|
|
// it also includes column family name.
|
|
bool is_column_family_drop_;
|
|
bool is_column_family_add_;
|
|
std::string column_family_name_;
|
|
};
|
|
|
|
} // namespace rocksdb
|