rocksdb/db/version_edit.cc
Levi Tamasi 5f025ea832 BlobDB GC: add SST <-> oldest blob file referenced mapping (#5903)
Summary:
This is groundwork for adding garbage collection support to BlobDB. The
patch adds logic that keeps track of the oldest blob file referred to by
each SST file. The oldest blob file is identified during flush/
compaction (similarly to how the range of keys covered by the SST is
identified), and persisted in the manifest as a custom field of the new
file edit record. Blob indexes with TTL are ignored for the purposes of
identifying the oldest blob file (since such blob files are cleaned up by the
TTL logic in BlobDB).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5903

Test Plan:
Added new unit tests; also ran db_bench in BlobDB mode, inspected the
manifest using ldb, and confirmed (by scanning the SST files using
sst_dump) that the value of the oldest blob file number field matches
the contents of the file for each SST.

Differential Revision: D17859997

Pulled By: ltamasi

fbshipit-source-id: 21662c137c6259a6af70446faaf3a9912c550e90
2019-10-14 15:21:01 -07:00

773 lines
23 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "db/blob_index.h"
#include "db/version_set.h"
#include "logging/event_logger.h"
#include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/string_util.h"
namespace rocksdb {
// Bit that marks a tag written by a newer release as safe to skip: a reader
// that does not recognize such a tag may ignore the entry instead of failing.
const uint32_t kTagSafeIgnoreMask = 1u << 13;
// Tag numbers identifying the records of a serialized VersionEdit. These
// numbers are written to disk and should not be changed. New tags should be
// forward compatible so users can down-grade RocksDB safely: a tag from a
// newer release is skipped by the decoder when (tag & kTagSafeIgnoreMask) is
// non-zero, in which case the tag must be followed by the length of its entry.
enum Tag : uint32_t {
kComparator = 1,
kLogNumber = 2,
kNextFileNumber = 3,
kLastSequence = 4,
// Still decoded (and discarded) so that old manifests keep loading.
kCompactPointer = 5,
kDeletedFile = 6,
// Oldest new-file record: level, file number, file size, smallest/largest key.
kNewFile = 7,
// 8 was used for large value refs
kPrevLogNumber = 9,
kMinLogNumberToKeep = 10,
// Ignore-able field
kDbId = kTagSafeIgnoreMask + 1,
// these are new formats divergent from open source leveldb
// kNewFile2 adds the smallest/largest sequence numbers to kNewFile;
// kNewFile3 additionally stores the path id.
kNewFile2 = 100,
kNewFile3 = 102,
kNewFile4 = 103, // 4th (the latest) format version of adding files
kColumnFamily = 200, // specify column family for version edit
kColumnFamilyAdd = 201,
kColumnFamilyDrop = 202,
kMaxColumnFamily = 203,
// Followed by the number of remaining entries in the atomic group.
kInAtomicGroup = 300,
};
// Tags of the customized fields appended to a kNewFile4 record. Each field is
// written as its tag followed by a length-prefixed payload; kTerminate ends
// the list and carries no payload.
enum CustomTag : uint32_t {
kTerminate = 1, // The end of customized fields
kNeedCompaction = 2,
// Since the manifest is currently not entirely forward-compatible, and the
// only forward-compatible part is the CustomTag of kNewFile4, we currently
// encode kMinLogNumberToKeep as part of a CustomTag as a hack. This should be
// removed when the manifest becomes forward-compatible.
kMinLogNumberToKeepHack = 3,
kOldestBlobFileNumber = 4,
// 65 has bit 6 (value 64) set, so a reader that does not know this tag must
// fail rather than skip it.
kPathId = 65,
};
// If this bit is set in a custom tag that the decoder does not recognize,
// decoding the new-file record fails (and with it opening the DB) instead of
// silently skipping the field.
// Declared const, like kTagSafeIgnoreMask, so the mask cannot be modified at
// runtime and does not needlessly export a mutable global symbol.
const uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
// Combines a file number and a path id into a single 64-bit value: the file
// number occupies the bits covered by kFileNumberMask and the path id is
// placed in the bits above them.
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
  assert(number <= kFileNumberMask);
  // Multiplying by (mask + 1) moves path_id above the file-number bits.
  const uint64_t shifted_path_id = path_id * (kFileNumberMask + 1);
  return shifted_path_id | number;
}
void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
SequenceNumber seqno,
ValueType value_type) {
if (smallest.size() == 0) {
smallest.DecodeFrom(key);
}
largest.DecodeFrom(key);
fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
fd.largest_seqno = std::max(fd.largest_seqno, seqno);
#ifndef ROCKSDB_LITE
if (value_type == kTypeBlobIndex) {
BlobIndex blob_index;
const Status s = blob_index.DecodeFrom(value);
if (!s.ok()) {
return;
}
if (blob_index.IsInlined()) {
return;
}
if (blob_index.HasTTL()) {
return;
}
// Paranoid check: this should not happen because BlobDB numbers the blob
// files starting from 1.
if (blob_index.file_number() == kInvalidBlobFileNumber) {
return;
}
if (oldest_blob_file_number == kInvalidBlobFileNumber ||
oldest_blob_file_number > blob_index.file_number()) {
oldest_blob_file_number = blob_index.file_number();
}
}
#else
(void)value;
(void)value_type;
#endif
}
// Resets this edit to the empty state: every optional field is marked absent
// and zeroed/cleared, and the file and column family changes are dropped.
void VersionEdit::Clear() {
  // Optional scalar fields, each paired with its presence flag.
  has_db_id_ = false;
  db_id_.clear();
  has_comparator_ = false;
  comparator_.clear();
  has_log_number_ = false;
  log_number_ = 0;
  has_prev_log_number_ = false;
  prev_log_number_ = 0;
  has_next_file_number_ = false;
  next_file_number_ = 0;
  has_last_sequence_ = false;
  last_sequence_ = 0;
  has_max_column_family_ = false;
  max_column_family_ = 0;
  has_min_log_number_to_keep_ = false;
  min_log_number_to_keep_ = 0;

  // File changes.
  max_level_ = 0;
  deleted_files_.clear();
  new_files_.clear();

  // Column family manipulation.
  column_family_ = 0;
  is_column_family_add_ = false;
  is_column_family_drop_ = false;
  column_family_name_.clear();

  // Atomic group membership.
  is_in_atomic_group_ = false;
  remaining_entries_ = 0;
}
bool VersionEdit::EncodeTo(std::string* dst) const {
if (has_db_id_) {
PutVarint32(dst, kDbId);
PutLengthPrefixedSlice(dst, db_id_);
}
if (has_comparator_) {
PutVarint32(dst, kComparator);
PutLengthPrefixedSlice(dst, comparator_);
}
if (has_log_number_) {
PutVarint32Varint64(dst, kLogNumber, log_number_);
}
if (has_prev_log_number_) {
PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
}
if (has_next_file_number_) {
PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
}
if (has_last_sequence_) {
PutVarint32Varint64(dst, kLastSequence, last_sequence_);
}
if (has_max_column_family_) {
PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
}
for (const auto& deleted : deleted_files_) {
PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
deleted.second /* file number */);
}
bool min_log_num_written = false;
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
if (!f.smallest.Valid() || !f.largest.Valid()) {
return false;
}
bool has_customized_fields = false;
if (f.marked_for_compaction || has_min_log_number_to_keep_ ||
f.oldest_blob_file_number != kInvalidBlobFileNumber) {
PutVarint32(dst, kNewFile4);
has_customized_fields = true;
} else if (f.fd.GetPathId() == 0) {
// Use older format to make sure user can roll back the build if they
// don't config multiple DB paths.
PutVarint32(dst, kNewFile2);
} else {
PutVarint32(dst, kNewFile3);
}
PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
if (f.fd.GetPathId() != 0 && !has_customized_fields) {
// kNewFile3
PutVarint32(dst, f.fd.GetPathId());
}
PutVarint64(dst, f.fd.GetFileSize());
PutLengthPrefixedSlice(dst, f.smallest.Encode());
PutLengthPrefixedSlice(dst, f.largest.Encode());
PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
if (has_customized_fields) {
// Customized fields' format:
// +-----------------------------+
// | 1st field's tag (varint32) |
// +-----------------------------+
// | 1st field's size (varint32) |
// +-----------------------------+
// | bytes for 1st field |
// | (based on size decoded) |
// +-----------------------------+
// | |
// | ...... |
// | |
// +-----------------------------+
// | last field's size (varint32)|
// +-----------------------------+
// | bytes for last field |
// | (based on size decoded) |
// +-----------------------------+
// | terminating tag (varint32) |
// +-----------------------------+
//
// Customized encoding for fields:
// tag kPathId: 1 byte as path_id
// tag kNeedCompaction:
// now only can take one char value 1 indicating need-compaction
//
if (f.fd.GetPathId() != 0) {
PutVarint32(dst, CustomTag::kPathId);
char p = static_cast<char>(f.fd.GetPathId());
PutLengthPrefixedSlice(dst, Slice(&p, 1));
}
if (f.marked_for_compaction) {
PutVarint32(dst, CustomTag::kNeedCompaction);
char p = static_cast<char>(1);
PutLengthPrefixedSlice(dst, Slice(&p, 1));
}
if (has_min_log_number_to_keep_ && !min_log_num_written) {
PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack);
std::string varint_log_number;
PutFixed64(&varint_log_number, min_log_number_to_keep_);
PutLengthPrefixedSlice(dst, Slice(varint_log_number));
min_log_num_written = true;
}
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
PutVarint32(dst, CustomTag::kOldestBlobFileNumber);
std::string oldest_blob_file_number;
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
}
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
dst);
PutVarint32(dst, CustomTag::kTerminate);
}
}
// 0 is default and does not need to be explicitly written
if (column_family_ != 0) {
PutVarint32Varint32(dst, kColumnFamily, column_family_);
}
if (is_column_family_add_) {
PutVarint32(dst, kColumnFamilyAdd);
PutLengthPrefixedSlice(dst, Slice(column_family_name_));
}
if (is_column_family_drop_) {
PutVarint32(dst, kColumnFamilyDrop);
}
if (is_in_atomic_group_) {
PutVarint32(dst, kInAtomicGroup);
PutVarint32(dst, remaining_entries_);
}
return true;
}
// Reads a length-prefixed internal key from the front of *input into *dst.
// Returns true only if the slice could be read and the decoded key is valid.
static bool GetInternalKey(Slice* input, InternalKey* dst) {
  Slice encoded_key;
  if (!GetLengthPrefixedSlice(input, &encoded_key)) {
    return false;
  }
  dst->DecodeFrom(encoded_key);
  return dst->Valid();
}
// Reads a varint32 level from the front of *input into *level and raises
// max_level_ to it if it is larger. Returns false, leaving *level and
// max_level_ untouched, if the varint cannot be read. The message out-parameter
// is unused.
bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
  uint32_t decoded_level;
  if (!GetVarint32(input, &decoded_level)) {
    return false;
  }
  *level = decoded_level;
  if (*level > max_level_) {
    max_level_ = *level;
  }
  return true;
}
// Decodes the body of a kNewFile4 record (everything after the tag) from the
// front of *input and, on success, appends the described file to new_files_.
//
// Returns nullptr on success, or a static string naming what could not be
// parsed. On failure nothing is appended to new_files_, but
// min_log_number_to_keep_ / has_min_log_number_to_keep_ and max_level_ may
// already have been updated by the fields decoded before the error.
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
  const char* msg = nullptr;  // only handed to GetLevel, which ignores it
  int level = 0;
  FileMetaData f;
  uint64_t number = 0;
  uint32_t path_id = 0;
  uint64_t file_size = 0;
  SequenceNumber smallest_seqno = 0;
  SequenceNumber largest_seqno = 0;

  // Fixed part of the record, identical to kNewFile2.
  if (!(GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
        GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
        GetInternalKey(input, &f.largest) &&
        GetVarint64(input, &smallest_seqno) &&
        GetVarint64(input, &largest_seqno))) {
    return "new-file4 entry";
  }

  // Customized fields: since this is the only forward-compatible part of the
  // format, new extensions are hacked into this record. See comments in
  // VersionEdit::EncodeTo() for the format of customized fields.
  while (true) {
    uint32_t custom_tag = 0;
    Slice field;
    if (!GetVarint32(input, &custom_tag)) {
      return "new-file4 custom field";
    }
    if (custom_tag == kTerminate) {
      break;
    }
    if (!GetLengthPrefixedSlice(input, &field)) {
      return "new-file4 custom field length prefixed slice error";
    }
    switch (custom_tag) {
      case kPathId:
        if (field.size() != 1) {
          return "path_id field wrong size";
        }
        path_id = field[0];
        if (path_id > 3) {
          return "path_id wrong value";
        }
        break;
      case kNeedCompaction:
        if (field.size() != 1) {
          return "need_compaction field wrong size";
        }
        f.marked_for_compaction = (field[0] == 1);
        break;
      case kMinLogNumberToKeepHack:
        // This is a hack to encode kMinLogNumberToKeep in a
        // forward-compatible fashion.
        if (!GetFixed64(&field, &min_log_number_to_keep_)) {
          return "deleted log number malformatted";
        }
        has_min_log_number_to_keep_ = true;
        break;
      case kOldestBlobFileNumber:
        if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
          return "invalid oldest blob file number";
        }
        break;
      default:
        if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
          // Should not proceed if cannot understand it
          return "new-file4 custom field not supported";
        }
        // Unknown but ignorable field: its payload was already consumed.
        break;
    }
  }

  f.fd =
      FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
  new_files_.push_back(std::make_pair(level, f));
  return nullptr;
}
// Parses a serialized VersionEdit from `src` into this object.
//
// Any previous contents are discarded with Clear() before parsing. Records are
// read tag by tag until the input is exhausted or an error is found. Unknown
// tags with the kTagSafeIgnoreMask bit set are skipped using the length that
// follows them; any other unknown tag is an error.
//
// Returns an OK status on success, or Status::Corruption("VersionEdit", reason)
// where `reason` names the record that failed to parse. On failure the object
// keeps whatever fields were decoded before the error.
Status VersionEdit::DecodeFrom(const Slice& src) {
  Clear();
  Slice input = src;
  const char* msg = nullptr;
  uint32_t tag = 0;

  // Temporary storage for parsing
  int level = 0;
  FileMetaData f;
  Slice str;
  InternalKey key;

  while (msg == nullptr && GetVarint32(&input, &tag)) {
    switch (tag) {
      case kDbId:
        if (GetLengthPrefixedSlice(&input, &str)) {
          db_id_ = str.ToString();
          has_db_id_ = true;
        } else {
          msg = "db id";
        }
        break;

      case kComparator:
        if (GetLengthPrefixedSlice(&input, &str)) {
          comparator_ = str.ToString();
          has_comparator_ = true;
        } else {
          msg = "comparator name";
        }
        break;

      case kLogNumber:
        if (GetVarint64(&input, &log_number_)) {
          has_log_number_ = true;
        } else {
          msg = "log number";
        }
        break;

      case kPrevLogNumber:
        if (GetVarint64(&input, &prev_log_number_)) {
          has_prev_log_number_ = true;
        } else {
          msg = "previous log number";
        }
        break;

      case kNextFileNumber:
        if (GetVarint64(&input, &next_file_number_)) {
          has_next_file_number_ = true;
        } else {
          msg = "next file number";
        }
        break;

      case kLastSequence:
        if (GetVarint64(&input, &last_sequence_)) {
          has_last_sequence_ = true;
        } else {
          msg = "last sequence number";
        }
        break;

      case kMaxColumnFamily:
        if (GetVarint32(&input, &max_column_family_)) {
          has_max_column_family_ = true;
        } else {
          msg = "max column family";
        }
        break;

      case kMinLogNumberToKeep:
        if (GetVarint64(&input, &min_log_number_to_keep_)) {
          has_min_log_number_to_keep_ = true;
        } else {
          msg = "min log number to keep";
        }
        break;

      case kCompactPointer:
        if (GetLevel(&input, &level, &msg) &&
            GetInternalKey(&input, &key)) {
          // we don't use compact pointers anymore,
          // but we should not fail if they are still
          // in manifest
        } else {
          if (!msg) {
            msg = "compaction pointer";
          }
        }
        break;

      case kDeletedFile: {
        uint64_t number;
        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
          deleted_files_.insert(std::make_pair(level, number));
        } else {
          if (!msg) {
            msg = "deleted file";
          }
        }
        break;
      }

      case kNewFile: {
        // Oldest format: no sequence numbers, path id implicitly 0.
        uint64_t number;
        uint64_t file_size;
        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
            GetVarint64(&input, &file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest)) {
          f.fd = FileDescriptor(number, 0, file_size);
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
            msg = "new-file entry";
          }
        }
        break;
      }

      case kNewFile2: {
        // Adds the sequence number range; path id implicitly 0.
        uint64_t number;
        uint64_t file_size;
        SequenceNumber smallest_seqno;
        SequenceNumber largest_seqno;
        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
            GetVarint64(&input, &file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest) &&
            GetVarint64(&input, &smallest_seqno) &&
            GetVarint64(&input, &largest_seqno)) {
          f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
                                largest_seqno);
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
            msg = "new-file2 entry";
          }
        }
        break;
      }

      case kNewFile3: {
        // Like kNewFile2, with the path id stored after the file number.
        uint64_t number;
        uint32_t path_id;
        uint64_t file_size;
        SequenceNumber smallest_seqno;
        SequenceNumber largest_seqno;
        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
            GetInternalKey(&input, &f.smallest) &&
            GetInternalKey(&input, &f.largest) &&
            GetVarint64(&input, &smallest_seqno) &&
            GetVarint64(&input, &largest_seqno)) {
          f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
                                largest_seqno);
          new_files_.push_back(std::make_pair(level, f));
        } else {
          if (!msg) {
            msg = "new-file3 entry";
          }
        }
        break;
      }

      case kNewFile4: {
        msg = DecodeNewFile4From(&input);
        break;
      }

      case kColumnFamily:
        if (!GetVarint32(&input, &column_family_)) {
          if (!msg) {
            msg = "set column family id";
          }
        }
        break;

      case kColumnFamilyAdd:
        if (GetLengthPrefixedSlice(&input, &str)) {
          is_column_family_add_ = true;
          column_family_name_ = str.ToString();
        } else {
          if (!msg) {
            msg = "column family add";
          }
        }
        break;

      case kColumnFamilyDrop:
        is_column_family_drop_ = true;
        break;

      case kInAtomicGroup:
        is_in_atomic_group_ = true;
        if (!GetVarint32(&input, &remaining_entries_)) {
          if (!msg) {
            msg = "remaining entries";
          }
        }
        break;

      default:
        if (tag & kTagSafeIgnoreMask) {
          // Tag from future which can be safely ignored.
          // The next field must be the length of the entry.
          uint32_t field_len;
          if (!GetVarint32(&input, &field_len) ||
              static_cast<size_t>(field_len) > input.size()) {
            if (!msg) {
              msg = "safely ignoreable tag length error";
            }
          } else {
            input.remove_prefix(static_cast<size_t>(field_len));
          }
        } else {
          msg = "unknown tag";
        }
        break;
    }
  }

  // The loop also stops when a tag varint cannot be read; leftover bytes in
  // that case mean the input is malformed.
  if (msg == nullptr && !input.empty()) {
    msg = "invalid tag";
  }

  Status result;
  if (msg != nullptr) {
    result = Status::Corruption("VersionEdit", msg);
  }
  return result;
}
std::string VersionEdit::DebugString(bool hex_key) const {
std::string r;
r.append("VersionEdit {");
if (has_db_id_) {
r.append("\n DB ID: ");
r.append(db_id_);
}
if (has_comparator_) {
r.append("\n Comparator: ");
r.append(comparator_);
}
if (has_log_number_) {
r.append("\n LogNumber: ");
AppendNumberTo(&r, log_number_);
}
if (has_prev_log_number_) {
r.append("\n PrevLogNumber: ");
AppendNumberTo(&r, prev_log_number_);
}
if (has_next_file_number_) {
r.append("\n NextFileNumber: ");
AppendNumberTo(&r, next_file_number_);
}
if (has_min_log_number_to_keep_) {
r.append("\n MinLogNumberToKeep: ");
AppendNumberTo(&r, min_log_number_to_keep_);
}
if (has_last_sequence_) {
r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_);
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
r.append("\n DeleteFile: ");
AppendNumberTo(&r, iter->first);
r.append(" ");
AppendNumberTo(&r, iter->second);
}
for (size_t i = 0; i < new_files_.size(); i++) {
const FileMetaData& f = new_files_[i].second;
r.append("\n AddFile: ");
AppendNumberTo(&r, new_files_[i].first);
r.append(" ");
AppendNumberTo(&r, f.fd.GetNumber());
r.append(" ");
AppendNumberTo(&r, f.fd.GetFileSize());
r.append(" ");
r.append(f.smallest.DebugString(hex_key));
r.append(" .. ");
r.append(f.largest.DebugString(hex_key));
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
r.append(" blob_file:");
AppendNumberTo(&r, f.oldest_blob_file_number);
}
}
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);
if (is_column_family_add_) {
r.append("\n ColumnFamilyAdd: ");
r.append(column_family_name_);
}
if (is_column_family_drop_) {
r.append("\n ColumnFamilyDrop");
}
if (has_max_column_family_) {
r.append("\n MaxColumnFamily: ");
AppendNumberTo(&r, max_column_family_);
}
if (is_in_atomic_group_) {
r.append("\n AtomicGroup: ");
AppendNumberTo(&r, remaining_entries_);
r.append(" entries remains");
}
r.append("\n}\n");
return r;
}
std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
JSONWriter jw;
jw << "EditNumber" << edit_num;
if (has_db_id_) {
jw << "DB ID" << db_id_;
}
if (has_comparator_) {
jw << "Comparator" << comparator_;
}
if (has_log_number_) {
jw << "LogNumber" << log_number_;
}
if (has_prev_log_number_) {
jw << "PrevLogNumber" << prev_log_number_;
}
if (has_next_file_number_) {
jw << "NextFileNumber" << next_file_number_;
}
if (has_last_sequence_) {
jw << "LastSeq" << last_sequence_;
}
if (!deleted_files_.empty()) {
jw << "DeletedFiles";
jw.StartArray();
for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end();
++iter) {
jw.StartArrayedObject();
jw << "Level" << iter->first;
jw << "FileNumber" << iter->second;
jw.EndArrayedObject();
}
jw.EndArray();
}
if (!new_files_.empty()) {
jw << "AddedFiles";
jw.StartArray();
for (size_t i = 0; i < new_files_.size(); i++) {
jw.StartArrayedObject();
jw << "Level" << new_files_[i].first;
const FileMetaData& f = new_files_[i].second;
jw << "FileNumber" << f.fd.GetNumber();
jw << "FileSize" << f.fd.GetFileSize();
jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
jw << "LargestIKey" << f.largest.DebugString(hex_key);
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
jw << "OldestBlobFile" << f.oldest_blob_file_number;
}
jw.EndArrayedObject();
}
jw.EndArray();
}
jw << "ColumnFamily" << column_family_;
if (is_column_family_add_) {
jw << "ColumnFamilyAdd" << column_family_name_;
}
if (is_column_family_drop_) {
jw << "ColumnFamilyDrop" << column_family_name_;
}
if (has_max_column_family_) {
jw << "MaxColumnFamily" << max_column_family_;
}
if (has_min_log_number_to_keep_) {
jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
}
if (is_in_atomic_group_) {
jw << "AtomicGroup" << remaining_entries_;
}
jw.EndObject();
return jw.Get();
}
} // namespace rocksdb