BlobDB GC: add SST <-> oldest blob file referenced mapping (#5903)

Summary:
This is groundwork for adding garbage collection support to BlobDB. The
patch adds logic that keeps track of the oldest blob file referred to by
each SST file. The oldest blob file is identified during flush/compaction
(similarly to how the range of keys covered by the SST is
identified), and persisted in the manifest as a custom field of the new
file edit record. Blob indexes with TTL are ignored for the purposes of
identifying the oldest blob file (since such blob files are cleaned up by the
TTL logic in BlobDB).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5903

Test Plan:
Added new unit tests; also ran db_bench in BlobDB mode, inspected the
manifest using ldb, and confirmed (by scanning the SST files using
sst_dump) that the value of the oldest blob file number field matches
the contents of the file for each SST.

Differential Revision: D17859997

Pulled By: ltamasi

fbshipit-source-id: 21662c137c6259a6af70446faaf3a9912c550e90
This commit is contained in:
Levi Tamasi 2019-10-14 15:19:31 -07:00 committed by Facebook Github Bot
parent a59dc843a4
commit 5f025ea832
21 changed files with 378 additions and 157 deletions

View File

@ -10,7 +10,6 @@
#include "util/string_util.h"
namespace rocksdb {
namespace blob_db {
// BlobIndex is a pointer to the blob and metadata of the blob. The index is
// stored in base DB as ValueType::kTypeBlobIndex.
@ -156,6 +155,5 @@ class BlobIndex {
CompressionType compression_ = kNoCompression;
};
} // namespace blob_db
} // namespace rocksdb
#endif // ROCKSDB_LITE

View File

@ -124,7 +124,7 @@ Status BuildTable(
if (!s.ok()) {
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger, ioptions.listeners, dbname, column_family_name, fname,
job_id, meta->fd, tp, reason, s);
job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s);
return s;
}
file->SetIOPriority(io_priority);
@ -157,8 +157,9 @@ Status BuildTable(
for (; c_iter.Valid(); c_iter.Next()) {
const Slice& key = c_iter.key();
const Slice& value = c_iter.value();
const ParsedInternalKey& ikey = c_iter.ikey();
builder->Add(key, value);
meta->UpdateBoundaries(key, c_iter.ikey().sequence);
meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
// TODO(noetzli): Update stats after flush, too.
if (io_priority == Env::IO_HIGH &&
@ -249,7 +250,7 @@ Status BuildTable(
// Output to event logger and fire events.
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger, ioptions.listeners, dbname, column_family_name, fname,
job_id, meta->fd, tp, reason, s);
job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s);
return s;
}

View File

@ -933,8 +933,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
assert(sub_compact->current_output() != nullptr);
sub_compact->builder->Add(key, value);
sub_compact->current_output_file_size = sub_compact->builder->FileSize();
const ParsedInternalKey& ikey = c_iter->ikey();
sub_compact->current_output()->meta.UpdateBoundaries(
key, c_iter->ikey().sequence);
key, value, ikey.sequence, ikey.type);
sub_compact->num_output_records++;
// Close output file if it is big enough. Two possibilities determine it's
@ -1349,17 +1350,20 @@ Status CompactionJob::FinishCompactionOutputFile(
}
std::string fname;
FileDescriptor output_fd;
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
if (meta != nullptr) {
fname =
TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths,
meta->fd.GetNumber(), meta->fd.GetPathId());
output_fd = meta->fd;
oldest_blob_file_number = meta->oldest_blob_file_number;
} else {
fname = "(nil)";
}
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
job_id_, output_fd, tp, TableFileCreationReason::kCompaction, s);
job_id_, output_fd, oldest_blob_file_number, tp,
TableFileCreationReason::kCompaction, s);
#ifndef ROCKSDB_LITE
// Report new file to SstFileManagerImpl
@ -1469,8 +1473,8 @@ Status CompactionJob::OpenCompactionOutputFile(
LogFlush(db_options_.info_log);
EventHelpers::LogAndNotifyTableFileCreationFinished(
event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
fname, job_id_, FileDescriptor(), TableProperties(),
TableFileCreationReason::kCompaction, s);
fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
TableProperties(), TableFileCreationReason::kCompaction, s);
return s;
}

View File

@ -12,6 +12,7 @@
#include <string>
#include <tuple>
#include "db/blob_index.h"
#include "db/column_family.h"
#include "db/compaction/compaction_job.h"
#include "db/db_impl/db_impl.h"
@ -97,11 +98,34 @@ class CompactionJobTest : public testing::Test {
return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
}
std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
const ValueType t) {
static std::string KeyStr(const std::string& user_key,
const SequenceNumber seq_num, const ValueType t) {
return InternalKey(user_key, seq_num, t).Encode().ToString();
}
// Test helper: returns the string produced by BlobIndex::EncodeBlob for the
// given blob file number, offset and size, using kNoCompression.
static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
                           uint64_t size) {
  std::string encoded;
  BlobIndex::EncodeBlob(&encoded, blob_file_number, offset, size,
                        kNoCompression);
  return encoded;
}
// Test helper: returns the string produced by BlobIndex::EncodeBlobTTL for
// the given blob file number, offset, size and expiration, using
// kNoCompression. Note that the encoder takes the expiration first.
static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
                              uint64_t size, uint64_t expiration) {
  std::string encoded;
  BlobIndex::EncodeBlobTTL(&encoded, expiration, blob_file_number, offset,
                           size, kNoCompression);
  return encoded;
}
// Test helper: returns the string produced by BlobIndex::EncodeInlinedTTL
// for the given value and expiration.
static std::string BlobStrInlinedTTL(const Slice& value,
                                     uint64_t expiration) {
  std::string encoded;
  BlobIndex::EncodeInlinedTTL(&encoded, expiration, value);
  return encoded;
}
void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
assert(contents.size() > 0);
@ -110,6 +134,7 @@ class CompactionJobTest : public testing::Test {
InternalKey smallest_key, largest_key;
SequenceNumber smallest_seqno = kMaxSequenceNumber;
SequenceNumber largest_seqno = 0;
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
for (auto kv : contents) {
ParsedInternalKey key;
std::string skey;
@ -132,6 +157,24 @@ class CompactionJobTest : public testing::Test {
}
first_key = false;
if (key.type == kTypeBlobIndex) {
BlobIndex blob_index;
const Status s = blob_index.DecodeFrom(value);
if (!s.ok()) {
continue;
}
if (blob_index.IsInlined() || blob_index.HasTTL() ||
blob_index.file_number() == kInvalidBlobFileNumber) {
continue;
}
if (oldest_blob_file_number == kInvalidBlobFileNumber ||
oldest_blob_file_number > blob_index.file_number()) {
oldest_blob_file_number = blob_index.file_number();
}
}
}
uint64_t file_number = versions_->NewFileNumber();
@ -140,7 +183,7 @@ class CompactionJobTest : public testing::Test {
VersionEdit edit;
edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
smallest_seqno, largest_seqno, false);
smallest_seqno, largest_seqno, false, oldest_blob_file_number);
mutex_.Lock();
versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
@ -250,7 +293,8 @@ class CompactionJobTest : public testing::Test {
const stl_wrappers::KVMap& expected_results,
const std::vector<SequenceNumber>& snapshots = {},
SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
int output_level = 1, bool verify = true) {
int output_level = 1, bool verify = true,
uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) {
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
size_t num_input_files = 0;
@ -296,15 +340,20 @@ class CompactionJobTest : public testing::Test {
mutex_.Unlock();
if (verify) {
if (expected_results.size() == 0) {
ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
if (expected_results.empty()) {
ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
} else {
ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
mock_table_factory_->AssertLatestFile(expected_results);
auto output_files =
cfd->current()->storage_info()->LevelFiles(output_level);
ASSERT_EQ(output_files.size(), 1);
ASSERT_EQ(output_files[0]->oldest_blob_file_number,
expected_oldest_blob_file_number);
}
}
}
@ -960,6 +1009,54 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
RunCompaction({files}, expected_results);
}
// Compacts two L0 files that contain only blob index entries and checks that
// the single output file records the smallest blob file number among the
// references that are neither inlined nor TTL.
TEST_F(CompactionJobTest, OldestBlobFileNumber) {
  using KV = stl_wrappers::KVMap::value_type;

  constexpr uint64_t kExpiration = 1234567890ULL;
  constexpr uint64_t kExpectedOldestBlobFileNumber = 19;

  NewDB();

  // First input file. "a" is an inlined TTL value, so it does not take part
  // in identifying the oldest referenced blob file.
  const KV blob_a(KeyStr("a", 1U, kTypeBlobIndex),
                  BlobStrInlinedTTL("foo", kExpiration));
  const KV blob_b(KeyStr("b", 2U, kTypeBlobIndex), BlobStr(59, 123456, 999));
  const KV blob_c(KeyStr("c", 3U, kTypeBlobIndex),
                  BlobStr(138, 1000, 1 << 8));
  auto first_file = mock::MakeMockFile({blob_a, blob_b, blob_c});
  AddMockFile(first_file);

  // Second input file. "f" carries a TTL and hence refers to a TTL blob
  // file, so blob file 5 is ignored even though it has the lowest number.
  const KV blob_d(KeyStr("d", 4U, kTypeBlobIndex),
                  BlobStr(199, 3 << 10, 1 << 20));
  const KV blob_e(KeyStr("e", 5U, kTypeBlobIndex), BlobStr(19, 6789, 333));
  const KV blob_f(KeyStr("f", 6U, kTypeBlobIndex),
                  BlobStrTTL(5, 2048, 1 << 7, kExpiration));
  auto second_file = mock::MakeMockFile({blob_d, blob_e, blob_f});
  AddMockFile(second_file);

  // The expected output holds the same values under keys whose sequence
  // number is 0.
  auto expected_results = mock::MakeMockFile(
      {KV(KeyStr("a", 0U, kTypeBlobIndex), blob_a.second),
       KV(KeyStr("b", 0U, kTypeBlobIndex), blob_b.second),
       KV(KeyStr("c", 0U, kTypeBlobIndex), blob_c.second),
       KV(KeyStr("d", 0U, kTypeBlobIndex), blob_d.second),
       KV(KeyStr("e", 0U, kTypeBlobIndex), blob_e.second),
       KV(KeyStr("f", 0U, kTypeBlobIndex), blob_f.second)});

  SetLastSequence(6U);
  auto files = cfd_->current()->storage_info()->LevelFiles(0);
  RunCompaction({files}, expected_results, std::vector<SequenceNumber>(),
                kMaxSequenceNumber, /* output_level */ 1, /* verify */ true,
                kExpectedOldestBlobFileNumber);
}
} // namespace rocksdb
int main(int argc, char** argv) {

View File

@ -88,15 +88,13 @@ class CompactionPickerTest : public testing::Test {
SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
size_t compensated_file_size = 0) {
assert(level < vstorage_->num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, path_id, file_size);
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
f->fd.smallest_seqno = smallest_seq;
f->fd.largest_seqno = largest_seq;
FileMetaData* f = new FileMetaData(
file_number, path_id, file_size,
InternalKey(smallest, smallest_seq, kTypeValue),
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
f->refs = 0;
vstorage_->AddFile(level, f);
files_.emplace_back(f);
file_map_.insert({file_number, {f, level}});

View File

@ -1257,7 +1257,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction);
f->marked_for_compaction, f->oldest_blob_file_number);
}
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@ -2657,7 +2657,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
f->largest, f->fd.smallest_seqno,
f->fd.largest_seqno, f->marked_for_compaction);
f->fd.largest_seqno, f->marked_for_compaction,
f->oldest_blob_file_number);
ROCKS_LOG_BUFFER(
log_buffer,

View File

@ -128,7 +128,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction);
f->marked_for_compaction, f->oldest_blob_file_number);
}
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),

View File

@ -1210,7 +1210,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
meta.fd.GetFileSize(), meta.smallest, meta.largest,
meta.fd.smallest_seqno, meta.fd.largest_seqno,
meta.marked_for_compaction);
meta.marked_for_compaction, meta.oldest_blob_file_number);
}
InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);

View File

@ -70,8 +70,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
const std::vector<std::shared_ptr<EventListener>>& listeners,
const std::string& db_name, const std::string& cf_name,
const std::string& file_path, int job_id, const FileDescriptor& fd,
const TableProperties& table_properties, TableFileCreationReason reason,
const Status& s) {
uint64_t oldest_blob_file_number, const TableProperties& table_properties,
TableFileCreationReason reason, const Status& s) {
if (s.ok() && event_logger) {
JSONWriter jwriter;
AppendCurrentTime(&jwriter);
@ -129,6 +129,11 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
}
jwriter.EndObject();
}
if (oldest_blob_file_number != kInvalidBlobFileNumber) {
jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
}
jwriter.EndObject();
event_logger->Log(jwriter);

View File

@ -34,8 +34,8 @@ class EventHelpers {
const std::vector<std::shared_ptr<EventListener>>& listeners,
const std::string& db_name, const std::string& cf_name,
const std::string& file_path, int job_id, const FileDescriptor& fd,
const TableProperties& table_properties, TableFileCreationReason reason,
const Status& s);
uint64_t oldest_blob_file_number, const TableProperties& table_properties,
TableFileCreationReason reason, const Status& s);
static void LogAndNotifyTableFileDeletion(
EventLogger* event_logger, int job_id,
uint64_t file_number, const std::string& file_path,

View File

@ -243,10 +243,11 @@ Status ExternalSstFileIngestionJob::Run() {
if (!status.ok()) {
return status;
}
edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key,
f.largest_internal_key, f.assigned_seqno, f.assigned_seqno,
false);
false, kInvalidBlobFileNumber);
}
return status;
}

View File

@ -408,7 +408,7 @@ Status FlushJob::WriteLevel0Table() {
edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
meta_.marked_for_compaction);
meta_.marked_for_compaction, meta_.oldest_blob_file_number);
}
// Note that here we treat flush as level 0 compaction in internal stats

View File

@ -4,9 +4,11 @@
// (found in the LICENSE.Apache file in the root directory).
#include <algorithm>
#include <array>
#include <map>
#include <string>
#include "db/blob_index.h"
#include "db/column_family.h"
#include "db/db_impl/db_impl.h"
#include "db/flush_job.h"
@ -154,6 +156,7 @@ TEST_F(FlushJobTest, NonEmpty) {
// seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
// key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ]
// range-delete "9995" -> "9999" at seqno 10000
// blob references with seqnos 10001..10006
for (int i = 1; i < 10000; ++i) {
std::string key(ToString((i + 1000) % 10000));
std::string value("value" + key);
@ -163,9 +166,43 @@ TEST_F(FlushJobTest, NonEmpty) {
inserted_keys.insert({internal_key.Encode().ToString(), value});
}
}
new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a");
InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
inserted_keys.insert({internal_key.Encode().ToString(), "9999a"});
{
new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a");
InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
inserted_keys.insert({internal_key.Encode().ToString(), "9999a"});
}
#ifndef ROCKSDB_LITE
// Note: the first two blob references will not be considered when resolving
// the oldest blob file referenced (the first one is inlined TTL, while the
// second one is TTL and thus points to a TTL blob file).
constexpr std::array<uint64_t, 6> blob_file_numbers{
kInvalidBlobFileNumber, 5, 103, 17, 102, 101};
for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
std::string key(ToString(i + 10001));
std::string blob_index;
if (i == 0) {
BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
"foo");
} else if (i == 1) {
BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
blob_file_numbers[i], /* offset */ i << 10,
/* size */ i << 20, kNoCompression);
} else {
BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
/* offset */ i << 10, /* size */ i << 20,
kNoCompression);
}
const SequenceNumber seq(i + 10001);
new_mem->Add(seq, kTypeBlobIndex, key, blob_index);
InternalKey internal_key(key, seq, kTypeBlobIndex);
inserted_keys.emplace_hint(inserted_keys.end(),
internal_key.Encode().ToString(), blob_index);
}
#endif
autovector<MemTable*> to_delete;
cfd->imm()->Add(new_mem, &to_delete);
@ -194,11 +231,14 @@ TEST_F(FlushJobTest, NonEmpty) {
ASSERT_GT(hist.average, 0.0);
ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString());
ASSERT_EQ(
"9999a",
file_meta.largest.user_key().ToString()); // range tombstone end key
ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
ASSERT_EQ(1, file_meta.fd.smallest_seqno);
ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000
#ifndef ROCKSDB_LITE
ASSERT_EQ(10006, file_meta.fd.largest_seqno);
ASSERT_EQ(17, file_meta.oldest_blob_file_number);
#else
ASSERT_EQ(10000, file_meta.fd.largest_seqno);
#endif
mock_table_factory_->AssertSingleFile(inserted_keys);
job_context.Clean();
}
@ -261,6 +301,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
ASSERT_EQ(0, file_meta.fd.smallest_seqno);
ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
file_meta.fd.largest_seqno);
ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
for (auto m : to_delete) {
delete m;

View File

@ -136,10 +136,11 @@ Status ImportColumnFamilyJob::Run() {
for (size_t i = 0; i < files_to_import_.size(); ++i) {
const auto& f = files_to_import_[i];
const auto& file_metadata = metadata_[i];
edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
f.fd.GetFileSize(), f.smallest_internal_key,
f.largest_internal_key, file_metadata.smallest_seqno,
file_metadata.largest_seqno, false);
file_metadata.largest_seqno, false, kInvalidBlobFileNumber);
// If incoming sequence number is higher, update local sequence number.
if (file_metadata.largest_seqno > versions_->LastSequence()) {

View File

@ -34,6 +34,7 @@
// We scan every table to compute
// (1) smallest/largest for the table
// (2) largest sequence number in the table
// (3) oldest blob file referred to by the table (if applicable)
//
// If we are unable to scan the file, then we ignore the table.
//
@ -224,8 +225,6 @@ class Repairer {
FileMetaData meta;
uint32_t column_family_id;
std::string column_family_name;
SequenceNumber min_sequence;
SequenceNumber max_sequence;
};
std::string const dbname_;
@ -526,10 +525,7 @@ class Repairer {
TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
/*level=*/-1, /*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
@ -540,18 +536,9 @@ class Repairer {
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
t->min_sequence = parsed.sequence;
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence < t->min_sequence) {
t->min_sequence = parsed.sequence;
}
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
parsed.type);
}
if (!iter->status().ok()) {
status = iter->status();
@ -570,8 +557,8 @@ class Repairer {
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
if (max_sequence < tables_[i].meta.fd.largest_seqno) {
max_sequence = tables_[i].meta.fd.largest_seqno;
}
}
vset_.SetLastAllocatedSequence(max_sequence);
@ -591,8 +578,10 @@ class Repairer {
for (const auto* table : cf_id_and_tables.second) {
edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
table->meta.fd.GetFileSize(), table->meta.smallest,
table->meta.largest, table->min_sequence,
table->max_sequence, table->meta.marked_for_compaction);
table->meta.largest, table->meta.fd.smallest_seqno,
table->meta.fd.largest_seqno,
table->meta.marked_for_compaction,
table->meta.oldest_blob_file_number);
}
assert(next_file_number_ > 0);
vset_.MarkFileNumberUsed(next_file_number_ - 1);

View File

@ -59,14 +59,11 @@ class VersionBuilderTest : public testing::Test {
bool sampled = false, SequenceNumber smallest_seqno = 0,
SequenceNumber largest_seqno = 0) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, path_id, file_size);
f->smallest = GetInternalKey(smallest, smallest_seq);
f->largest = GetInternalKey(largest, largest_seq);
f->fd.smallest_seqno = smallest_seqno;
f->fd.largest_seqno = largest_seqno;
FileMetaData* f = new FileMetaData(
file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
/* marked_for_compact */ false, kInvalidBlobFileNumber);
f->compensated_file_size = file_size;
f->refs = 0;
f->num_entries = num_entries;
f->num_deletions = num_deletions;
vstorage_.AddFile(level, f);
@ -115,7 +112,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false);
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.DeleteFile(3, 27U);
EnvOptions env_options;
@ -149,7 +147,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
VersionEdit version_edit;
version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false);
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
@ -186,7 +185,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
VersionEdit version_edit;
version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false);
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.DeleteFile(0, 1U);
version_edit.DeleteFile(0, 88U);
version_edit.DeleteFile(4, 6U);
@ -214,15 +214,20 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false);
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false);
GetInternalKey("450"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false);
GetInternalKey("650"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false);
GetInternalKey("550"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false);
GetInternalKey("750"), 200, 200, false,
kInvalidBlobFileNumber);
EnvOptions env_options;
@ -248,24 +253,31 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
VersionEdit version_edit;
version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
GetInternalKey("350"), 200, 200, false);
GetInternalKey("350"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
GetInternalKey("450"), 200, 200, false);
GetInternalKey("450"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
GetInternalKey("650"), 200, 200, false);
GetInternalKey("650"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
GetInternalKey("550"), 200, 200, false);
GetInternalKey("550"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
GetInternalKey("750"), 200, 200, false);
GetInternalKey("750"), 200, 200, false,
kInvalidBlobFileNumber);
version_builder.Apply(&version_edit);
VersionEdit version_edit2;
version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
GetInternalKey("950"), 200, 200, false);
GetInternalKey("950"), 200, 200, false,
kInvalidBlobFileNumber);
version_edit2.DeleteFile(2, 616);
version_edit2.DeleteFile(2, 636);
version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
GetInternalKey("850"), 200, 200, false);
GetInternalKey("850"), 200, 200, false,
kInvalidBlobFileNumber);
version_builder.Apply(&version_edit2);
version_builder.SaveTo(&new_vstorage);

View File

@ -9,6 +9,7 @@
#include "db/version_edit.h"
#include "db/blob_index.h"
#include "db/version_set.h"
#include "logging/event_logger.h"
#include "rocksdb/slice.h"
@ -59,6 +60,7 @@ enum CustomTag : uint32_t {
// kMinLogNumberToKeep as part of a CustomTag as a hack. This should be
// removed when manifest becomes forward-compatible.
kMinLogNumberToKeepHack = 3,
kOldestBlobFileNumber = 4,
kPathId = 65,
};
// If this bit for the custom tag is set, opening DB should fail if
@ -70,6 +72,49 @@ uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
return number | (path_id * (kFileNumberMask + 1));
}
// Folds one table entry into this file's metadata: the key range, the
// sequence number range and, unless built with ROCKSDB_LITE, the oldest blob
// file number referenced by a blob index value.
//
// key        - encoded internal key of the entry
// value      - value of the entry; only inspected for kTypeBlobIndex entries
// seqno      - sequence number of the entry
// value_type - value type of the entry
void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
                                    SequenceNumber seqno,
                                    ValueType value_type) {
  // The first key seen becomes the smallest; every key replaces the largest.
  if (smallest.size() == 0) {
    smallest.DecodeFrom(key);
  }
  largest.DecodeFrom(key);

  if (seqno < fd.smallest_seqno) {
    fd.smallest_seqno = seqno;
  }
  if (fd.largest_seqno < seqno) {
    fd.largest_seqno = seqno;
  }

#ifndef ROCKSDB_LITE
  if (value_type != kTypeBlobIndex) {
    return;
  }

  BlobIndex blob_index;
  // A value that does not decode as a blob index is skipped.
  if (!blob_index.DecodeFrom(value).ok()) {
    return;
  }

  // Inlined and TTL blob indexes are not tracked. The inlined check stays
  // ahead of the file_number() call below, as in the original sequence.
  if (blob_index.IsInlined() || blob_index.HasTTL()) {
    return;
  }

  const auto referenced_file = blob_index.file_number();

  // Paranoid check: this should not happen because BlobDB numbers the blob
  // files starting from 1.
  if (referenced_file == kInvalidBlobFileNumber) {
    return;
  }

  const bool nothing_recorded_yet =
      oldest_blob_file_number == kInvalidBlobFileNumber;
  if (nothing_recorded_yet || referenced_file < oldest_blob_file_number) {
    oldest_blob_file_number = referenced_file;
  }
#else
  (void)value;
  (void)value_type;
#endif
}
void VersionEdit::Clear() {
db_id_.clear();
comparator_.clear();
@ -134,7 +179,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
return false;
}
bool has_customized_fields = false;
if (f.marked_for_compaction || has_min_log_number_to_keep_) {
if (f.marked_for_compaction || has_min_log_number_to_keep_ ||
f.oldest_blob_file_number != kInvalidBlobFileNumber) {
PutVarint32(dst, kNewFile4);
has_customized_fields = true;
} else if (f.fd.GetPathId() == 0) {
@ -197,6 +243,12 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
PutLengthPrefixedSlice(dst, Slice(varint_log_number));
min_log_num_written = true;
}
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
PutVarint32(dst, CustomTag::kOldestBlobFileNumber);
std::string oldest_blob_file_number;
PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
}
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
dst);
@ -302,6 +354,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
}
has_min_log_number_to_keep_ = true;
break;
case kOldestBlobFileNumber:
if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
return "invalid oldest blob file number";
}
break;
default:
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
// Should not proceed if cannot understand it
@ -602,6 +659,10 @@ std::string VersionEdit::DebugString(bool hex_key) const {
r.append(f.smallest.DebugString(hex_key));
r.append(" .. ");
r.append(f.largest.DebugString(hex_key));
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
r.append(" blob_file:");
AppendNumberTo(&r, f.oldest_blob_file_number);
}
}
r.append("\n ColumnFamily: ");
AppendNumberTo(&r, column_family_);
@ -676,6 +737,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
jw << "FileSize" << f.fd.GetFileSize();
jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
jw << "LargestIKey" << f.largest.DebugString(hex_key);
if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
jw << "OldestBlobFile" << f.oldest_blob_file_number;
}
jw.EndArrayedObject();
}

View File

@ -22,7 +22,8 @@ namespace rocksdb {
class VersionSet;
const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
constexpr uint64_t kInvalidBlobFileNumber = 0;
extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
@ -91,7 +92,7 @@ struct FileMetaData {
InternalKey largest; // Largest internal key served by table
// Needs to be disposed when refs becomes 0.
Cache::Handle* table_reader_handle;
Cache::Handle* table_reader_handle = nullptr;
FileSampledStats stats;
@ -100,45 +101,44 @@ struct FileMetaData {
// File size compensated by deletion entry.
// This is updated in Version::UpdateAccumulatedStats() first time when the
// file is created or loaded. After it is updated (!= 0), it is immutable.
uint64_t compensated_file_size;
uint64_t compensated_file_size = 0;
// These values can mutate, but they can only be read or written from
// single-threaded LogAndApply thread
uint64_t num_entries; // the number of entries.
uint64_t num_deletions; // the number of deletion entries.
uint64_t raw_key_size; // total uncompressed key size.
uint64_t raw_value_size; // total uncompressed value size.
uint64_t num_entries = 0; // the number of entries.
uint64_t num_deletions = 0; // the number of deletion entries.
uint64_t raw_key_size = 0; // total uncompressed key size.
uint64_t raw_value_size = 0; // total uncompressed value size.
int refs; // Reference count
int refs = 0; // Reference count
bool being_compacted; // Is this file undergoing compaction?
bool init_stats_from_file; // true if the data-entry stats of this file
// has initialized from file.
bool being_compacted = false; // Is this file undergoing compaction?
bool init_stats_from_file = false; // true if the data-entry stats of this
// file has initialized from file.
bool marked_for_compaction; // True if client asked us nicely to compact this
// file.
bool marked_for_compaction = false; // True if client asked us nicely to
// compact this file.
FileMetaData()
: table_reader_handle(nullptr),
compensated_file_size(0),
num_entries(0),
num_deletions(0),
raw_key_size(0),
raw_value_size(0),
refs(0),
being_compacted(false),
init_stats_from_file(false),
marked_for_compaction(false) {}
// Used only in BlobDB. The file number of the oldest blob file this SST file
// refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
// Compiler-generated default constructor; members that carry an in-class
// initializer start out with that value.
FileMetaData() = default;
// Builds the metadata for a table file in one step. The file descriptor is
// filled from the file number, path id, size and sequence number range; the
// key range, the compaction mark and the oldest referenced blob file number
// are stored as given.
FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
             const InternalKey& smallest_key, const InternalKey& largest_key,
             const SequenceNumber& smallest_seq,
             const SequenceNumber& largest_seq, bool marked_for_compact,
             uint64_t oldest_blob_file)
    : fd{file, file_path_id, file_size, smallest_seq, largest_seq},
      smallest{smallest_key},
      largest{largest_key},
      marked_for_compaction{marked_for_compact},
      oldest_blob_file_number{oldest_blob_file} {}
// Extends the key range and the sequence number range of this file so that
// they cover the given entry.
// REQUIRED: Keys must be passed in sorted order; the most recently passed key
// is always recorded as the largest one.
void UpdateBoundaries(const Slice& key, SequenceNumber seqno) {
  // Only the very first key can become the smallest boundary.
  const bool smallest_unset = (smallest.size() == 0);
  if (smallest_unset) {
    smallest.DecodeFrom(key);
  }
  largest.DecodeFrom(key);

  // Widen the sequence number range in whichever direction is needed.
  if (seqno < fd.smallest_seqno) {
    fd.smallest_seqno = seqno;
  }
  if (fd.largest_seqno < seqno) {
    fd.largest_seqno = seqno;
  }
}
// Overload that additionally receives the entry's value and value type. Only
// the declaration appears here; the body is defined elsewhere.
// NOTE(review): presumably the value is inspected for blob references so the
// oldest referenced blob file can be tracked -- confirm against the
// definition.
void UpdateBoundaries(const Slice& key, const Slice& value,
SequenceNumber seqno, ValueType value_type);
// Unlike UpdateBoundaries, ranges do not need to be presented in any
// particular order.
@ -249,21 +249,18 @@ class VersionEdit {
// Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file
// REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
// referred to by this file if any, kInvalidBlobFileNumber otherwise.
// REQUIRES: smallest_seqno <= largest_seqno (enforced with an assert).
// The resulting file metadata is appended to new_files_ paired with "level".
void AddFile(int level, uint64_t file, uint32_t file_path_id,
uint64_t file_size, const InternalKey& smallest,
const InternalKey& largest, const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno,
bool marked_for_compaction) {
const SequenceNumber& largest_seqno, bool marked_for_compaction,
uint64_t oldest_blob_file_number) {
assert(smallest_seqno <= largest_seqno);
FileMetaData f;
f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno,
largest_seqno);
f.smallest = smallest;
f.largest = largest;
f.fd.smallest_seqno = smallest_seqno;
f.fd.largest_seqno = largest_seqno;
f.marked_for_compaction = marked_for_compaction;
new_files_.emplace_back(level, std::move(f));
new_files_.emplace_back(
level, FileMetaData(file, file_path_id, file_size, smallest, largest,
smallest_seqno, largest_seqno,
marked_for_compaction, oldest_blob_file_number));
}
void AddFile(int level, const FileMetaData& f) {

View File

@ -36,7 +36,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
InternalKey("foo", kBig + 500 + i, kTypeValue),
InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
kBig + 500 + i, kBig + 600 + i, false);
kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber);
edit.DeleteFile(4, kBig + 700 + i);
}
@ -53,13 +53,16 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
kBig + 600, true, kInvalidBlobFileNumber);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
kBig + 601, false, kInvalidBlobFileNumber);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true);
kBig + 602, true, kInvalidBlobFileNumber);
edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
kBig + 603, true, 1001);
edit.DeleteFile(4, 700);
@ -78,9 +81,18 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
ASSERT_TRUE(new_files[0].second.marked_for_compaction);
ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
ASSERT_TRUE(new_files[2].second.marked_for_compaction);
ASSERT_TRUE(new_files[3].second.marked_for_compaction);
ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
ASSERT_EQ(kInvalidBlobFileNumber,
new_files[0].second.oldest_blob_file_number);
ASSERT_EQ(kInvalidBlobFileNumber,
new_files[1].second.oldest_blob_file_number);
ASSERT_EQ(kInvalidBlobFileNumber,
new_files[2].second.oldest_blob_file_number);
ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
}
TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
@ -88,10 +100,10 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
kBig + 600, true, kInvalidBlobFileNumber);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
kBig + 601, false, kInvalidBlobFileNumber);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
@ -137,7 +149,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
kBig + 600, true, kInvalidBlobFileNumber);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
@ -164,7 +176,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) {
// An edit that adds a file with default-constructed boundary keys is expected
// to make EncodeTo() report failure.
TEST_F(VersionEditTest, EncodeEmptyFile) {
VersionEdit edit;
edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false);
edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
kInvalidBlobFileNumber);
std::string buffer;
ASSERT_TRUE(!edit.EncodeTo(&buffer));
}

View File

@ -3379,6 +3379,10 @@ std::string Version::DebugString(bool hex, bool print_stats) const {
r.append(" .. ");
r.append(files[i]->largest.DebugString(hex));
r.append("]");
if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
r.append(" blob_file:");
AppendNumberTo(&r, files[i]->oldest_blob_file_number);
}
if (print_stats) {
r.append("(");
r.append(ToString(
@ -4923,7 +4927,7 @@ Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) {
edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
f->fd.GetFileSize(), f->smallest, f->largest,
f->fd.smallest_seqno, f->fd.largest_seqno,
f->marked_for_compaction);
f->marked_for_compaction, f->oldest_blob_file_number);
}
}
edit.SetLogNumber(cfd->GetLogNumber());

View File

@ -35,10 +35,11 @@ class GenerateLevelFilesBriefTest : public testing::Test {
// Test helper: creates a heap-allocated FileMetaData and appends it to files_.
// The file number is files_.size() + 1 at the time of the call, and the
// boundary keys are kTypeValue internal keys built from the given strings and
// sequence numbers (both default to 100).
void Add(const char* smallest, const char* largest,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(files_.size() + 1, 0, 0);
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
FileMetaData* f = new FileMetaData(
files_.size() + 1, 0, 0,
InternalKey(smallest, smallest_seq, kTypeValue),
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber);
files_.push_back(f);
}
@ -129,28 +130,22 @@ class VersionStorageInfoTest : public testing::Test {
// Test helper: creates a heap-allocated FileMetaData for `file_number` and
// hands it to vstorage_ at `level` (which must be below
// vstorage_.num_levels()). The boundary keys are produced from the given
// strings by GetInternalKey(..., 0), and compensated_file_size is set to
// file_size.
void Add(int level, uint32_t file_number, const char* smallest,
const char* largest, uint64_t file_size = 0) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, 0, file_size);
f->smallest = GetInternalKey(smallest, 0);
f->largest = GetInternalKey(largest, 0);
FileMetaData* f = new FileMetaData(
file_number, 0, file_size, GetInternalKey(smallest, 0),
GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0,
/* marked_for_compact */ false, kInvalidBlobFileNumber);
f->compensated_file_size = file_size;
f->refs = 0;
f->num_entries = 0;
f->num_deletions = 0;
vstorage_.AddFile(level, f);
}
// Test helper: same as the string-based Add, except that the boundary keys
// are supplied as ready-made InternalKey objects. The new FileMetaData is
// handed to vstorage_ at `level` (which must be below
// vstorage_.num_levels()), with compensated_file_size set to file_size.
void Add(int level, uint32_t file_number, const InternalKey& smallest,
const InternalKey& largest, uint64_t file_size = 0) {
assert(level < vstorage_.num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, 0, file_size);
f->smallest = smallest;
f->largest = largest;
FileMetaData* f = new FileMetaData(
file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
/* largest_seq */ 0, /* marked_for_compact */ false,
kInvalidBlobFileNumber);
f->compensated_file_size = file_size;
f->refs = 0;
f->num_entries = 0;
f->num_deletions = 0;
vstorage_.AddFile(level, f);
}