Optimize & clean up footer code (#9280)

Summary:
Again, ahead of planned changes in https://github.com/facebook/rocksdb/issues/9058. This change improves
performance (vs. pre-https://github.com/facebook/rocksdb/issues/9240 baseline) by separating a FooterBuilder from
Footer, where FooterBuilder includes (inline owns) the serialized data
so that it can be stack allocated.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9280

Test Plan:
existing tests + performance testing below

Extreme case performance testing as in https://github.com/facebook/rocksdb/issues/9240 with

    TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000

(Each is ops/s averaged over 50 runs, run simultaneously with competing
configuration for load fairness)
Pre-https://github.com/facebook/rocksdb/issues/9240 baseline (f577458): 436389
With https://github.com/facebook/rocksdb/issues/9240 (653c392): 417946 (-4.2% vs. baseline)
This change: 443762 (+1.7% vs. baseline)

Reviewed By: ajkr

Differential Revision: D33077220

Pulled By: pdillinger

fbshipit-source-id: 7eaa6499589aac1693414a758e8c799216c5016c
This commit is contained in:
Peter Dillinger 2021-12-13 17:42:05 -08:00 committed by Facebook GitHub Bot
parent 08721293ea
commit e92a0ed040
6 changed files with 166 additions and 179 deletions

View File

@ -1788,31 +1788,18 @@ void BlockBasedTableBuilder::WriteRangeDelBlock(
void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
BlockHandle& index_block_handle) {
Rep* r = rep_;
// No need to write out new footer if we're using default checksum.
// We're writing legacy magic number because we want old versions of RocksDB
// to be able to read files generated with new release (just in case
// somebody wants to roll back after an upgrade)
// TODO(icanadi) at some point in the future, when we're absolutely sure
// nobody will roll back to RocksDB 2.x versions, retire the legacy magic
// number and always write new table files with new magic number
bool legacy = (r->table_options.format_version == 0);
// this is guaranteed by BlockBasedTableBuilder's constructor
assert(r->table_options.checksum == kCRC32c ||
r->table_options.format_version != 0);
Footer footer;
footer
.set_table_magic_number(legacy ? kLegacyBlockBasedTableMagicNumber
: kBlockBasedTableMagicNumber)
.set_format_version(r->table_options.format_version)
.set_metaindex_handle(metaindex_block_handle)
.set_index_handle(index_block_handle)
.set_checksum_type(r->table_options.checksum);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding, r->get_offset());
assert(ok());
IOStatus ios = r->file->Append(footer_encoding);
FooterBuilder footer;
footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version,
r->get_offset(), r->table_options.checksum,
metaindex_block_handle, index_block_handle);
IOStatus ios = r->file->Append(footer.GetSlice());
if (ios.ok()) {
r->set_offset(r->get_offset() + footer_encoding.size());
r->set_offset(r->get_offset() + footer.GetSlice().size());
} else {
r->SetIOStatus(ios);
r->SetStatus(ios);

View File

@ -393,15 +393,10 @@ Status CuckooTableBuilder::Finish() {
return status_;
}
Footer footer;
footer.set_table_magic_number(kCuckooTableMagicNumber)
.set_format_version(1)
.set_metaindex_handle(meta_index_block_handle)
.set_index_handle(BlockHandle::NullBlockHandle())
.set_checksum_type(kNoChecksum);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding, offset);
io_status_ = file_->Append(footer_encoding);
FooterBuilder footer;
footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset,
kNoChecksum, meta_index_block_handle);
io_status_ = file_->Append(footer.GetSlice());
status_ = io_status_;
return status_;
}

View File

@ -173,21 +173,25 @@ inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
return kPlainTableMagicNumber;
}
assert(false);
return 0;
return magic_number;
}
} // namespace
Footer& Footer::set_table_magic_number(uint64_t magic_number) {
assert(table_magic_number_ == kNullTableMagicNumber);
table_magic_number_ = magic_number;
// Returns the legacy magic number that corresponds to `magic_number` for the
// two table kinds that have one (block-based and plain). Passing any other
// value is a programming error: it trips the assert in debug builds and is
// returned unchanged when asserts are compiled out.
inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
  const bool is_block_based = (magic_number == kBlockBasedTableMagicNumber);
  const bool is_plain = (magic_number == kPlainTableMagicNumber);
  // Only these two magic numbers have a legacy equivalent.
  assert(is_block_based || is_plain);
  if (is_block_based) {
    return kLegacyBlockBasedTableMagicNumber;
  }
  return is_plain ? kLegacyPlainTableMagicNumber : magic_number;
}
inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
if (magic_number == kBlockBasedTableMagicNumber ||
magic_number == kLegacyBlockBasedTableMagicNumber) {
block_trailer_size_ =
static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
} else {
block_trailer_size_ = 0;
return 0;
}
return *this;
}
// Footer format, in three parts:
@ -206,60 +210,69 @@ Footer& Footer::set_table_magic_number(uint64_t magic_number) {
// -> format_version >= 1 (inferred from NOT legacy magic number)
// format_version (uint32LE, 4 bytes), also called "footer version"
// newer magic number (8 bytes)
void Footer::EncodeTo(std::string* dst, uint64_t footer_offset) const {
constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength;
} // namespace
void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
uint64_t footer_offset, ChecksumType checksum_type,
const BlockHandle& metaindex_handle,
const BlockHandle& index_handle) {
(void)footer_offset; // Future use
// Sanitize magic numbers & format versions
assert(table_magic_number_ != kNullTableMagicNumber);
uint64_t magic = table_magic_number_;
uint32_t fv = format_version_;
assert(fv != kInvalidFormatVersion);
assert(IsLegacyFooterFormat(magic) == (fv == 0));
assert(magic_number != Footer::kNullTableMagicNumber);
assert(IsSupportedFormatVersion(format_version));
ChecksumType ct = checksum_type();
// Allocate destination data and generate parts 1 and 3
const size_t original_size = dst->size();
char* part2;
if (fv > 0) {
dst->resize(original_size + kNewVersionsEncodedLength);
char* part1 = &(*dst)[original_size];
part2 = part1 + 1;
char* part3 = part2 + 2 * BlockHandle::kMaxEncodedLength;
assert(&(*dst)[dst->size() - 1] + 1 - part3 == /* part 3 size */ 12);
char* part3;
if (format_version > 0) {
slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength);
// Generate parts 1 and 3
part1[0] = ct;
EncodeFixed32(part3, fv);
EncodeFixed64(part3 + 4, magic);
char* cur = data_.data();
// Part 1
*(cur++) = checksum_type;
// Part 2
part2 = cur;
// Skip over part 2 for now
cur += kFooterPart2Size;
// Part 3
part3 = cur;
EncodeFixed32(cur, format_version);
cur += 4;
EncodeFixed64(cur, magic_number);
assert(cur + 8 == slice_.data() + slice_.size());
} else {
dst->resize(original_size + kVersion0EncodedLength);
part2 = &(*dst)[original_size];
char* part3 = part2 + 2 * BlockHandle::kMaxEncodedLength;
assert(&(*dst)[dst->size() - 1] + 1 - part3 == /* part 3 size */ 8);
slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
// Legacy SST files use kCRC32c checksum but it's not stored in footer.
assert(ct == kNoChecksum || ct == kCRC32c);
// Generate part 3 (part 1 empty)
EncodeFixed64(part3, magic);
assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
// Generate part 3 (part 1 empty, skip part 2 for now)
part2 = data_.data();
part3 = part2 + kFooterPart2Size;
char* cur = part3;
// Use legacy magic numbers to indicate format_version=0, for
// compatibility. No other cases should use format_version=0.
EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number));
assert(cur + 8 == slice_.data() + slice_.size());
}
// Generate Part2
// Variable size encode handles (sigh)
part2 = metaindex_handle_.EncodeTo(part2);
/*part2 = */ index_handle_.EncodeTo(part2);
// remainder of part2 is already zero padded
{
char* cur = part2;
cur = metaindex_handle.EncodeTo(cur);
cur = index_handle.EncodeTo(cur);
// Zero pad remainder
std::fill(cur, part3, char{0});
}
}
Status Footer::DecodeFrom(Slice* input, uint64_t input_offset) {
Status Footer::DecodeFrom(Slice input, uint64_t input_offset) {
(void)input_offset; // Future use
// Only decode to unused Footer
assert(table_magic_number_ == kNullTableMagicNumber);
assert(input != nullptr);
assert(input->size() >= kMinEncodedLength);
assert(input.size() >= kMinEncodedLength);
const char* magic_ptr =
input->data() + input->size() - kMagicNumberLengthByte;
const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
uint64_t magic = DecodeFixed64(magic_ptr);
// We check for legacy formats here and silently upconvert them
@ -267,13 +280,14 @@ Status Footer::DecodeFrom(Slice* input, uint64_t input_offset) {
if (legacy) {
magic = UpconvertLegacyFooterFormat(magic);
}
set_table_magic_number(magic);
table_magic_number_ = magic;
block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
// Parse Part3
if (legacy) {
// The size is already asserted to be at least kMinEncodedLength
// at the beginning of the function
input->remove_prefix(input->size() - kVersion0EncodedLength);
input.remove_prefix(input.size() - kVersion0EncodedLength);
format_version_ = 0 /* legacy */;
checksum_type_ = kCRC32c;
} else {
@ -284,14 +298,14 @@ Status Footer::DecodeFrom(Slice* input, uint64_t input_offset) {
ROCKSDB_NAMESPACE::ToString(format_version_));
}
// All known format versions >= 1 occupy exactly this many bytes.
if (input->size() < kNewVersionsEncodedLength) {
if (input.size() < kNewVersionsEncodedLength) {
return Status::Corruption("Input is too short to be an SST file");
}
uint64_t adjustment = input->size() - kNewVersionsEncodedLength;
input->remove_prefix(adjustment);
uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
input.remove_prefix(adjustment);
// Parse Part1
char chksum = input->data()[0];
char chksum = input.data()[0];
checksum_type_ = lossless_cast<ChecksumType>(chksum);
if (!IsSupportedChecksumType(checksum_type())) {
return Status::Corruption(
@ -299,21 +313,16 @@ Status Footer::DecodeFrom(Slice* input, uint64_t input_offset) {
ROCKSDB_NAMESPACE::ToString(lossless_cast<uint8_t>(chksum)));
}
// Consume checksum type field
input->remove_prefix(1);
input.remove_prefix(1);
}
// Parse Part2
Status result = metaindex_handle_.DecodeFrom(input);
Status result = metaindex_handle_.DecodeFrom(&input);
if (result.ok()) {
result = index_handle_.DecodeFrom(input);
result = index_handle_.DecodeFrom(&input);
}
if (!result.ok()) {
return result;
}
// Mark all input consumed (skip padding & part3)
*input = Slice(input->data() + input->size(), 0U);
return Status::OK();
return result;
// Padding in part2 is ignored
}
std::string Footer::ToString() const {
@ -384,7 +393,7 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
file->file_name());
}
s = footer->DecodeFrom(&footer_input, read_offset);
s = footer->DecodeFrom(footer_input, read_offset);
if (!s.ok()) {
return s;
}

View File

@ -9,6 +9,7 @@
#pragma once
#include <array>
#include <cstdint>
#include <string>
@ -128,66 +129,45 @@ inline bool IsSupportedFormatVersion(uint32_t version) {
// elsewhere under the metaindex block. For example, checksum_type is
// required for verifying metaindex block checksum (when applicable), but
// index block handle can easily go in metaindex block (possible future).
// See also FooterBuilder below.
class Footer {
public:
// Create empty. Populate using DecodeFrom.
Footer() {}
// Uses builder pattern rather than distinctive ctors
// Deserialize a footer (populate fields) from `input` and check for various
// corruptions. `input_offset` is the offset within the target file of
// `input` buffer (future use).
Status DecodeFrom(Slice input, uint64_t input_offset);
// Table magic number identifies file as RocksDB SST file and which kind of
// SST format is in use.
Footer& set_table_magic_number(uint64_t tmn);
uint64_t table_magic_number() const { return table_magic_number_; }
// A version (footer and more) within a kind of SST. (It would add more
// unnecessary complexity to separate footer versions and
// BBTO::format_version.)
Footer& set_format_version(uint32_t fv) {
format_version_ = fv;
return *this;
}
uint32_t format_version() const { return format_version_; }
// Block handle for metaindex block.
Footer& set_metaindex_handle(const BlockHandle& h) {
metaindex_handle_ = h;
return *this;
}
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
// Block handle for (top-level) index block.
Footer& set_index_handle(const BlockHandle& h) {
index_handle_ = h;
return *this;
}
const BlockHandle& index_handle() const { return index_handle_; }
// Checksum type used in the file.
Footer& set_checksum_type(ChecksumType ct) {
checksum_type_ = ct;
return *this;
}
ChecksumType checksum_type() const {
return static_cast<ChecksumType>(checksum_type_);
}
// Appends serialized footer to `dst`. The starting offset of the footer
// within the file is required for future work.
void EncodeTo(std::string* dst, uint64_t footer_offset) const;
// Deserialize a footer (populate fields) from `input` and check for various
// corruptions. On success (and some error cases) `input` is advanced past
// the footer. Like EncodeTo, the offset within the file will be needed for
// future work
Status DecodeFrom(Slice* input, uint64_t input_offset);
// Block trailer size used by file with this footer (e.g. 5 for block-based
// table and 0 for plain table). This is inferred from magic number so
// not in the serialized form.
inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
// Convert this object to a human readable form
std::string ToString() const;
// Block trailer size used by file with this footer (e.g. 5 for block-based
// table and 0 for plain table)
inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
// Encoded lengths of Footers. Bytes for serialized Footer will always be
// >= kMinEncodedLength and <= kMaxEncodedLength.
//
@ -207,8 +187,9 @@ class Footer {
static constexpr uint64_t kNullTableMagicNumber = 0;
private:
static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
private:
static constexpr int kInvalidChecksumType =
(1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
@ -217,7 +198,40 @@ class Footer {
BlockHandle metaindex_handle_;
BlockHandle index_handle_;
int checksum_type_ = kInvalidChecksumType;
uint8_t block_trailer_size_ = 0; // set based on magic number
uint8_t block_trailer_size_ = 0;
};
// Builder for Footer. Serializes a footer into a fixed-size buffer held
// inline in this object, so a FooterBuilder can be used as a plain local
// variable and the serialized bytes are read back through GetSlice().
class FooterBuilder {
 public:
  FooterBuilder() = default;

  // Not copyable: after Build, slice_ points into this object's own data_.
  // A memberwise copy would leave the copy's slice_ referring to the source
  // object's buffer, which dangles once the source is destroyed.
  FooterBuilder(const FooterBuilder&) = delete;
  FooterBuilder& operator=(const FooterBuilder&) = delete;

  // Run builder on inputs. This is a single step with lots of parameters for
  // efficiency (based on perf testing).
  // * table_magic_number identifies file as RocksDB SST file and which kind of
  // SST format is in use.
  // * format_version is a version for the footer and can also apply to other
  // aspects of the SST file (see BlockBasedTableOptions::format_version).
  // NOTE: To save complexity in the caller, when format_version == 0 and
  // there is a corresponding legacy magic number to the one specified, the
  // legacy magic number will be written for forward compatibility.
  // * footer_offset is the file offset where the footer will be written
  // (for future use).
  // * checksum_type is for formats using block checksums.
  // * metaindex_handle is the block handle for the metaindex block.
  // * index_handle is optional for some kinds of SST files.
  void Build(uint64_t table_magic_number, uint32_t format_version,
             uint64_t footer_offset, ChecksumType checksum_type,
             const BlockHandle& metaindex_handle,
             const BlockHandle& index_handle = BlockHandle::NullBlockHandle());

  // After Build, get a Slice for the serialized Footer, backed by this
  // FooterBuilder. The returned Slice is only valid while this object is
  // alive. Asserts that the slice is non-empty (presumably meaning Build has
  // already been called -- assumes a default Slice is empty).
  const Slice& GetSlice() const {
    assert(slice_.size());
    return slice_;
  }

 private:
  // View over the serialized footer bytes stored in data_.
  Slice slice_;
  // Inline storage large enough for any encoded footer.
  std::array<char, Footer::kMaxEncodedLength> data_;
};
// Read the footer from file

View File

@ -292,16 +292,12 @@ Status PlainTableBuilder::Finish() {
// Write Footer
// no need to write out new footer if we're using default checksum
Footer footer;
footer.set_table_magic_number(kLegacyPlainTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(metaindex_block_handle)
.set_index_handle(BlockHandle::NullBlockHandle());
std::string footer_encoding;
footer.EncodeTo(&footer_encoding, offset_);
io_status_ = file_->Append(footer_encoding);
FooterBuilder footer;
footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_,
kNoChecksum, metaindex_block_handle);
io_status_ = file_->Append(footer.GetSlice());
if (io_status_.ok()) {
offset_ += footer_encoding.size();
offset_ += footer.GetSlice().size();
}
status_ = io_status_;
return status_;

View File

@ -4175,17 +4175,12 @@ TEST(TableTest, FooterTests) {
BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
{
// upconvert legacy block based
std::string encoded;
Footer footer;
footer.set_table_magic_number(kLegacyBlockBasedTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(meta_index)
.set_index_handle(index);
footer.EncodeTo(&encoded, footer_offset);
// legacy block based
FooterBuilder footer;
footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
footer_offset, kCRC32c, meta_index, index);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
@ -4194,21 +4189,19 @@ TEST(TableTest, FooterTests) {
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.format_version(), 0U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
// Ensure serialized with legacy magic
ASSERT_EQ(
DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
kLegacyBlockBasedTableMagicNumber);
}
// block based, various checksums, various versions
for (auto t : GetSupportedChecksums()) {
for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
std::string encoded;
Footer footer;
footer.set_table_magic_number(kBlockBasedTableMagicNumber)
.set_format_version(fv)
.set_metaindex_handle(meta_index)
.set_index_handle(index)
.set_checksum_type(t);
footer.EncodeTo(&encoded, footer_offset);
FooterBuilder footer;
footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t,
meta_index, index);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(),
kBlockBasedTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum_type(), t);
@ -4224,45 +4217,38 @@ TEST(TableTest, FooterTests) {
// Plain table is not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
{
// upconvert legacy plain table
std::string encoded;
Footer footer;
footer.set_table_magic_number(kLegacyPlainTableMagicNumber)
.set_format_version(0)
.set_metaindex_handle(meta_index)
.set_index_handle(index);
footer.EncodeTo(&encoded, footer_offset);
// legacy plain table
FooterBuilder footer;
footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset,
kNoChecksum, meta_index);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
ASSERT_EQ(decoded_footer.format_version(), 0U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
// Ensure serialized with legacy magic
ASSERT_EQ(
DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
kLegacyPlainTableMagicNumber);
}
{
// xxhash plain table (not currently used)
std::string encoded;
Footer footer;
footer.set_table_magic_number(kPlainTableMagicNumber)
.set_format_version(1)
.set_metaindex_handle(meta_index)
.set_index_handle(index)
.set_checksum_type(kxxHash);
footer.EncodeTo(&encoded, footer_offset);
FooterBuilder footer;
footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset,
kxxHash, meta_index);
Footer decoded_footer;
Slice encoded_slice(encoded);
ASSERT_OK(decoded_footer.DecodeFrom(&encoded_slice, footer_offset));
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
ASSERT_EQ(decoded_footer.format_version(), 1U);
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
}