From 96b8240bc5794eecb94092435b80003013e06396 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Tue, 13 Jan 2015 14:33:04 -0800 Subject: [PATCH] Support footer versions bigger than 1 Summary: In this diff I add another parameter to BlockBasedTableOptions that will let users specify block based table's format. This will greatly simplify block based table's format changes in the future. First format change that this will support is encoding decompressed size in Zlib and BZip2 blocks. This diff is blocking https://reviews.facebook.net/D31311. Test Plan: Added unit tests. More tests to come as part of https://reviews.facebook.net/D31311. Reviewers: dhruba, MarkCallaghan, yhchiang, rven, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D31383 --- include/rocksdb/table.h | 13 +++++ table/block_based_table_builder.cc | 25 ++++++-- table/block_based_table_factory.cc | 6 ++ table/block_based_table_reader.cc | 10 +++- table/cuckoo_table_builder.cc | 2 +- table/format.cc | 93 +++++++++++++++--------------- table/format.h | 47 +++++++-------- table/meta_blocks.cc | 12 ++-- table/plain_table_builder.cc | 2 +- table/table_test.cc | 31 ++++++++-- 10 files changed, 148 insertions(+), 93 deletions(-) diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index da525d4a2..d4e0e156f 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -125,6 +125,19 @@ struct BlockBasedTableOptions { // If true, place whole keys in the filter (not just prefixes). // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + + // For more details on BlockBasedTable's formats, see FORMAT-CHANGES.md + // We currently have two versions: + // 0 -- This version is currently written out by all RocksDB's versions by + // default. Can be read by really old RocksDB's. Doesn't support changing + // checksum (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. 
Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // This only affects newly written tables. When reading existing tables, the + // information about version is read from the footer. + uint32_t format_version = 0; }; // Table Properties that are specific to block-based table properties. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index cdae8508b..f04906ff8 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -472,9 +472,20 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, const CompressionType compression_type, - const CompressionOptions& compression_opts) - : rep_(new Rep(ioptions, table_options, internal_comparator, - file, compression_type, compression_opts)) { + const CompressionOptions& compression_opts) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + Log(InfoLogLevel::WARN_LEVEL, ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator, file, + compression_type, compression_opts); if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -771,9 +782,13 @@ Status BlockBasedTableBuilder::Finish() { // TODO(icanadi) at some point in the future, when we're absolutely sure // nobody will roll back to RocksDB 2.x versions, retire the legacy magic // number and always write new table files with new magic number - bool legacy = 
(r->table_options.checksum == kCRC32c); + bool legacy = (r->table_options.format_version == 0); + // this is guaranteed by BlockBasedTableBuilder's constructor + assert(r->table_options.checksum == kCRC32c || + r->table_options.format_version != 0); Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber - : kBlockBasedTableMagicNumber); + : kBlockBasedTableMagicNumber, + r->table_options.format_version); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); footer.set_checksum(r->table_options.checksum); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 9708e1954..063bc2587 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -76,6 +76,10 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } + if (table_options_.format_version > 1) { + return Status::InvalidArgument( + "We currently only support versions 0 and 1"); + } return Status::OK(); } @@ -135,6 +139,8 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); ret.append(buffer); return ret; } diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 1e4da1e1f..727f9c43a 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -436,11 +436,17 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, unique_ptr* table_reader) { table_reader->reset(); - Footer footer(kBlockBasedTableMagicNumber); - auto s = ReadFooterFromFile(file.get(), file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file.get(), file_size, &footer, + kBlockBasedTableMagicNumber); if (!s.ok()) { return s; } 
+ if (footer.version() > 1) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with too new " + "version of RocksDB?"); + } // We've successfully read the footer and the index block: we're // ready to serve requests. diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 8a57f1c6b..1aa1e0707 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -377,7 +377,7 @@ Status CuckooTableBuilder::Finish() { return s; } - Footer footer(kCuckooTableMagicNumber); + Footer footer(kCuckooTableMagicNumber, 1); footer.set_metaindex_handle(meta_index_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; diff --git a/table/format.cc b/table/format.cc index c7f96f427..2ea4b9171 100644 --- a/table/format.cc +++ b/table/format.cc @@ -72,6 +72,23 @@ std::string BlockHandle::ToString(bool hex) const { const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return 0; +} +} // namespace + // legacy footer format: // metaindex handle (varint64 offset, varint64 size) // index handle (varint64 offset, varint64 size) @@ -85,7 +102,8 @@ const BlockHandle BlockHandle::kNullBlockHandle(0, 0); // footer version (4 bytes) // table_magic_number (8 bytes) void Footer::EncodeTo(std::string* dst) const { - if (version() == kLegacyFooter) { + assert(HasInitializedTableMagicNumber()); + if (IsLegacyFooterFormat(table_magic_number())) { // has to be default checksum with legacy footer assert(checksum_ == kCRC32c); 
const size_t original_size = dst->size(); @@ -100,39 +118,24 @@ void Footer::EncodeTo(std::string* dst) const { dst->push_back(static_cast(checksum_)); metaindex_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst); - dst->resize(original_size + kVersion1EncodedLength - 12); // Padding - PutFixed32(dst, kFooterVersion); + dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding + PutFixed32(dst, version()); PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kVersion1EncodedLength); + assert(dst->size() == original_size + kNewVersionsEncodedLength); } } -namespace { -inline bool IsLegacyFooterFormat(uint64_t magic_number) { - return magic_number == kLegacyBlockBasedTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber; -} - -inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { - if (magic_number == kLegacyBlockBasedTableMagicNumber) { - return kBlockBasedTableMagicNumber; - } - if (magic_number == kLegacyPlainTableMagicNumber) { - return kPlainTableMagicNumber; - } - assert(false); - return 0; -} -} // namespace - -Footer::Footer(uint64_t _table_magic_number) - : version_(IsLegacyFooterFormat(_table_magic_number) ? 
kLegacyFooter - : kFooterVersion), +Footer::Footer(uint64_t _table_magic_number, uint32_t _version) + : version_(_version), checksum_(kCRC32c), - table_magic_number_(_table_magic_number) {} + table_magic_number_(_table_magic_number) { + // This should be guaranteed by constructor callers + assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); +} Status Footer::DecodeFrom(Slice* input) { + assert(!HasInitializedTableMagicNumber()); assert(input != nullptr); assert(input->size() >= kMinEncodedLength); @@ -148,36 +151,23 @@ Status Footer::DecodeFrom(Slice* input) { if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } - if (HasInitializedTableMagicNumber()) { - if (magic != table_magic_number()) { - char buffer[80]; - snprintf(buffer, sizeof(buffer) - 1, - "not an sstable (bad magic number --- %lx)", - (long)magic); - return Status::Corruption(buffer); - } - } else { - set_table_magic_number(magic); - } + set_table_magic_number(magic); if (legacy) { // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function input->remove_prefix(input->size() - kVersion0EncodedLength); - version_ = kLegacyFooter; + version_ = 0 /* legacy */; checksum_ = kCRC32c; } else { version_ = DecodeFixed32(magic_ptr - 4); - if (version_ != kFooterVersion) { - return Status::Corruption("bad footer version"); - } - // Footer version 1 will always occupy exactly this many bytes. + // Footer version 1 and higher will always occupy exactly this many bytes. 
// It consists of the checksum type, two block handles, padding, // a version number, and a magic number - if (input->size() < kVersion1EncodedLength) { + if (input->size() < kNewVersionsEncodedLength) { return Status::Corruption("input is too short to be an sstable"); } else { - input->remove_prefix(input->size() - kVersion1EncodedLength); + input->remove_prefix(input->size() - kNewVersionsEncodedLength); } uint32_t chksum; if (!GetVarint32(input, &chksum)) { @@ -219,9 +209,8 @@ std::string Footer::ToString() const { return result; } -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer) { +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { return Status::Corruption("file is too short to be an sstable"); } @@ -242,7 +231,15 @@ Status ReadFooterFromFile(RandomAccessFile* file, return Status::Corruption("file is too short to be an sstable"); } - return footer->DecodeFrom(&footer_input); + s = footer->DecodeFrom(&footer_input); + if (!s.ok()) { + return s; + } + if (enforce_table_magic_number != 0 && + enforce_table_magic_number != footer->table_magic_number()) { + return Status::Corruption("Bad table magic number"); + } + return Status::OK(); } // Without anonymous namespace here, we fail the warning -Wmissing-prototypes diff --git a/table/format.h b/table/format.h index e8586c986..d8bc43735 100644 --- a/table/format.h +++ b/table/format.h @@ -72,12 +72,13 @@ class Footer { // Constructs a footer without specifying its table magic number. // In such case, the table magic number of such footer should be // initialized via @ReadFooterFromFile(). - Footer() : Footer(kInvalidTableMagicNumber) {} + // Use this when you plan to load Footer with DecodeFrom(). Never use this + // when you plan to EncodeTo. + Footer() : Footer(kInvalidTableMagicNumber, 0) {} - // @table_magic_number serves two purposes: - // 1. 
Identify different types of the tables. - // 2. Help us to identify if a given file is a valid sst. - explicit Footer(uint64_t table_magic_number); + // Use this constructor when you plan to write out the footer using + // EncodeTo(). Never use this constructor with DecodeFrom(). + Footer(uint64_t table_magic_number, uint32_t version); // The version of the footer in this file uint32_t version() const { return version_; } @@ -97,20 +98,13 @@ class Footer { uint64_t table_magic_number() const { return table_magic_number_; } - // The version of Footer we encode - enum { - kLegacyFooter = 0, - kFooterVersion = 1, - }; - void EncodeTo(std::string* dst) const; - // Set the current footer based on the input slice. If table_magic_number_ - // is not set (i.e., HasInitializedTableMagicNumber() is true), then this - // function will also initialize table_magic_number_. Otherwise, this - // function will verify whether the magic number specified in the input - // slice matches table_magic_number_ and update the current footer only - // when the test passes. + // Set the current footer based on the input slice. + // + // REQUIRES: table_magic_number_ is not set (i.e., + // HasInitializedTableMagicNumber() is false). The function will initialize the + // magic number Status DecodeFrom(Slice* input); // Encoded length of a Footer. Note that the serialization of a Footer will @@ -121,13 +115,12 @@ class Footer { // Footer version 0 (legacy) will always occupy exactly this many bytes. // It consists of two block handles, padding, and a magic number. kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, - // Footer version 1 will always occupy exactly this many bytes. - // It consists of the checksum type, two block handles, padding, - // a version number, and a magic number - kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, - + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. 
It consists of the checksum type, two block handles, padding, + // a version number (1 or higher), and a magic number + kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, kMinEncodedLength = kVersion0EncodedLength, - kMaxEncodedLength = kVersion1EncodedLength + kMaxEncodedLength = kNewVersionsEncodedLength, }; static const uint64_t kInvalidTableMagicNumber = 0; @@ -156,9 +149,11 @@ class Footer { }; // Read the footer from file -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer); +// If enforce_table_magic_number != 0, ReadFooterFromFile() will return +// corruption if the table magic number is not equal to enforce_table_magic_number +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, + uint64_t enforce_table_magic_number = 0); // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 25a785787..6f83f42d4 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -220,8 +220,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties) { // -- Read metaindex block - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -274,8 +274,8 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle) { - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -302,8 +302,8 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, const std::string& meta_block_name, 
BlockContents* contents) { Status status; - Footer footer(table_magic_number); - status = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + status = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!status.ok()) { return status; } diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 74a71cb35..0f89dd1f5 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -258,7 +258,7 @@ Status PlainTableBuilder::Finish() { // Write Footer // no need to write out new footer if we're using default checksum - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; diff --git a/table/table_test.cc b/table/table_test.cc index 8810a2254..4289059f9 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1943,7 +1943,7 @@ TEST(Harness, FooterTests) { { // upconvert legacy block based std::string encoded; - Footer footer(kLegacyBlockBasedTableMagicNumber); + Footer footer(kLegacyBlockBasedTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1957,11 +1957,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kBlockBasedTableMagicNumber); + Footer footer(kBlockBasedTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1976,11 +1977,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); 
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); } { // upconvert legacy plain table std::string encoded; - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1994,11 +1996,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kPlainTableMagicNumber); + Footer footer(kPlainTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -2013,6 +2016,26 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } + { + // version == 2 + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 2); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + 
ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 2U); } }