diff --git a/HISTORY.md b/HISTORY.md index 72b7192af..b6099eba2 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -18,6 +18,7 @@ ### Public API change * Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead. * Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace +* For new integrated BlobDB, add support for blob files for backup/restore like table files. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up. ### New Features * Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true. 
diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h index c2752fcee..636bdda90 100644 --- a/env/composite_env_wrapper.h +++ b/env/composite_env_wrapper.h @@ -218,7 +218,12 @@ class CompositeEnv : public Env { return file_system_->OptimizeForCompactionTableRead( FileOptions(env_options), db_options); } - + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); + } // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace diff --git a/env/env.cc b/env/env.cc index bf3ec3231..809b80568 100644 --- a/env/env.cc +++ b/env/env.cc @@ -536,6 +536,11 @@ class LegacyFileSystemWrapper : public FileSystem { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } #ifdef GetFreeSpace #undef GetFreeSpace @@ -997,6 +1002,12 @@ EnvOptions Env::OptimizeForCompactionTableRead( optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); diff --git a/env/file_system.cc b/env/file_system.cc index 2a76ee5d3..3e0c4d102 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -83,6 +83,14 @@ FileOptions FileSystem::OptimizeForCompactionTableRead( return optimized_file_options; } +FileOptions 
FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync) { std::unique_ptr file; diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index d3bbeac64..fc776bc71 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -546,6 +546,13 @@ class Env { const EnvOptions& env_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. + virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + // Returns the status of all threads that belong to the current Env. 
virtual Status GetThreadList(std::vector* /*thread_list*/) { return Status::NotSupported("Env::GetThreadList() not supported."); @@ -1495,6 +1502,11 @@ class EnvWrapper : public Env { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(env_options, db_options); } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(env_options, db_options); + } Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { return target_->GetFreeSpace(path, diskfree); } diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 02e307d0a..daa15a4ce 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -550,6 +550,13 @@ class FileSystem { const FileOptions& file_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. 
+ virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -1289,6 +1296,11 @@ class FileSystemWrapper : public FileSystem { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, uint64_t* diskfree, IODebugContext* dbg) override { return target_->GetFreeSpace(path, options, diskfree, dbg); diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 71989c56b..fd34c6144 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -42,11 +42,14 @@ struct BackupableDBOptions { // Default: nullptr Env* backup_env; - // If share_table_files == true, backup will assume that table files with - // same name have the same contents. This enables incremental backups and - // avoids unnecessary data copies. - // If share_table_files == false, each backup will be on its own and will - // not share any data with other backups. + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. 
+ // // default: true bool share_table_files; @@ -92,13 +95,15 @@ struct BackupableDBOptions { // Default: nullptr std::shared_ptr restore_rate_limiter{nullptr}; + // share_files_with_checksum supports table and blob files. + // // Only used if share_table_files is set to true. Setting to false is // DEPRECATED and potentially dangerous because in that case BackupEngine // can lose data if backing up databases with distinct or divergent // history, for example if restoring from a backup other than the latest, // writing to the DB, and creating another backup. Setting to true (default) - // prevents these issues by ensuring that different table files (SSTs) with - // the same number are treated as distinct. See + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See // share_files_with_checksum_naming and ShareFilesNaming. // // Default: true @@ -126,11 +131,12 @@ struct BackupableDBOptions { int max_valid_backups_to_open; // ShareFilesNaming describes possible naming schemes for backup - // table file names when the table files are stored in the shared_checksum - // directory (i.e., both share_table_files and share_files_with_checksum - // are true). + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). enum ShareFilesNaming : uint32_t { - // Backup SST filenames are __.sst + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst // where is an unsigned decimal integer. This is the // original/legacy naming scheme for share_files_with_checksum, // with two problems: @@ -139,6 +145,7 @@ struct BackupableDBOptions { // * Determining the name to use requires computing the checksum, // so generally requires reading the whole file even if the file // is already backed up. 
+ // // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** kLegacyCrc32cAndFileSize = 1U, @@ -148,6 +155,8 @@ struct BackupableDBOptions { // the value is a DB session id, not a checksum. // // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize // will be used instead, matching the names assigned by RocksDB versions // not supporting the newer naming scheme. @@ -158,25 +167,25 @@ struct BackupableDBOptions { // If not already part of the naming scheme, insert // _ - // before .sst in the name. In case of user code actually parsing the - // last _ before the .sst as the file size, this preserves that - // feature of kLegacyCrc32cAndFileSize. In other words, this option makes - // official that unofficial feature of the backup metadata. + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. // - // We do not consider SST file sizes to have sufficient entropy to + // We do not consider SST and blob file sizes to have sufficient entropy to // contribute significantly to naming uniqueness. kFlagIncludeFileSize = 1U << 31, kMaskNamingFlags = ~kMaskNoNamingFlags, }; - // Naming option for share_files_with_checksum table files. See + // Naming option for share_files_with_checksum table and blob files. See // ShareFilesNaming for details. // // Modifying this option cannot introduce a downgrade compatibility issue // because RocksDB can read, restore, and delete backups using different file - // names, and it's OK for a backup directory to use a mixture of table file - // naming schemes. 
+ // names, and it's OK for a backup directory to use a mixture of table and + // blob file naming schemes. // // However, modifying this option and saving more backups to the same // directory can lead to the same file getting saved again to that diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 1f87e2904..29b7319ea 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -749,7 +749,7 @@ class BackupEngineImpl { BackupID backup_id, bool shared, const std::string& src_dir, const std::string& fname, // starts with "/" const EnvOptions& src_env_options, RateLimiter* rate_limiter, - uint64_t size_bytes, uint64_t size_limit = 0, + FileType file_type, uint64_t size_bytes, uint64_t size_limit = 0, bool shared_checksum = false, std::function progress_callback = []() {}, const std::string& contents = std::string(), @@ -1287,7 +1287,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( Log(options_.info_log, "add file for backup %s", fname.c_str()); uint64_t size_bytes = 0; Status st; - if (type == kTableFile) { + if (type == kTableFile || type == kBlobFile) { st = db_env_->GetFileSize(src_dirname + fname, &size_bytes); } EnvOptions src_env_options; @@ -1304,6 +1304,10 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( src_env_options = db_env_->OptimizeForManifestRead(src_raw_env_options); break; + case kBlobFile: + src_env_options = db_env_->OptimizeForBlobFileRead( + src_raw_env_options, ImmutableDBOptions(db_options)); + break; default: // Other backed up files (like options file) are not read by live // DB, so don't need to worry about avoiding mixing buffered and @@ -1314,22 +1318,25 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata( if (st.ok()) { st = AddBackupFileWorkItem( live_dst_paths, backup_items_to_finish, new_backup_id, - options_.share_table_files && type == kTableFile, src_dirname, - fname, src_env_options, rate_limiter, size_bytes, - 
size_limit_bytes, - options_.share_files_with_checksum && type == kTableFile, + options_.share_table_files && + (type == kTableFile || type == kBlobFile), + src_dirname, fname, src_env_options, rate_limiter, type, + size_bytes, size_limit_bytes, + options_.share_files_with_checksum && + (type == kTableFile || type == kBlobFile), options.progress_callback, "" /* contents */, checksum_func_name, checksum_val); } return st; } /* copy_file_cb */, - [&](const std::string& fname, const std::string& contents, FileType) { + [&](const std::string& fname, const std::string& contents, + FileType type) { Log(options_.info_log, "add file for backup %s", fname.c_str()); return AddBackupFileWorkItem( live_dst_paths, backup_items_to_finish, new_backup_id, false /* shared */, "" /* src_dir */, fname, - EnvOptions() /* src_env_options */, rate_limiter, contents.size(), - 0 /* size_limit */, false /* shared_checksum */, + EnvOptions() /* src_env_options */, rate_limiter, type, + contents.size(), 0 /* size_limit */, false /* shared_checksum */, options.progress_callback, contents); } /* create_file_cb */, &sequence_number, options.flush_before_backup ? 
0 : port::kMaxUint64, @@ -1872,9 +1879,10 @@ Status BackupEngineImpl::AddBackupFileWorkItem( std::vector& backup_items_to_finish, BackupID backup_id, bool shared, const std::string& src_dir, const std::string& fname, const EnvOptions& src_env_options, - RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit, - bool shared_checksum, std::function progress_callback, - const std::string& contents, const std::string& src_checksum_func_name, + RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes, + uint64_t size_limit, bool shared_checksum, + std::function progress_callback, const std::string& contents, + const std::string& src_checksum_func_name, const std::string& src_checksum_str) { assert(!fname.empty() && fname[0] == '/'); assert(contents.empty() != src_dir.empty()); @@ -1887,8 +1895,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem( std::string checksum_hex; // Whenever a default checksum function name is passed in, we will compares - // the corresponding checksum values after copying. Note that only table files - // may have a known checksum function name passed in. + // the corresponding checksum values after copying. Note that only table and + // blob files may have a known checksum function name passed in. 
// // If no default checksum function name is passed in and db session id is not // available, we will calculate the checksum *before* copying in two cases @@ -1906,7 +1914,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem( // Step 1: Prepare the relative path to destination if (shared && shared_checksum) { - if (GetNamingNoFlags() != BackupableDBOptions::kLegacyCrc32cAndFileSize) { + if (GetNamingNoFlags() != BackupableDBOptions::kLegacyCrc32cAndFileSize && + file_type != kBlobFile) { // Prepare db_session_id to add to the file name // Ignore the returned status // In the failed cases, db_id and db_session_id will be empty @@ -1938,6 +1947,11 @@ Status BackupEngineImpl::AddBackupFileWorkItem( // shared_checksum/_.sst // Otherwise, dst_relative is of the form // shared_checksum/__.sst + // + // For blob files, db_session_id is not supported with the blob file format. + // It uses the original/legacy naming scheme. + // dst_relative will be of the form: + // shared_checksum/__.blob dst_relative = GetSharedFileWithChecksum(dst_relative, checksum_hex, size_bytes, db_session_id); dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true); diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index fdaf4a149..e932f30f8 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -632,6 +632,7 @@ class BackupableDBTest : public testing::Test { options_.write_buffer_size = 1 << 17; // 128KB options_.env = test_db_env_.get(); options_.wal_dir = dbname_; + options_.enable_blob_files = true; // Create logger DBOptions logger_options; @@ -894,14 +895,17 @@ class BackupableDBTest : public testing::Test { void AssertDirectoryFilesMatchRegex(const std::string& dir, const std::regex& pattern, + const std::string& file_type, int minimum_count) { std::vector children; ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children)); int found_count = 0; for (const auto& child : 
children) { - const std::string match("match"); - ASSERT_EQ(match, std::regex_replace(child.name, pattern, match)); - ++found_count; + if (EndsWith(child.name, file_type)) { + ASSERT_TRUE(std::regex_match(child.name, pattern)) + << "File name " << child.name << " does not match regex."; + ++found_count; + } } ASSERT_GE(found_count, minimum_count); } @@ -1433,9 +1437,8 @@ TEST_F(BackupableDBTest, CorruptFileMaintainSize) { } // Corrupt a blob file but maintain its size -TEST_F(BackupableDBTest, CorruptBlobFileMaintainSize) { +TEST_P(BackupableDBTestWithParam, CorruptBlobFileMaintainSize) { const int keys_iteration = 5000; - options_.enable_blob_files = true; OpenDBAndBackupEngine(true); // create a backup FillDB(db_.get(), 0, keys_iteration); @@ -1450,12 +1453,18 @@ TEST_F(BackupableDBTest, CorruptBlobFileMaintainSize) { std::string file_to_corrupt; std::vector children; - const std::string dir = backupdir_ + "/private/1"; + + std::string dir = backupdir_; + if (backupable_options_->share_files_with_checksum) { + dir += "/shared_checksum"; + } else { + dir += "/shared"; + } + ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children)); for (const auto& child : children) { - if (child.name.find(".blob") != std::string::npos && - child.size_bytes != 0) { + if (EndsWith(child.name, ".blob") && child.size_bytes != 0) { // corrupt the blob files by replacing its content by file_size random // bytes ASSERT_OK( @@ -1509,7 +1518,6 @@ TEST_F(BackupableDBTest, TableFileCorruptedBeforeBackup) { // been corrupted and the blob file checksum is stored in the DB manifest TEST_F(BackupableDBTest, BlobFileCorruptedBeforeBackup) { const int keys_iteration = 50000; - options_.enable_blob_files = true; OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */, kNoShare); @@ -1575,7 +1583,6 @@ TEST_P(BackupableDBTestWithParam, TableFileCorruptedBeforeBackup) { // the case when backup blob files will be stored in a shared directory TEST_P(BackupableDBTestWithParam, 
BlobFileCorruptedBeforeBackup) { const int keys_iteration = 50000; - options_.enable_blob_files = true; OpenDBAndBackupEngine(true /* destroy_old_data */); FillDB(db_.get(), 0, keys_iteration); CloseAndReopenDB(/*read_only*/ true); @@ -1986,6 +1993,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) { "[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"}, }; + const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob"; + for (const auto& pair : option_to_expected) { CloseAndReopenDB(); backupable_options_->share_files_with_checksum_naming = pair.first; @@ -1994,12 +2003,15 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) { CloseDBAndBackupEngine(); AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2); AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", - std::regex(pair.second), + std::regex(pair.second), ".sst", 1 /* minimum_count */); if (std::string::npos != pair.second.find("_[0-9]+[.]sst")) { AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum", 1 /* minimum_count */); } + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", + std::regex(blobfile_pattern), ".blob", + 1 /* minimum_count */); } } @@ -2024,6 +2036,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsOldFileNaming) { // Old names should always be used on old files const std::regex expected("[0-9]+_[0-9]+_[0-9]+[.]sst"); + const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob"; + for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) { CloseAndReopenDB(); backupable_options_->share_files_with_checksum_naming = option; @@ -2032,6 +2046,9 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsOldFileNaming) { CloseDBAndBackupEngine(); AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2); AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", expected, + ".sst", 1 /* minimum_count */); + AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", + 
std::regex(blobfile_pattern), ".blob", 1 /* minimum_count */); } @@ -2175,9 +2192,9 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) { ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/)); CloseDBAndBackupEngine(); - // Corrupt backup SST + // Corrupt backup SST and blob file ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children)); - ASSERT_EQ(children.size(), 1U); // one sst + ASSERT_EQ(children.size(), 2U); // one sst and one blob file for (const auto& child : children) { if (child.name.size() > 4 && child.size_bytes > 0) { ASSERT_OK( @@ -2234,10 +2251,10 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) { OpenDBAndBackupEngine(false, false, share); ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500))); - // Count backup SSTs + // Count backup SSTs and blob files. children.clear(); ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children)); - ASSERT_EQ(children.size(), 2U); // two sst + ASSERT_EQ(children.size(), 4U); // two sst and two blob files // Try create backup 3 s = backup_engine_->CreateNewBackup(db_.get(), true /*flush*/); @@ -2250,18 +2267,18 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) { // Acceptable to call it corruption if size is not in name and // db session id collision is practically impossible. EXPECT_TRUE(s.IsCorruption()); - EXPECT_EQ(children.size(), 2U); // no SST added + EXPECT_EQ(children.size(), 4U); // no SST/Blob file added } else if (option == share_no_checksum) { // Good to call it corruption if both backups cannot be // accommodated. EXPECT_TRUE(s.IsCorruption()); - EXPECT_EQ(children.size(), 2U); // no SST added + EXPECT_EQ(children.size(), 4U); // no SST/Blob file added } else { // Since opening a DB seems sufficient for detecting size corruption // on the DB side, this should be a good thing, ... EXPECT_OK(s); // ... as long as we did actually treat it as a distinct SST file. 
- EXPECT_EQ(children.size(), 3U); // Another SST added + EXPECT_EQ(children.size(), 6U); // Another SST and blob added } CloseDBAndBackupEngine(); ASSERT_OK(DestroyDB(dbname_, options_));