Integrated BlobDB for backup/restore support (#8129)

Summary:
Add support for blob files for backup/restore like table files.
    Since DB session ID is currently not supported for blob files (there is no place to store it in
    the header), so for blob files uses the
    kLegacyCrc32cAndFileSize naming scheme even if
    share_files_with_checksum_naming is set to kUseDbSessionId.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8129

Test Plan: Add new test units

Reviewed By: ltamasi

Differential Revision: D27408510

Pulled By: akankshamahajan15

fbshipit-source-id: b27434d189a639ef3e6ad165c61a143a2daaf06e
This commit is contained in:
Akanksha Mahajan 2021-04-07 13:37:36 -07:00 committed by Facebook GitHub Bot
parent a4e82a3cca
commit d52b520d51
9 changed files with 142 additions and 53 deletions

View File

@ -18,6 +18,7 @@
### Public API change
* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead.
* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace
* For new integrated BlobDB, add support for blob files for backup/restore like table files. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up.
### New Features
* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true.

View File

@ -218,7 +218,12 @@ class CompositeEnv : public Env {
return file_system_->OptimizeForCompactionTableRead(
FileOptions(env_options), db_options);
}
EnvOptions OptimizeForBlobFileRead(
const EnvOptions& env_options,
const ImmutableDBOptions& db_options) const override {
return file_system_->OptimizeForBlobFileRead(FileOptions(env_options),
db_options);
}
// This seems to clash with a macro on Windows, so #undef it here
#ifdef GetFreeSpace
#undef GetFreeSpace

11
env/env.cc vendored
View File

@ -536,6 +536,11 @@ class LegacyFileSystemWrapper : public FileSystem {
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForCompactionTableRead(file_options, db_options);
}
FileOptions OptimizeForBlobFileRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForBlobFileRead(file_options, db_options);
}
#ifdef GetFreeSpace
#undef GetFreeSpace
@ -997,6 +1002,12 @@ EnvOptions Env::OptimizeForCompactionTableRead(
optimized_env_options.use_direct_reads = db_options.use_direct_reads;
return optimized_env_options;
}
EnvOptions Env::OptimizeForBlobFileRead(
const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
EnvOptions optimized_env_options(env_options);
optimized_env_options.use_direct_reads = db_options.use_direct_reads;
return optimized_env_options;
}
EnvOptions::EnvOptions(const DBOptions& options) {
AssignEnvOptions(this, options);

8
env/file_system.cc vendored
View File

@ -83,6 +83,14 @@ FileOptions FileSystem::OptimizeForCompactionTableRead(
return optimized_file_options;
}
FileOptions FileSystem::OptimizeForBlobFileRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const {
FileOptions optimized_file_options(file_options);
optimized_file_options.use_direct_reads = db_options.use_direct_reads;
return optimized_file_options;
}
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
const std::string& fname, bool should_sync) {
std::unique_ptr<FSWritableFile> file;

View File

@ -546,6 +546,13 @@ class Env {
const EnvOptions& env_options,
const ImmutableDBOptions& db_options) const;
// OptimizeForBlobFileRead will create a new EnvOptions object that
// is a copy of the EnvOptions in the parameters, but is optimized for reading
// blob files.
virtual EnvOptions OptimizeForBlobFileRead(
const EnvOptions& env_options,
const ImmutableDBOptions& db_options) const;
// Returns the status of all threads that belong to the current Env.
virtual Status GetThreadList(std::vector<ThreadStatus>* /*thread_list*/) {
return Status::NotSupported("Env::GetThreadList() not supported.");
@ -1495,6 +1502,11 @@ class EnvWrapper : public Env {
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForCompactionTableRead(env_options, db_options);
}
EnvOptions OptimizeForBlobFileRead(
const EnvOptions& env_options,
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForBlobFileRead(env_options, db_options);
}
Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override {
return target_->GetFreeSpace(path, diskfree);
}

View File

@ -550,6 +550,13 @@ class FileSystem {
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const;
// OptimizeForBlobFileRead will create a new FileOptions object that
// is a copy of the FileOptions in the parameters, but is optimized for
// reading blob files.
virtual FileOptions OptimizeForBlobFileRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const;
// This seems to clash with a macro on Windows, so #undef it here
#ifdef GetFreeSpace
#undef GetFreeSpace
@ -1289,6 +1296,11 @@ class FileSystemWrapper : public FileSystem {
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForCompactionTableRead(file_options, db_options);
}
FileOptions OptimizeForBlobFileRead(
const FileOptions& file_options,
const ImmutableDBOptions& db_options) const override {
return target_->OptimizeForBlobFileRead(file_options, db_options);
}
IOStatus GetFreeSpace(const std::string& path, const IOOptions& options,
uint64_t* diskfree, IODebugContext* dbg) override {
return target_->GetFreeSpace(path, options, diskfree, dbg);

View File

@ -42,11 +42,14 @@ struct BackupableDBOptions {
// Default: nullptr
Env* backup_env;
// If share_table_files == true, backup will assume that table files with
// same name have the same contents. This enables incremental backups and
// avoids unnecessary data copies.
// If share_table_files == false, each backup will be on its own and will
// not share any data with other backups.
// share_table_files supports table and blob files.
//
// If share_table_files == true, the backup directory will share table and
// blob files among backups, to save space among backups of the same DB and to
// enable incremental backups by only copying new files.
// If share_table_files == false, each backup will be on its own and will not
// share any data with other backups.
//
// default: true
bool share_table_files;
@ -92,13 +95,15 @@ struct BackupableDBOptions {
// Default: nullptr
std::shared_ptr<RateLimiter> restore_rate_limiter{nullptr};
// share_files_with_checksum supports table and blob files.
//
// Only used if share_table_files is set to true. Setting to false is
// DEPRECATED and potentially dangerous because in that case BackupEngine
// can lose data if backing up databases with distinct or divergent
// history, for example if restoring from a backup other than the latest,
// writing to the DB, and creating another backup. Setting to true (default)
// prevents these issues by ensuring that different table files (SSTs) with
// the same number are treated as distinct. See
// prevents these issues by ensuring that different table files (SSTs) and
// blob files with the same number are treated as distinct. See
// share_files_with_checksum_naming and ShareFilesNaming.
//
// Default: true
@ -126,11 +131,12 @@ struct BackupableDBOptions {
int max_valid_backups_to_open;
// ShareFilesNaming describes possible naming schemes for backup
// table file names when the table files are stored in the shared_checksum
// directory (i.e., both share_table_files and share_files_with_checksum
// are true).
// table and blob file names when they are stored in the
// shared_checksum directory (i.e., both share_table_files and
// share_files_with_checksum are true).
enum ShareFilesNaming : uint32_t {
// Backup SST filenames are <file_number>_<crc32c>_<file_size>.sst
// Backup blob filenames are <file_number>_<crc32c>_<file_size>.blob and
// backup SST filenames are <file_number>_<crc32c>_<file_size>.sst
// where <crc32c> is an unsigned decimal integer. This is the
// original/legacy naming scheme for share_files_with_checksum,
// with two problems:
@ -139,6 +145,7 @@ struct BackupableDBOptions {
// * Determining the name to use requires computing the checksum,
// so generally requires reading the whole file even if the file
// is already backed up.
//
// ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR **
kLegacyCrc32cAndFileSize = 1U,
@ -148,6 +155,8 @@ struct BackupableDBOptions {
// the value is a DB session id, not a checksum.
//
// Exceptions:
// * For blob files, kLegacyCrc32cAndFileSize is used as currently
// db_session_id is not supported by the blob file format.
// * For old SST files without a DB session id, kLegacyCrc32cAndFileSize
// will be used instead, matching the names assigned by RocksDB versions
// not supporting the newer naming scheme.
@ -158,25 +167,25 @@ struct BackupableDBOptions {
// If not already part of the naming scheme, insert
// _<file_size>
// before .sst in the name. In case of user code actually parsing the
// last _<whatever> before the .sst as the file size, this preserves that
// feature of kLegacyCrc32cAndFileSize. In other words, this option makes
// official that unofficial feature of the backup metadata.
// before .sst and .blob in the name. In case of user code actually parsing
// the last _<whatever> before the .sst and .blob as the file size, this
// preserves that feature of kLegacyCrc32cAndFileSize. In other words, this
// option makes official that unofficial feature of the backup metadata.
//
// We do not consider SST file sizes to have sufficient entropy to
// We do not consider SST and blob file sizes to have sufficient entropy to
// contribute significantly to naming uniqueness.
kFlagIncludeFileSize = 1U << 31,
kMaskNamingFlags = ~kMaskNoNamingFlags,
};
// Naming option for share_files_with_checksum table files. See
// Naming option for share_files_with_checksum table and blob files. See
// ShareFilesNaming for details.
//
// Modifying this option cannot introduce a downgrade compatibility issue
// because RocksDB can read, restore, and delete backups using different file
// names, and it's OK for a backup directory to use a mixture of table file
// naming schemes.
// names, and it's OK for a backup directory to use a mixture of table and
// blob files naming schemes.
//
// However, modifying this option and saving more backups to the same
// directory can lead to the same file getting saved again to that

View File

@ -749,7 +749,7 @@ class BackupEngineImpl {
BackupID backup_id, bool shared, const std::string& src_dir,
const std::string& fname, // starts with "/"
const EnvOptions& src_env_options, RateLimiter* rate_limiter,
uint64_t size_bytes, uint64_t size_limit = 0,
FileType file_type, uint64_t size_bytes, uint64_t size_limit = 0,
bool shared_checksum = false,
std::function<void()> progress_callback = []() {},
const std::string& contents = std::string(),
@ -1287,7 +1287,7 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
Log(options_.info_log, "add file for backup %s", fname.c_str());
uint64_t size_bytes = 0;
Status st;
if (type == kTableFile) {
if (type == kTableFile || type == kBlobFile) {
st = db_env_->GetFileSize(src_dirname + fname, &size_bytes);
}
EnvOptions src_env_options;
@ -1304,6 +1304,10 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
src_env_options =
db_env_->OptimizeForManifestRead(src_raw_env_options);
break;
case kBlobFile:
src_env_options = db_env_->OptimizeForBlobFileRead(
src_raw_env_options, ImmutableDBOptions(db_options));
break;
default:
// Other backed up files (like options file) are not read by live
// DB, so don't need to worry about avoiding mixing buffered and
@ -1314,22 +1318,25 @@ Status BackupEngineImpl::CreateNewBackupWithMetadata(
if (st.ok()) {
st = AddBackupFileWorkItem(
live_dst_paths, backup_items_to_finish, new_backup_id,
options_.share_table_files && type == kTableFile, src_dirname,
fname, src_env_options, rate_limiter, size_bytes,
size_limit_bytes,
options_.share_files_with_checksum && type == kTableFile,
options_.share_table_files &&
(type == kTableFile || type == kBlobFile),
src_dirname, fname, src_env_options, rate_limiter, type,
size_bytes, size_limit_bytes,
options_.share_files_with_checksum &&
(type == kTableFile || type == kBlobFile),
options.progress_callback, "" /* contents */,
checksum_func_name, checksum_val);
}
return st;
} /* copy_file_cb */,
[&](const std::string& fname, const std::string& contents, FileType) {
[&](const std::string& fname, const std::string& contents,
FileType type) {
Log(options_.info_log, "add file for backup %s", fname.c_str());
return AddBackupFileWorkItem(
live_dst_paths, backup_items_to_finish, new_backup_id,
false /* shared */, "" /* src_dir */, fname,
EnvOptions() /* src_env_options */, rate_limiter, contents.size(),
0 /* size_limit */, false /* shared_checksum */,
EnvOptions() /* src_env_options */, rate_limiter, type,
contents.size(), 0 /* size_limit */, false /* shared_checksum */,
options.progress_callback, contents);
} /* create_file_cb */,
&sequence_number, options.flush_before_backup ? 0 : port::kMaxUint64,
@ -1872,9 +1879,10 @@ Status BackupEngineImpl::AddBackupFileWorkItem(
std::vector<BackupAfterCopyOrCreateWorkItem>& backup_items_to_finish,
BackupID backup_id, bool shared, const std::string& src_dir,
const std::string& fname, const EnvOptions& src_env_options,
RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit,
bool shared_checksum, std::function<void()> progress_callback,
const std::string& contents, const std::string& src_checksum_func_name,
RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes,
uint64_t size_limit, bool shared_checksum,
std::function<void()> progress_callback, const std::string& contents,
const std::string& src_checksum_func_name,
const std::string& src_checksum_str) {
assert(!fname.empty() && fname[0] == '/');
assert(contents.empty() != src_dir.empty());
@ -1887,8 +1895,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem(
std::string checksum_hex;
// Whenever a default checksum function name is passed in, we will compares
// the corresponding checksum values after copying. Note that only table files
// may have a known checksum function name passed in.
// the corresponding checksum values after copying. Note that only table and
// blob files may have a known checksum function name passed in.
//
// If no default checksum function name is passed in and db session id is not
// available, we will calculate the checksum *before* copying in two cases
@ -1906,7 +1914,8 @@ Status BackupEngineImpl::AddBackupFileWorkItem(
// Step 1: Prepare the relative path to destination
if (shared && shared_checksum) {
if (GetNamingNoFlags() != BackupableDBOptions::kLegacyCrc32cAndFileSize) {
if (GetNamingNoFlags() != BackupableDBOptions::kLegacyCrc32cAndFileSize &&
file_type != kBlobFile) {
// Prepare db_session_id to add to the file name
// Ignore the returned status
// In the failed cases, db_id and db_session_id will be empty
@ -1938,6 +1947,11 @@ Status BackupEngineImpl::AddBackupFileWorkItem(
// shared_checksum/<file_number>_<db_session_id>.sst
// Otherwise, dst_relative is of the form
// shared_checksum/<file_number>_<checksum>_<size>.sst
//
// For blob files, db_session_id is not supported with the blob file format.
// It uses original/legacy naming scheme.
// dst_relative will be of the form:
// shared_checksum/<file_number>_<checksum>_<size>.blob
dst_relative = GetSharedFileWithChecksum(dst_relative, checksum_hex,
size_bytes, db_session_id);
dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true);

View File

@ -632,6 +632,7 @@ class BackupableDBTest : public testing::Test {
options_.write_buffer_size = 1 << 17; // 128KB
options_.env = test_db_env_.get();
options_.wal_dir = dbname_;
options_.enable_blob_files = true;
// Create logger
DBOptions logger_options;
@ -894,14 +895,17 @@ class BackupableDBTest : public testing::Test {
void AssertDirectoryFilesMatchRegex(const std::string& dir,
const std::regex& pattern,
const std::string& file_type,
int minimum_count) {
std::vector<FileAttributes> children;
ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
int found_count = 0;
for (const auto& child : children) {
const std::string match("match");
ASSERT_EQ(match, std::regex_replace(child.name, pattern, match));
++found_count;
if (EndsWith(child.name, file_type)) {
ASSERT_TRUE(std::regex_match(child.name, pattern))
<< "File name " << child.name << " does not match regex.";
++found_count;
}
}
ASSERT_GE(found_count, minimum_count);
}
@ -1433,9 +1437,8 @@ TEST_F(BackupableDBTest, CorruptFileMaintainSize) {
}
// Corrupt a blob file but maintain its size
TEST_F(BackupableDBTest, CorruptBlobFileMaintainSize) {
TEST_P(BackupableDBTestWithParam, CorruptBlobFileMaintainSize) {
const int keys_iteration = 5000;
options_.enable_blob_files = true;
OpenDBAndBackupEngine(true);
// create a backup
FillDB(db_.get(), 0, keys_iteration);
@ -1450,12 +1453,18 @@ TEST_F(BackupableDBTest, CorruptBlobFileMaintainSize) {
std::string file_to_corrupt;
std::vector<FileAttributes> children;
const std::string dir = backupdir_ + "/private/1";
std::string dir = backupdir_;
if (backupable_options_->share_files_with_checksum) {
dir += "/shared_checksum";
} else {
dir += "/shared";
}
ASSERT_OK(file_manager_->GetChildrenFileAttributes(dir, &children));
for (const auto& child : children) {
if (child.name.find(".blob") != std::string::npos &&
child.size_bytes != 0) {
if (EndsWith(child.name, ".blob") && child.size_bytes != 0) {
// corrupt the blob files by replacing its content by file_size random
// bytes
ASSERT_OK(
@ -1509,7 +1518,6 @@ TEST_F(BackupableDBTest, TableFileCorruptedBeforeBackup) {
// been corrupted and the blob file checksum is stored in the DB manifest
TEST_F(BackupableDBTest, BlobFileCorruptedBeforeBackup) {
const int keys_iteration = 50000;
options_.enable_blob_files = true;
OpenDBAndBackupEngine(true /* destroy_old_data */, false /* dummy */,
kNoShare);
@ -1575,7 +1583,6 @@ TEST_P(BackupableDBTestWithParam, TableFileCorruptedBeforeBackup) {
// the case when backup blob files will be stored in a shared directory
TEST_P(BackupableDBTestWithParam, BlobFileCorruptedBeforeBackup) {
const int keys_iteration = 50000;
options_.enable_blob_files = true;
OpenDBAndBackupEngine(true /* destroy_old_data */);
FillDB(db_.get(), 0, keys_iteration);
CloseAndReopenDB(/*read_only*/ true);
@ -1986,6 +1993,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) {
"[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst"},
};
const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
for (const auto& pair : option_to_expected) {
CloseAndReopenDB();
backupable_options_->share_files_with_checksum_naming = pair.first;
@ -1994,12 +2003,15 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsNewNaming) {
CloseDBAndBackupEngine();
AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(pair.second),
std::regex(pair.second), ".sst",
1 /* minimum_count */);
if (std::string::npos != pair.second.find("_[0-9]+[.]sst")) {
AssertDirectoryFilesSizeIndicators(backupdir_ + "/shared_checksum",
1 /* minimum_count */);
}
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(blobfile_pattern), ".blob",
1 /* minimum_count */);
}
}
@ -2024,6 +2036,8 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsOldFileNaming) {
// Old names should always be used on old files
const std::regex expected("[0-9]+_[0-9]+_[0-9]+[.]sst");
const std::string blobfile_pattern = "[0-9]+_[0-9]+_[0-9]+[.]blob";
for (ShareFilesNaming option : {kNamingDefault, kUseDbSessionId}) {
CloseAndReopenDB();
backupable_options_->share_files_with_checksum_naming = option;
@ -2032,6 +2046,9 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsOldFileNaming) {
CloseDBAndBackupEngine();
AssertBackupConsistency(1, 0, keys_iteration, keys_iteration * 2);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum", expected,
".sst", 1 /* minimum_count */);
AssertDirectoryFilesMatchRegex(backupdir_ + "/shared_checksum",
std::regex(blobfile_pattern), ".blob",
1 /* minimum_count */);
}
@ -2175,9 +2192,9 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) {
ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true /*flush*/));
CloseDBAndBackupEngine();
// Corrupt backup SST
// Corrupt backup SST and blob file
ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
ASSERT_EQ(children.size(), 1U); // one sst
ASSERT_EQ(children.size(), 2U); // one sst and one blob file
for (const auto& child : children) {
if (child.name.size() > 4 && child.size_bytes > 0) {
ASSERT_OK(
@ -2234,10 +2251,10 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) {
OpenDBAndBackupEngine(false, false, share);
ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500)));
// Count backup SSTs
// Count backup SSTs and blob files.
children.clear();
ASSERT_OK(file_manager_->GetChildrenFileAttributes(shared_dir, &children));
ASSERT_EQ(children.size(), 2U); // two sst
ASSERT_EQ(children.size(), 4U); // two sst and two blob files
// Try create backup 3
s = backup_engine_->CreateNewBackup(db_.get(), true /*flush*/);
@ -2250,18 +2267,18 @@ TEST_F(BackupableDBTest, FileSizeForIncremental) {
// Acceptable to call it corruption if size is not in name and
// db session id collision is practically impossible.
EXPECT_TRUE(s.IsCorruption());
EXPECT_EQ(children.size(), 2U); // no SST added
EXPECT_EQ(children.size(), 4U); // no SST/Blob file added
} else if (option == share_no_checksum) {
// Good to call it corruption if both backups cannot be
// accommodated.
EXPECT_TRUE(s.IsCorruption());
EXPECT_EQ(children.size(), 2U); // no SST added
EXPECT_EQ(children.size(), 4U); // no SST/Blob file added
} else {
// Since opening a DB seems sufficient for detecting size corruption
// on the DB side, this should be a good thing, ...
EXPECT_OK(s);
// ... as long as we did actually treat it as a distinct SST file.
EXPECT_EQ(children.size(), 3U); // Another SST added
EXPECT_EQ(children.size(), 6U); // Another SST and blob added
}
CloseDBAndBackupEngine();
ASSERT_OK(DestroyDB(dbname_, options_));