Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)

Summary:
When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open.

If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes.

We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353

Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble.

Differential Revision: D19656425

Pulled By: al13n321

fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
This commit is contained in:
Mike Kolupaev 2020-02-04 01:24:29 -08:00 committed by Facebook Github Bot
parent 7330ec0ff1
commit 637e64b9ac
14 changed files with 160 additions and 17 deletions

View File

@ -7,6 +7,7 @@
* Fix incorrect results while block-based table uses kHashSearch, together with Prev()/SeekForPrev().
* Fix a bug that prevents opening a DB after two consecutive crash with TransactionDB, where the first crash recovers from a corrupted WAL with kPointInTimeRecovery but the second cannot.
* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open().
### Public API Change
* The BlobDB garbage collector now emits the statistics `BLOB_DB_GC_NUM_FILES` (number of blob files obsoleted during GC), `BLOB_DB_GC_NUM_NEW_FILES` (number of new blob files generated during GC), `BLOB_DB_GC_FAILURES` (number of failed GC passes), `BLOB_DB_GC_NUM_KEYS_RELOCATED` (number of blobs relocated during GC), and `BLOB_DB_GC_BYTES_RELOCATED` (total size of blobs relocated during GC). On the other hand, the following statistics, which are not relevant for the new GC implementation, are now deprecated: `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, and `BLOB_DB_GC_MICROS`.

View File

@ -2320,6 +2320,11 @@ void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
opt->rep.skip_stats_update_on_db_open = val;
}
void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
rocksdb_options_t* opt, unsigned char val) {
opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
}
void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
opt->rep.num_levels = n;
}

View File

@ -3318,6 +3318,44 @@ Status DBImpl::CheckConsistency() {
TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
std::string corruption_messages;
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
// Instead of calling GetFileSize() for each expected file, call
// GetChildren() for the DB directory and check that all expected files
// are listed, without checking their sizes.
// Since sst files might be in different directories, do it for each
// directory separately.
std::map<std::string, std::vector<std::string>> files_by_directory;
for (const auto& md : metadata) {
// md.name has a leading "/". Remove it.
std::string fname = md.name;
if (!fname.empty() && fname[0] == '/') {
fname = fname.substr(1);
}
files_by_directory[md.db_path].push_back(fname);
}
for (const auto& dir_files : files_by_directory) {
std::string directory = dir_files.first;
std::vector<std::string> existing_files;
Status s = env_->GetChildren(directory, &existing_files);
if (!s.ok()) {
corruption_messages +=
"Can't list files in " + directory + ": " + s.ToString() + "\n";
continue;
}
std::sort(existing_files.begin(), existing_files.end());
for (const std::string& fname : dir_files.second) {
if (!std::binary_search(existing_files.begin(), existing_files.end(),
fname) &&
!std::binary_search(existing_files.begin(), existing_files.end(),
Rocks2LevelTableFileName(fname))) {
corruption_messages +=
"Missing sst file " + fname + " in " + directory + "\n";
}
}
}
} else {
for (const auto& md : metadata) {
// md.name has a leading "/".
std::string file_path = md.db_path + md.name;
@ -3339,6 +3377,8 @@ Status DBImpl::CheckConsistency() {
ToString(fsize) + "\n";
}
}
}
if (corruption_messages.size() == 0) {
return Status::OK();
} else {

View File

@ -172,6 +172,13 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
result.sst_file_manager = sst_file_manager;
}
#endif
if (!result.paranoid_checks) {
result.skip_checking_sst_file_sizes_on_db_open = true;
ROCKS_LOG_INFO(result.info_log,
"file size check will be skipped during open.");
}
return result;
}

View File

@ -468,6 +468,11 @@ Status DBImplSecondary::CheckConsistency() {
// approach and just proceed.
TEST_SYNC_POINT_CALLBACK(
"DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
return Status::OK();
}
std::vector<LiveFileMetaData> metadata;
versions_->GetLiveFilesMetaData(&metadata);

View File

@ -124,6 +124,20 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
Destroy(options);
}
// Check that we don't crash when opening DB with
// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
ASSERT_OK(Put("pika", "choo"));
ASSERT_OK(Flush());
// Just open the DB with the option set to true and check that we don't crash.
Options options;
options.skip_checking_sst_file_sizes_on_db_open = true;
Reopen(options);
ASSERT_EQ("choo", Get("pika"));
}
#ifndef ROCKSDB_LITE
TEST_F(DBSSTTest, DontDeleteMovedFile) {
// This test triggers move compaction and verifies that the file is not

View File

@ -854,6 +854,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
unsigned char val);
extern ROCKSDB_LIBRARY_API void
rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
rocksdb_options_t* opt, unsigned char val);
/* returns a pointer to a malloc()-ed, null terminated string */
extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(

View File

@ -985,6 +985,16 @@ struct DBOptions {
// Default: false
bool skip_stats_update_on_db_open = false;
// If true, then DB::Open() will not fetch and check sizes of all sst files.
// This may significantly speed up startup if there are many sst files,
// especially when using non-default Env with expensive GetFileSize().
// We'll still check that all required sst files exist.
// If paranoid_checks is false, this option is ignored, and sst files are
// not checked at all.
//
// Default: false
bool skip_checking_sst_file_sizes_on_db_open = false;
// Recovery mode to control the consistency while replaying WAL
// Default: kPointInTimeRecovery
WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;

View File

@ -1738,6 +1738,30 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
}
/*
* Class: org_rocksdb_Options
* Method: setSkipCheckingSstFileSizesOnDbOpen
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
JNIEnv*, jobject, jlong jhandle,
jboolean jskip_checking_sst_file_sizes_on_db_open) {
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
opt->skip_checking_sst_file_sizes_on_db_open =
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
}
/*
* Class: org_rocksdb_Options
* Method: skipCheckingSstFileSizesOnDbOpen
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
JNIEnv*, jobject, jlong jhandle) {
auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
}
/*
* Class: org_rocksdb_Options
* Method: setWalRecoveryMode
@ -6010,6 +6034,30 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(
return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
}
/*
* Class: org_rocksdb_DBOptions
* Method: setSkipCheckingSstFileSizesOnDbOpen
* Signature: (JZ)V
*/
void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
JNIEnv*, jobject, jlong jhandle,
jboolean jskip_checking_sst_file_sizes_on_db_open) {
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
opt->skip_checking_sst_file_sizes_on_db_open =
static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
}
/*
* Class: org_rocksdb_DBOptions
* Method: skipCheckingSstFileSizesOnDbOpen
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
JNIEnv*, jobject, jlong jhandle) {
auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle);
return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
}
/*
* Class: org_rocksdb_DBOptions
* Method: setWalRecoveryMode

View File

@ -74,6 +74,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
write_thread_max_yield_usec(options.write_thread_max_yield_usec),
write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
skip_checking_sst_file_sizes_on_db_open(
options.skip_checking_sst_file_sizes_on_db_open),
wal_recovery_mode(options.wal_recovery_mode),
allow_2pc(options.allow_2pc),
row_cache(options.row_cache),

View File

@ -68,6 +68,7 @@ struct ImmutableDBOptions {
uint64_t write_thread_max_yield_usec;
uint64_t write_thread_slow_yield_usec;
bool skip_stats_update_on_db_open;
bool skip_checking_sst_file_sizes_on_db_open;
WALRecoveryMode wal_recovery_mode;
bool allow_2pc;
std::shared_ptr<Cache> row_cache;

View File

@ -119,6 +119,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
immutable_db_options.write_thread_slow_yield_usec;
options.skip_stats_update_on_db_open =
immutable_db_options.skip_stats_update_on_db_open;
options.skip_checking_sst_file_sizes_on_db_open =
immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
options.allow_2pc = immutable_db_options.allow_2pc;
options.row_cache = immutable_db_options.row_cache;
@ -1473,6 +1475,9 @@ std::unordered_map<std::string, OptionTypeInfo>
{"skip_stats_update_on_db_open",
{offsetof(struct DBOptions, skip_stats_update_on_db_open),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
{"skip_checking_sst_file_sizes_on_db_open",
{offsetof(struct DBOptions, skip_checking_sst_file_sizes_on_db_open),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
{"new_table_reader_for_compaction_inputs",
{offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},

View File

@ -248,6 +248,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"new_table_reader_for_compaction_inputs=false;"
"keep_log_file_num=4890;"
"skip_stats_update_on_db_open=false;"
"skip_checking_sst_file_sizes_on_db_open=false;"
"max_manifest_file_size=4295009941;"
"db_log_dir=path/to/db_log_dir;"
"skip_log_error_on_recovery=true;"

View File

@ -265,6 +265,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
db_opt->paranoid_checks = rnd->Uniform(2);
db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
db_opt->use_adaptive_mutex = rnd->Uniform(2);
db_opt->use_fsync = rnd->Uniform(2);
db_opt->recycle_log_file_num = rnd->Uniform(2);